diff --git a/IPython/zmq/parallel/controller.py b/IPython/zmq/parallel/controller.py index 1f0507f..afbcffd 100644 --- a/IPython/zmq/parallel/controller.py +++ b/IPython/zmq/parallel/controller.py @@ -1,6 +1,7 @@ #!/usr/bin/env python """The IPython Controller with 0MQ -This is the master object that handles connections from engines, clients, and +This is the master object that handles connections from engines and clients, +and monitors traffic through the various queues. """ #----------------------------------------------------------------------------- # Copyright (C) 2010 The IPython Development Team @@ -28,12 +29,14 @@ from IPython.zmq.entry_point import bind_port from streamsession import Message, wrap_exception from entry_point import (make_base_argument_parser, select_random_ports, split_ports, connect_logger, parse_url) -# from messages import json # use the same import switches #----------------------------------------------------------------------------- # Code #----------------------------------------------------------------------------- +def _passer(*args, **kwargs): + return + class ReverseDict(dict): """simple double-keyed subset of dict methods.""" @@ -93,16 +96,18 @@ class Controller(object): loop: zmq IOLoop instance session: StreamSession object context: zmq context for creating new connections (?) + queue: ZMQStream for monitoring the command queue (SUB) registrar: ZMQStream for engine registration requests (XREP) + heartbeat: HeartMonitor object checking the pulse of the engines clientele: ZMQStream for client connections (XREP) not used for jobs, only query/control commands - queue: ZMQStream for monitoring the command queue (SUB) - heartbeat: HeartMonitor object checking the pulse of the engines - db_stream: connection to db for out of memory logging of commands + notifier: ZMQStream for broadcasting engine registration changes (PUB) + db: connection to db for out of memory logging of commands NotImplemented - queue_addr: zmq connection address of the XREP socket for the queue - hb_addr: zmq connection address of the PUB socket for heartbeats - task_addr: zmq connection address of the XREQ socket for task queue + engine_addrs: dict of zmq connection information for engines to connect + to the queues. + client_addrs: dict of zmq connection information for clients to connect + to the queues.
""" # internal data structures: ids=None # engine IDs @@ -165,14 +170,16 @@ class Controller(object): self.notifier = notifier self.db = db + # validate connection dicts: self.client_addrs = client_addrs assert isinstance(client_addrs['queue'], str) + assert isinstance(client_addrs['control'], str) # self.hb_addrs = hb_addrs self.engine_addrs = engine_addrs assert isinstance(engine_addrs['queue'], str) + assert isinstance(client_addrs['control'], str) assert len(engine_addrs['heartbeat']) == 2 - # register our callbacks self.registrar.on_recv(self.dispatch_register_request) self.clientele.on_recv(self.dispatch_client_msg) @@ -182,19 +189,25 @@ class Controller(object): heartbeat.add_heart_failure_handler(self.handle_heart_failure) heartbeat.add_new_heart_handler(self.handle_new_heart) - if self.db is not None: - self.db.on_recv(self.dispatch_db) - + self.queue_handlers = { 'in' : self.save_queue_request, + 'out': self.save_queue_result, + 'intask': self.save_task_request, + 'outtask': self.save_task_result, + 'tracktask': self.save_task_destination, + 'incontrol': _passer, + 'outcontrol': _passer, + } + self.client_handlers = {'queue_request': self.queue_status, 'result_request': self.get_results, 'purge_request': self.purge_results, + 'load_request': self.check_load, 'resubmit_request': self.resubmit_task, } self.registrar_handlers = {'registration_request' : self.register_engine, 'unregistration_request' : self.unregister_engine, 'connection_request': self.connection_request, - } # # this is the stuff that will move to DB: @@ -272,7 +285,7 @@ class Controller(object): print (idents,msg, len(msg)) try: msg = self.session.unpack_message(msg,content=True) - except Exception, e: + except Exception as e: logger.error("registration::got bad registration message: %s"%msg) raise e return @@ -291,18 +304,9 @@ class Controller(object): logger.debug("queue traffic: %s"%msg[:2]) switch = msg[0] idents, msg = self.session.feed_identities(msg[1:]) - if switch == 'in': - self.save_queue_request(idents, msg) - elif switch == 'out': - self.save_queue_result(idents, msg) - elif switch == 'intask': - self.save_task_request(idents, msg) - elif switch == 'outtask': - self.save_task_result(idents, msg) - elif switch == 'tracktask': - self.save_task_destination(idents, msg) - elif switch in ('incontrol', 'outcontrol'): - pass + handler = self.queue_handlers.get(switch, None) + if handler is not None: + handler(idents, msg) else: logger.error("Invalid message topic: %s"%switch) @@ -392,7 +396,7 @@ class Controller(object): received=None, engine=(eid, queue_id)) self.pending[msg_id] = ( msg, info ) - self.queues[eid][0].append(msg_id) + self.queues[eid].append(msg_id) def save_queue_result(self, idents, msg): client_id, queue_id = idents[:2] @@ -417,7 +421,7 @@ class Controller(object): self.results[msg_id] = msg if msg_id in self.pending: self.pending.pop(msg_id) - self.queues[eid][0].remove(msg_id) + self.queues[eid].remove(msg_id) self.completed[eid].append(msg_id) else: logger.debug("queue:: unknown msg finished %s"%msg_id) @@ -425,6 +429,7 @@ class Controller(object): #--------------------- Task Queue Traffic ------------------------------ def save_task_request(self, idents, msg): + """Save the submission of a task.""" client_id = idents[0] try: @@ -437,13 +442,17 @@ class Controller(object): header = msg['header'] msg_id = header['msg_id'] self.mia.add(msg_id) - self.pending[msg_id] = msg + info = dict(submit=datetime.now(), + received=None, + engine=None) + self.pending[msg_id] = (msg, info) if not 
self.tasks.has_key(client_id): self.tasks[client_id] = [] self.tasks[client_id].append(msg_id) def save_task_result(self, idents, msg): - client_id = idents[0] + """save the result of a completed task.""" + client_id, engine_uuid = idents[:2] try: msg = self.session.unpack_message(msg, content=False) except: @@ -452,16 +461,19 @@ class Controller(object): return parent = msg['parent_header'] + eid = self.by_ident[engine_uuid] if not parent: # print msg # logger.warn("") return msg_id = parent['msg_id'] self.results[msg_id] = msg - if msg_id in self.pending: + if msg_id in self.pending and msg_id in self.tasks[eid]: self.pending.pop(msg_id) if msg_id in self.mia: self.mia.remove(msg_id) + self.completed[eid].append(msg_id) + self.tasks[eid].remove(msg_id) else: logger.debug("task::unknown task %s finished"%msg_id) @@ -475,16 +487,16 @@ class Controller(object): print (content) msg_id = content['msg_id'] engine_uuid = content['engine_id'] - for eid,queue_id in self.keytable.iteritems(): - if queue_id == engine_uuid: - break + eid = self.by_ident[engine_uuid] logger.info("task::task %s arrived on %s"%(msg_id, eid)) if msg_id in self.mia: self.mia.remove(msg_id) else: logger.debug("task::task %s not listed as MIA?!"%(msg_id)) - self.tasks[engine_uuid].append(msg_id) + + self.tasks[eid].append(msg_id) + self.pending[msg_id][1].update(received=datetime.now(),engine=(eid,engine_uuid)) def mia_task_request(self, idents, msg): client_id = idents[0] @@ -493,10 +505,12 @@ class Controller(object): - #-------------------- Registration ----------------------------- + #------------------------------------------------------------------------- + # Registration requests + #------------------------------------------------------------------------- def connection_request(self, client_id, msg): - """reply with connection addresses for clients""" + """Reply with connection addresses for clients.""" logger.info("client::client %s connected"%client_id) content = dict(status='ok') content.update(self.client_addrs) @@ -507,7 +521,7 @@ class Controller(object): self.session.send(self.registrar, 'connection_reply', content, parent=msg, ident=client_id) def register_engine(self, reg, msg): - """register an engine""" + """Register a new engine.""" content = msg['content'] try: queue = content['queue'] @@ -556,6 +570,7 @@ class Controller(object): return eid def unregister_engine(self, ident, msg): + """Unregister an engine that explicitly requested to leave.""" try: eid = msg['content']['id'] except: @@ -569,7 +584,7 @@ class Controller(object): self.hearts.pop(ec.heartbeat) self.by_ident.pop(ec.queue) self.completed.pop(eid) - for msg_id in self.queues.pop(eid)[0]: + for msg_id in self.queues.pop(eid): msg = self.pending.pop(msg_id) ############## TODO: HANDLE IT ################ @@ -577,6 +592,8 @@ class Controller(object): self.session.send(self.notifier, "unregistration_notification", content=content) def finish_registration(self, heart): + """Second half of engine registration, called after our HeartMonitor + has received a beat from the Engine's Heart.""" try: (eid,queue,reg,purge) = self.incoming_registrations.pop(heart) except KeyError: @@ -590,7 +607,8 @@ class Controller(object): self.keytable[eid] = queue self.engines[eid] = EngineConnector(eid, queue, reg, control, heart) self.by_ident[queue] = eid - self.queues[eid] = ([],[]) + self.queues[eid] = list() + self.tasks[eid] = list() self.completed[eid] = list() self.hearts[heart] = eid content = dict(id=eid, queue=self.engines[eid].queue) @@ -604,7 +622,9 @@ 
class Controller(object): else: pass - #------------------- Client Requests ------------------------------- + #------------------------------------------------------------------------- + # Client Requests + #------------------------------------------------------------------------- def check_load(self, client_id, msg): content = msg['content'] @@ -620,12 +640,17 @@ class Controller(object): content = dict(status='ok') # loads = {} for t in targets: - content[str(t)] = len(self.queues[t]) + content[bytes(t)] = len(self.queues[t])+len(self.tasks[t]) self.session.send(self.clientele, "load_reply", content=content, ident=client_id) def queue_status(self, client_id, msg): - """handle queue_status request""" + """Return the Queue status of one or more targets. + if verbose: return the msg_ids + else: return len of each type. + keys: queue (pending MUX jobs) + tasks (pending Task jobs) + completed (finished jobs from both queues)""" content = msg['content'] targets = content['targets'] try: @@ -635,19 +660,23 @@ class Controller(object): self.session.send(self.clientele, "controller_error", content=content, ident=client_id) return - verbose = msg.get('verbose', False) - content = dict() + verbose = content.get('verbose', False) + content = dict(status='ok') for t in targets: queue = self.queues[t] completed = self.completed[t] + tasks = self.tasks[t] if not verbose: queue = len(queue) completed = len(completed) - content[str(t)] = {'queue': queue, 'completed': completed } + tasks = len(tasks) + content[bytes(t)] = {'queue': queue, 'completed': completed , 'tasks': tasks} # pending self.session.send(self.clientele, "queue_reply", content=content, ident=client_id) def purge_results(self, client_id, msg): + """Purge results from memory. This method is more valuable before we move + to a DB based message storage mechanism.""" content = msg['content'] msg_ids = content.get('msg_ids', []) reply = dict(status='ok') @@ -675,37 +704,11 @@ class Controller(object): self.sesison.send(self.clientele, 'purge_reply', content=reply, ident=client_id) def resubmit_task(self, client_id, msg, buffers): - content = msg['content'] - header = msg['header'] - - - msg_ids = content.get('msg_ids', []) - reply = dict(status='ok') - if msg_ids == 'all': - self.results = {} - else: - for msg_id in msg_ids: - if msg_id in self.results: - self.results.pop(msg_id) - else: - if msg_id in self.pending: - reply = dict(status='error', reason="msg pending: %r"%msg_id) - else: - reply = dict(status='error', reason="No such msg: %r"%msg_id) - break - eids = content.get('engine_ids', []) - for eid in eids: - if eid not in self.engines: - reply = dict(status='error', reason="No such engine: %i"%eid) - break - msg_ids = self.completed.pop(eid) - for msg_id in msg_ids: - self.results.pop(msg_id) - - self.sesison.send(self.clientele, 'purge_reply', content=reply, ident=client_id) + """Resubmit a task.""" + raise NotImplementedError def get_results(self, client_id, msg): - """get the result of 1 or more messages""" + """Get the result of 1 or more messages.""" content = msg['content'] msg_ids = set(content['msg_ids']) statusonly = content.get('status_only', False) @@ -727,33 +730,12 @@ class Controller(object): break self.session.send(self.clientele, "result_reply", content=content, parent=msg, ident=client_id) - -############ OLD METHODS for Python Relay Controller ################### - def _validate_engine_msg(self, msg): - """validates and unpacks headers of a message. 
Returns False if invalid, - (ident, message)""" - ident = msg[0] - try: - msg = self.session.unpack_message(msg[1:], content=False) - except: - logger.error("engine.%s::Invalid Message %s"%(ident, msg)) - return False - - try: - eid = msg.header.username - assert self.engines.has_key(eid) - except: - logger.error("engine::Invalid Engine ID %s"%(ident)) - return False - - return eid, msg - - -#-------------------- +#------------------------------------------------------------------------- # Entry Point -#-------------------- +#------------------------------------------------------------------------- + def make_argument_parser(): """Make an argument parser""" parser = make_base_argument_parser() diff --git a/IPython/zmq/parallel/heartmonitor.py b/IPython/zmq/parallel/heartmonitor.py index 5ca7f64..eab3ffa 100644 --- a/IPython/zmq/parallel/heartmonitor.py +++ b/IPython/zmq/parallel/heartmonitor.py @@ -130,7 +130,7 @@ class HeartMonitor(object): for handler in self._failure_handlers: try: handler(heart) - except Exception, e: + except Exception as e: print (e) logger.error("heartbeat::Bad Handler! %s"%handler) pass diff --git a/IPython/zmq/parallel/scheduler.py b/IPython/zmq/parallel/scheduler.py index 7601e8e..0b17552 100644 --- a/IPython/zmq/parallel/scheduler.py +++ b/IPython/zmq/parallel/scheduler.py @@ -1,3 +1,10 @@ +"""The Python scheduler for rich scheduling. + +The Pure ZMQ scheduler does not allow routing schemes other than LRU, +nor does it check msg_id DAG dependencies. For those, a slightly slower +Python Scheduler exists. +""" + #---------------------------------------------------------------------- # Imports #---------------------------------------------------------------------- @@ -40,7 +47,7 @@ def plainrandom(loads): def lru(loads): """Always pick the front of the line. - The content of loads is ignored. + The content of `loads` is ignored. Assumes LRU ordering of loads, with oldest first. """ @@ -151,10 +158,12 @@ class TaskScheduler(object): self.notifier_stream.on_recv(self.dispatch_notification) def resume_receiving(self): - """resume accepting jobs""" + """Resume accepting jobs.""" self.client_stream.on_recv(self.dispatch_submission, copy=False) def stop_receiving(self): + """Stop accepting jobs while there are no engines. + Leave them in the ZMQ queue.""" self.client_stream.on_recv(None) #----------------------------------------------------------------------- @@ -176,7 +185,7 @@ class TaskScheduler(object): logger.error("task::Invalid notification msg: %s"%msg) @logged def _register_engine(self, uid): - """new engine became available""" + """New engine with ident `uid` became available.""" # head of the line: self.targets.insert(0,uid) self.loads.insert(0,0) @@ -187,10 +196,12 @@ class TaskScheduler(object): self.resume_receiving() def _unregister_engine(self, uid): - """existing engine became unavailable""" - # handle any potentially finished tasks: + """Existing engine with ident `uid` became unavailable.""" if len(self.targets) == 1: + # this was our only engine self.stop_receiving() + + # handle any potentially finished tasks: self.engine_stream.flush() self.completed.pop(uid) @@ -203,7 +214,7 @@ class TaskScheduler(object): self.handle_stranded_tasks(lost) def handle_stranded_tasks(self, lost): - """deal with jobs resident in an engine that died.""" + """Deal with jobs resident in an engine that died.""" # TODO: resubmit the tasks? 
for msg_id in lost: pass @@ -214,26 +225,29 @@ class TaskScheduler(object): #----------------------------------------------------------------------- @logged def dispatch_submission(self, raw_msg): - """dispatch job submission""" + """Dispatch job submission to appropriate handlers.""" # ensure targets up to date: self.notifier_stream.flush() try: idents, msg = self.session.feed_identities(raw_msg, copy=False) - except Exception, e: + except Exception as e: logger.error("task::Invaid msg: %s"%msg) return msg = self.session.unpack_message(msg, content=False, copy=False) header = msg['header'] msg_id = header['msg_id'] + + # time dependencies after = Dependency(header.get('after', [])) if after.mode == 'all': after.difference_update(self.all_done) if after.check(self.all_done): - # recast as empty set, if we are already met, - # to prevent + # recast as empty set, if `after` already met, + # to prevent unnecessary set comparisons after = Dependency([]) + # location dependencies follow = Dependency(header.get('follow', [])) if len(after) == 0: # time deps already met, try to run @@ -244,6 +258,7 @@ class TaskScheduler(object): self.save_unmet(msg_id, raw_msg, after, follow) # send to monitor self.mon_stream.send_multipart(['intask']+raw_msg, copy=False) + @logged def maybe_run(self, msg_id, raw_msg, follow=None): """check location dependencies, and run if they are met.""" @@ -276,7 +291,7 @@ class TaskScheduler(object): @logged def submit_task(self, msg_id, msg, follow=None, indices=None): - """submit a task to any of a subset of our targets""" + """Submit a task to any of a subset of our targets.""" if indices: loads = [self.loads[i] for i in indices] else: @@ -290,6 +305,8 @@ class TaskScheduler(object): self.engine_stream.send_multipart(msg, copy=False) self.add_job(idx) self.pending[target][msg_id] = (msg, follow) + content = dict(msg_id=msg_id, engine_id=target) + self.session.send(self.mon_stream, 'task_destination', content=content, ident='tracktask') #----------------------------------------------------------------------- # Result Handling @@ -298,7 +315,7 @@ class TaskScheduler(object): def dispatch_result(self, raw_msg): try: idents,msg = self.session.feed_identities(raw_msg, copy=False) - except Exception, e: + except Exception as e: logger.error("task::Invaid result: %s"%msg) return msg = self.session.unpack_message(msg, content=False, copy=False) diff --git a/IPython/zmq/parallel/streamkernel.py b/IPython/zmq/parallel/streamkernel.py index 15475e2..cebeff9 100755 --- a/IPython/zmq/parallel/streamkernel.py +++ b/IPython/zmq/parallel/streamkernel.py @@ -125,7 +125,7 @@ class RawInput(object): while True: try: reply = self.socket.recv_json(zmq.NOBLOCK) - except zmq.ZMQError, e: + except zmq.ZMQError as e: if e.errno == zmq.EAGAIN: pass else: @@ -171,7 +171,7 @@ class Kernel(object): while True: try: msg = self.session.recv(stream, zmq.NOBLOCK,content=True) - except zmq.ZMQError, e: + except zmq.ZMQError as e: if e.errno == zmq.EAGAIN: break else: @@ -238,17 +238,6 @@ class Kernel(object): else: handler(self.control_stream, idents, msg) - # def flush_control(self): - # while any(zmq.select([self.control_socket],[],[],1e-4)): - # try: - # msg = self.control_socket.recv_multipart(zmq.NOBLOCK, copy=False) - # except zmq.ZMQError, e: - # if e.errno != zmq.EAGAIN: - # raise e - # return - # else: - # self.dispatch_control(msg) - #-------------------- queue helpers ------------------------------ diff --git a/IPython/zmq/parallel/streamsession.py b/IPython/zmq/parallel/streamsession.py index 
9061896..d1b8171 100644 --- a/IPython/zmq/parallel/streamsession.py +++ b/IPython/zmq/parallel/streamsession.py @@ -8,6 +8,7 @@ import sys import traceback import pprint import uuid +from datetime import datetime import zmq from zmq.utils import jsonapi @@ -111,6 +112,7 @@ class Message(object): def msg_header(msg_id, msg_type, username, session): + date=datetime.now().isoformat() return locals() # return { # 'msg_id' : msg_id, @@ -140,7 +142,7 @@ def extract_header(msg_or_header): return h def rekey(dikt): - """rekey a dict that has been forced to use str keys where there should be + """Rekey a dict that has been forced to use str keys where there should be ints by json. This belongs in the jsonutil added by fperez.""" for k in dikt.iterkeys(): if isinstance(k, str): @@ -162,11 +164,22 @@ def rekey(dikt): return dikt def serialize_object(obj, threshold=64e-6): - """serialize an object into a list of sendable buffers. + """Serialize an object into a list of sendable buffers. - Returns: (pmd, bufs) - where pmd is the pickled metadata wrapper, and bufs - is a list of data buffers""" + Parameters + ---------- + + obj : object + The object to be serialized + threshold : float + The threshold for not double-pickling the content. + + + Returns + ------- + ('pmd', [bufs]) : + where pmd is the pickled metadata wrapper, + bufs is a list of data buffers""" # threshold is 100 B databuffers = [] if isinstance(obj, (list, tuple)): @@ -318,6 +331,8 @@ class StreamSession(object): Parameters ---------- + stream : zmq.Socket or ZMQStream + the socket-like object used to send the data msg_type : str or Message/dict Normally, msg_type will be @@ -347,10 +362,7 @@ class StreamSession(object): to_send.append(DELIM) to_send.append(self.pack(msg['header'])) to_send.append(self.pack(msg['parent_header'])) - # if parent is None: - # to_send.append(self.none) - # else: - # to_send.append(self.pack(dict(parent))) + if content is None: content = self.none elif isinstance(content, dict): @@ -374,11 +386,10 @@ class StreamSession(object): pprint.pprint(omsg) pprint.pprint(to_send) pprint.pprint(buffers) - # return both the msg object and the buffers return omsg def send_raw(self, stream, msg, flags=0, copy=True, idents=None): - """send a raw message via idents. + """Send a raw message via idents. Parameters ---------- @@ -399,7 +410,7 @@ class StreamSession(object): socket = socket.socket try: msg = socket.recv_multipart(mode) - except zmq.ZMQError, e: + except zmq.ZMQError as e: if e.errno == zmq.EAGAIN: # We can convert EAGAIN to None as we know in this case # recv_json won't return None. @@ -412,7 +423,7 @@ class StreamSession(object): idents, msg = self.feed_identities(msg, copy) try: return idents, self.unpack_message(msg, content=content, copy=copy) - except Exception, e: + except Exception as e: print (idents, msg) # TODO: handle it raise e diff --git a/docs/source/development/parallel_connections.txt b/docs/source/development/parallel_connections.txt index 8cb6c9d..9e5bdce 100644 --- a/docs/source/development/parallel_connections.txt +++ b/docs/source/development/parallel_connections.txt @@ -4,18 +4,22 @@ Connection Diagrams of The IPython ZMQ Cluster ============================================== -This is a quick summary and illustration of the connections involved in the ZeroMQ based IPython cluster for parallel computing. +This is a quick summary and illustration of the connections involved in the ZeroMQ based +IPython cluster for parallel computing. 
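By way of illustration, here is a minimal client-side sketch of the single-endpoint design described below: one Registrar address is enough to bootstrap everything else. The address, and the use of plain JSON in place of the real StreamSession multipart framing, are assumptions for illustration only::

    import zmq

    ctx = zmq.Context()
    registrar = ctx.socket(zmq.XREQ)  # XREQ (a.k.a. DEALER) talks to the Registrar XREP
    registrar.connect("tcp://127.0.0.1:10101")  # hypothetical registrar address

    # clients need no content in a connection request
    registrar.send_json({'msg_type': 'connection_request', 'content': {}})
    reply = registrar.recv_json()
    # on success, the reply carries the remaining connection addresses,
    # e.g. for the queue, task, and query sockets
    print(reply['content'])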
All Connections =============== The Parallel Computing code is currently under development in Min RK's IPython fork_ on GitHub. -.. _fork: http://github.com/minrk/ipython +.. _fork: http://github.com/minrk/ipython/tree/newparallel -The IPython cluster consists of a Controller and one or more clients and engines. The goal of the Controller is to manage and monitor the connections and communications between the clients and the engines. +The IPython cluster consists of a Controller and one or more clients and engines. The goal +of the Controller is to manage and monitor the connections and communications between the +clients and the engines. -It is important for security/practicality reasons that all connections be inbound to the controller process. The arrows in the figures indicate the direction of the connection. +It is important for security/practicality reasons that all connections be inbound to the +controller process. The arrows in the figures indicate the direction of the connection. .. figure:: figs/allconnections.png @@ -25,8 +29,9 @@ It is important for security/practicality reasons that all connections be inboun All the connections involved in connecting one client to one engine. -The Controller consists of two ZMQ Devices - both MonitoredQueues, one for Tasks (load balanced, engine agnostic), one for Multiplexing (explicit targets), a Python device for monitoring (the Heartbeat Monitor). - +The Controller consists of two ZMQ Devices - both MonitoredQueues, one for Tasks (load +balanced, engine agnostic), one for Multiplexing (explicit targets), a Python device for +monitoring (the Heartbeat Monitor). Registration @@ -39,7 +44,10 @@ Registration Engines and Clients only need to know where the Registrar ``XREP`` is located to start connecting. -Once a controller is launched, the only information needed for connecting clients and/or engines to the controller is the IP/port of the ``XREP`` socket called the Registrar. This socket handles connections from both clients and engines, and replies with the remaining information necessary to establish the remaining connections. +Once a controller is launched, the only information needed for connecting clients and/or +engines to the controller is the IP/port of the ``XREP`` socket called the Registrar. This +socket handles connections from both clients and engines, and replies with the remaining +information necessary to establish the remaining connections. Heartbeat --------- @@ -51,25 +59,43 @@ Heartbeat The heartbeat sockets. -The heartbeat process has been described elsewhere. To summarize: the controller publishes a distinct message periodically via a ``PUB`` socket. Each engine has a ``zmq.FORWARDER`` device with a ``SUB`` socket for input, and ``XREQ`` socket for output. The ``SUB`` socket is connected to the ``PUB`` socket labeled *HB(ping)*, and the ``XREQ`` is connected to the ``XREP`` labeled *HB(pong)*. This results in the same message being relayed back to the Heartbeat Monitor with the addition of the ``XREQ`` prefix. The Heartbeat Monitor receives all the replies via an ``XREP`` socket, and identifies which hearts are still beating by the ``zmq.IDENTITY`` prefix of the ``XREQ`` sockets. +The heartbeat process has been described elsewhere. To summarize: the controller publishes +a distinct message periodically via a ``PUB`` socket. Each engine has a ``zmq.FORWARDER`` +device with a ``SUB`` socket for input, and ``XREQ`` socket for output. 
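A minimal sketch of this engine-side relay, assuming hypothetical addresses and heart identity; the precise wiring is spelled out in the next sentences::

    import zmq

    ctx = zmq.Context()
    sub = ctx.socket(zmq.SUB)
    sub.setsockopt(zmq.SUBSCRIBE, b'')            # hear every ping
    xreq = ctx.socket(zmq.XREQ)
    xreq.setsockopt(zmq.IDENTITY, b'heart-uuid')  # hypothetical heart identity
    sub.connect("tcp://127.0.0.1:5555")           # HB(ping) PUB, hypothetical port
    xreq.connect("tcp://127.0.0.1:5556")          # HB(pong) XREP, hypothetical port
    # relay each ping back out; the XREQ prefixes its IDENTITY,
    # which is how the Heartbeat Monitor tells hearts apart
    zmq.device(zmq.FORWARDER, sub, xreq)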
The ``SUB`` socket +is connected to the ``PUB`` socket labeled *HB(ping)*, and the ``XREQ`` is connected to +the ``XREP`` labeled *HB(pong)*. This results in the same message being relayed back to +the Heartbeat Monitor with the addition of the ``XREQ`` prefix. The Heartbeat Monitor +receives all the replies via an ``XREP`` socket, and identifies which hearts are still +beating by the ``zmq.IDENTITY`` prefix of the ``XREQ`` sockets. Queues ------ .. figure:: figs/queuefade.png - :width: 432px - :alt: IPython Queue connections - :align: center - - Load balanced Task queue on the left, explicitly multiplexed queue on the right. - -The controller has two MonitoredQueue devices. These devices are primarily for relaying messages between clients and engines, but the controller needs to see those messages for its own purposes. Since no Python code may exist between the two sockets in a queue, all messages sent through these queues (both directions) are also sent via a ``PUB`` socket to a monitor, which allows the Controller to monitor queue traffic without interfering with it. - -For tasks, the engine need not be specified. Messages sent to the ``XREP`` socket from the client side are assigned to an engine via ZMQ's ``XREQ`` round-robin load balancing. Engine replies are directed to specific clients via the IDENTITY of the client, which is received as a prefix at the Engine. - -For Multiplexing, ``XREP`` is used for both in and output sockets in the device. Clients must specify the destination by the ``zmq.IDENTITY`` of the ``PAIR`` socket connected to the downstream end of the device. - -At the Kernel level, both of these PAIR sockets are treated in the same way as the ``REP`` socket in the serial version (except using ZMQStreams instead of explicit sockets). + :width: 432px + :alt: IPython Queue connections + :align: center + + Load balanced Task queue on the left, explicitly multiplexed queue on the right. + +The controller has two MonitoredQueue devices. These devices are primarily for relaying +messages between clients and engines, but the controller needs to see those messages for +its own purposes. Since no Python code may exist between the two sockets in a queue, all +messages sent through these queues (both directions) are also sent via a ``PUB`` socket to +a monitor, which allows the Controller to monitor queue traffic without interfering with +it. + +For tasks, the engine need not be specified. Messages sent to the ``XREP`` socket from the +client side are assigned to an engine via ZMQ's ``XREQ`` round-robin load balancing. +Engine replies are directed to specific clients via the IDENTITY of the client, which is +received as a prefix at the Engine. + +For Multiplexing, ``XREP`` is used for both in and output sockets in the device. Clients +must specify the destination by the ``zmq.IDENTITY`` of the ``PAIR`` socket connected to +the downstream end of the device. + +At the Kernel level, both of these PAIR sockets are treated in the same way as the ``REP`` +socket in the serial version (except using ZMQStreams instead of explicit sockets). Client connections ------------------ @@ -81,7 +107,8 @@ Client connections Clients connect to an ``XREP`` socket to query the controller -The controller listens on an ``XREP`` socket for queries from clients as to queue status, and control instructions. Clients can connect to this via a PAIR socket or ``XREQ``. +The controller listens on an ``XREP`` socket for queries from clients as to queue status, +and control instructions. 
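As a concrete example, the content of one such queue-status query and a possible reply are plain dicts, assuming the ``queue_request`` schema documented in the messaging notes (all values here are hypothetical)::

    # a queue_status query for three engines
    request = {'verbose': False, 'targets': [0, 1, 3]}

    # a hypothetical reply: engine IDs come back as strings,
    # because JSON cannot represent integer keys
    reply = {'status': 'ok',
             '0': {'queue': 2, 'completed': 10, 'tasks': 1},
             '1': {'queue': 0, 'completed': 4, 'tasks': 0},
             '3': {'queue': 1, 'completed': 7, 'tasks': 2}}

    # recover integer engine IDs, in the spirit of streamsession.rekey
    by_eid = dict((int(k), v) for k, v in reply.items() if k.isdigit())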
Clients can connect to this via a PAIR socket or ``XREQ``. .. figure:: figs/notiffade.png :width: 432px :alt: IPython Registration connections :align: center Engine registration events are published via a ``PUB`` socket. -The controller publishes all registration/unregistration events via a ``PUB`` socket. This allows clients to stay up to date with what engines are available by subscribing to the feed with a ``SUB`` socket. Other processes could selectively subscribe to just registration or unregistration events. +The controller publishes all registration/unregistration events via a ``PUB`` socket. This +allows clients to stay up to date with what engines are available by subscribing to the +feed with a ``SUB`` socket. Other processes could selectively subscribe to just +registration or unregistration events. diff --git a/docs/source/development/parallel_messages.txt b/docs/source/development/parallel_messages.txt index 307adf4..7755c4f 100644 --- a/docs/source/development/parallel_messages.txt +++ b/docs/source/development/parallel_messages.txt @@ -3,16 +3,20 @@ Messaging for Parallel Computing ================================ -This is an extension of the :ref:`messaging ` doc. Diagrams of the connections can be found in the :ref:`parallel connections ` doc. +This is an extension of the :ref:`messaging ` doc. Diagrams of the connections +can be found in the :ref:`parallel connections ` doc. - -ZMQ messaging is also used in the parallel computing IPython system. All messages to/from kernels remain the same as the single kernel model, and are forwarded through a ZMQ Queue device. The controller receives all messages and replies in these channels, and saves results for future use. +ZMQ messaging is also used in the parallel computing IPython system. All messages to/from +kernels remain the same as the single kernel model, and are forwarded through a ZMQ Queue +device. The controller receives all messages and replies in these channels, and saves +results for future use. The Controller -------------- -The controller is the central process of the IPython parallel computing model. It has 3 Devices: +The controller is the central process of the IPython parallel computing model. It has 3 +Devices: * Heartbeater * Multiplexed Queue @@ -29,9 +33,13 @@ and 3 sockets: Registration (``XREP``) *********************** -The first function of the Controller is to facilitate and monitor connections of clients and engines. Both client and engine registration are handled by the same socket, so only one ip/port pair is needed to connect any number of connections and clients. +The first function of the Controller is to facilitate and monitor connections of clients +and engines. Both client and engine registration are handled by the same socket, so only +one ip/port pair is needed to connect any number of engines and clients. -Engines register with the ``zmq.IDENTITY`` of their two ``XREQ`` sockets, one for the queue, which receives execute requests, and one for the heartbeat, which is used to monitor the survival of the Engine process. +Engines register with the ``zmq.IDENTITY`` of their two ``XREQ`` sockets, one for the +queue, which receives execute requests, and one for the heartbeat, which is used to +monitor the survival of the Engine process. Message type: ``registration_request``:: content = { 'queue' : '1234-abcd-...', # the MUX queue zmq.IDENTITY 'heartbeat' : '1234-abcd-...'
# the heartbeat XREQ id } -The Controller replies to an Engine's registration request with the engine's integer ID, and all the remaining connection information for connecting the heartbeat process, and kernel socket(s). The message status will be an error if the Engine requests IDs that already in use. +The Controller replies to an Engine's registration request with the engine's integer ID, +and all the remaining connection information for connecting the heartbeat process, and +kernel queue socket(s). The message status will be an error if the Engine requests IDs that +are already in use. Message type: ``registration_reply``:: content = { 'status' : 'ok', # or 'error' # if ok: 'id' : 0, # int, the engine id 'queue' : 'tcp://127.0.0.1:12345', # connection for engine side of the queue + 'control' : 'tcp://...', # addr for control queue 'heartbeat' : (a,b), # tuple containing two interfaces needed for heartbeat - 'task' : 'tcp...', # addr for task queue, or None if no task queue running + 'task' : 'tcp://...', # addr for task queue, or None if no task queue running # if error: 'reason' : 'queue_id already registered' } -Clients use the same socket to start their connections. Connection requests from clients need no information: +Clients use the same socket as engines to start their connections. Connection requests +from clients need no information: Message type: ``connection_request``:: content = {} -The reply to a Client registration request contains the connection information for the multiplexer and load balanced queues, as well as the address for direct controller queries. If any of these addresses is `None`, that functionality is not available. +The reply to a Client registration request contains the connection information for the +multiplexer and load balanced queues, as well as the address for direct controller +queries. If any of these addresses is `None`, that functionality is not available. Message type: ``connection_reply``:: content = { 'status' : 'ok', # or 'error' # if ok: - 'queue' : 'tcp://127.0.0.1:12345', # connection for client side of the queue + 'queue' : 'tcp://127.0.0.1:12345', # connection for client side of the MUX queue 'task' : 'tcp...', # addr for task queue, or None if no task queue running - 'controller' : 'tcp...' # addr for controller methods, like queue_request, etc. + 'query' : 'tcp...', # addr for methods to query the controller, like queue_request, etc. + 'control' : 'tcp...' # addr for control methods, like abort, etc. } Heartbeat ********* -The controller uses a heartbeat system to monitor engines, and track when they become unresponsive. As described in :ref:`messages `, and shown in :ref:`connections `. +The controller uses a heartbeat system to monitor engines, and track when they become +unresponsive. As described in :ref:`messages `, and shown in :ref:`connections +`. Notification (``PUB``) ********************** -The controller published all engine registration/unregistration events on a PUB socket. This allows clients to have up-to-date engine ID sets without polling. Registration notifications contain both the integer engine ID and the queue ID, which is necessary for sending messages via the Multiplexer Queue. +The controller publishes all engine registration/unregistration events on a PUB socket. +This allows clients to have up-to-date engine ID sets without polling. Registration +notifications contain both the integer engine ID and the queue ID, which is necessary for +sending messages via the Multiplexer Queue.
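A sketch of a client consuming this feed to maintain its engine set, assuming a hypothetical notifier address and plain JSON in place of the real StreamSession framing; the message types used are documented just below::

    import zmq

    ctx = zmq.Context()
    sub = ctx.socket(zmq.SUB)
    sub.setsockopt(zmq.SUBSCRIBE, b'')    # both notification types
    sub.connect("tcp://127.0.0.1:10102")  # hypothetical notifier PUB address

    engines = {}  # integer engine ID -> queue ID
    while True:   # loops forever; a real client would run this in its event loop
        msg = sub.recv_json()
        content = msg['content']
        if msg['msg_type'] == 'registration_notification':
            engines[content['id']] = content['queue']
        elif msg['msg_type'] == 'unregistration_notification':
            engines.pop(content['id'], None)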
Message type: ``registration_notification``:: @@ -100,9 +121,14 @@ Message type : ``unregistration_notification``:: Client Queries (``XREP``) ************************* -The controller monitors and logs all queue traffic, so that clients can retrieve past results or monitor pending tasks. Currently, this information resides in memory on the Controller, but will ultimately be offloaded to a database over an additional ZMQ connection. The interface should remain the same or at least similar. +The controller monitors and logs all queue traffic, so that clients can retrieve past +results or monitor pending tasks. Currently, this information resides in memory on the +Controller, but will ultimately be offloaded to a database over an additional ZMQ +connection. The interface should remain the same or at least similar. -:func:`queue_request` requests can specify multiple engines to query via the `targets` element. A verbose flag can be passed, to determine whether the result should be the list of `msg_ids` in the queue or simply the length of each list. +:func:`queue_request` requests can specify multiple engines to query via the `targets` +element. A verbose flag can be passed, to determine whether the result should be the list +of `msg_ids` in the queue or simply the length of each list. Message type: ``queue_request``:: @@ -111,7 +137,9 @@ Message type: ``queue_request``:: 'targets' : [0,3,1] # list of ints } -The content of a reply to a :func:queue_request request is a dict, keyed by the engine IDs. Note that they will be the string representation of the integer keys, since JSON cannot handle number keys. +The content of a reply to a :func:queue_request request is a dict, keyed by the engine +IDs. Note that they will be the string representation of the integer keys, since JSON +cannot handle number keys. Message type: ``queue_reply``:: @@ -120,15 +148,19 @@ Message type: ``queue_reply``:: '1' : {'completed' : 10, 'queue' : 1} } -Clients can request individual results directly from the controller. This is primarily for use gathering results of executions not submitted by the particular client, as the client will have all its own results already. Requests are made by msg_id, and can contain one or more msg_id. +Clients can request individual results directly from the controller. This is primarily for +use gathering results of executions not submitted by the particular client, as the client +will have all its own results already. Requests are made by msg_id, and can contain one or +more msg_id. Message type: ``result_request``:: content = { - 'msg_ids' : [uuid,'...'] # list of strs + 'msg_ids' : ['uuid','...'] # list of strs } -The :func:`result_request` reply contains the content objects of the actual execution reply messages +The :func:`result_request` reply contains the content objects of the actual execution +reply messages Message type: ``result_reply``:: @@ -139,13 +171,18 @@ Message type: ``result_reply``:: msg_id : msg, # the content dict is keyed by msg_ids, # values are the result messages 'pending' : ['msg_id','...'], # msg_ids still pending + 'completed' : ['msg_id','...'], # list of completed msg_ids # if error: 'reason' : "explanation" } -Clients can also instruct the controller to forget the results of messages. This can be done by message ID or engine ID. Individual messages are dropped by msg_id, and all messages completed on an engine are dropped by engine ID. +For memory management purposes, Clients can also instruct the controller to forget the +results of messages. 
This can be done by message ID or engine ID. Individual messages are +dropped by msg_id, and all messages completed on an engine are dropped by engine ID. This will likely no longer +be necessary once we move to a DB-based message logging backend. -If the msg_ids element is the string ``'all'`` instead of a list, then all completed results are forgotten. +If the msg_ids element is the string ``'all'`` instead of a list, then all completed +results are forgotten. Message type: ``purge_request``:: @@ -154,7 +191,9 @@ Message type: ``purge_request``:: 'engine_ids' : [0,2,4] # list of engine IDs } -The reply to a purge request is simply the status 'ok' if the request succeeded, or an explanation of why it failed, such as requesting the purge of a nonexistent or pending message. +The reply to a purge request is simply the status 'ok' if the request succeeded, or an +explanation of why it failed, such as requesting the purge of a nonexistent or pending +message. Message type: ``purge_reply``:: @@ -168,18 +207,29 @@ Message type: ``purge_reply``:: :func:`apply` and :func:`apply_bound` ************************************* -The `Namespace `_ model suggests that execution be able to use the model:: +The `Namespace `_ model suggests that execution be able to +use the model:: client.apply(f, *args, **kwargs) -which takes `f`, a function in the user's namespace, and executes ``f(*args, **kwargs)`` on a remote engine, returning the result (or, for non-blocking, information facilitating later retrieval of the result). This model, unlike the execute message which just uses a code string, must be able to send arbitrary (pickleable) Python objects. And ideally, copy as little data as we can. The `buffers` property of a Message was introduced for this purpose. +which takes `f`, a function in the user's namespace, and executes ``f(*args, **kwargs)`` +on a remote engine, returning the result (or, for non-blocking, information facilitating +later retrieval of the result). This model, unlike the execute message which just uses a +code string, must be able to send arbitrary (pickleable) Python objects. And ideally, copy +as little data as we can. The `buffers` property of a Message was introduced for this +purpose. -Utility method :func:`build_apply_message` in :mod:`IPython.zmq.streamsession` wraps a function signature and builds the correct buffer format. +Utility method :func:`build_apply_message` in :mod:`IPython.zmq.streamsession` wraps a +function signature and builds the correct buffer format for minimal data copying (exactly +zero copies of numpy array data). Message type: ``apply_request``:: content = { - 'bound' : True # whether to execute in the engine's namespace or unbound + 'bound' : True, # whether to execute in the engine's namespace or unbound + 'after' : [msg_ids,...], # list of msg_ids or output of Dependency.as_dict() + 'follow' : [msg_ids,...], # list of msg_ids or output of Dependency.as_dict() + } buffers = ['...'] # at least 3 in length # as built by build_apply_message(f,args,kwargs) @@ -200,10 +250,22 @@ Message type: ``apply_reply``:: Implementation -------------- -There are a few differences in implementation between the `StreamSession` object used in the parallel computing fork and the `Session` object, the main one being that messages are sent in parts, rather than as a single serialized object. `StreamSession` objects also take pack/unpack functions, which are to be used when serializing/deserializing objects. 
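For instance, a JSON pair and a pickle pair both satisfy this interface; the following is a sketch of such pack/unpack pairs, not necessarily the `StreamSession` defaults::

    import pickle
    from zmq.utils import jsonapi

    # JSON: cross-language and safe, but limited to simple types
    json_packer = jsonapi.dumps
    json_unpacker = jsonapi.loads

    # pickle: handles arbitrary Python objects
    def pickle_packer(obj):
        return pickle.dumps(obj, 2)  # protocol 2
    pickle_unpacker = pickle.loads

    wire = pickle_packer({'a': 1})
    assert pickle_unpacker(wire) == {'a': 1}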
These can be any functions that translate to/from formats that ZMQ sockets can send (buffers,bytes, etc.). +There are a few differences in implementation between the `StreamSession` object used in +the parallel computing fork and the `Session` object, the main one being that messages are +sent in parts, rather than as a single serialized object. `StreamSession` objects also +take pack/unpack functions, which are to be used when serializing/deserializing objects. +These can be any functions that translate to/from formats that ZMQ sockets can send +(buffers,bytes, etc.). Split Sends *********** -Previously, messages were bundled as a single json object and one call to :func:`socket.send_json`. Since the controller inspects all messages, and doesn't need to see the content of the messages, which can be large, messages are serialized and sent in pieces. All messages are sent in at least 3 parts: the header, the parent header, and the content. This allows the controller to unpack and inspect the (always small) header, without spending time unpacking the content unless the message is bound for the controller. Buffers are added on to the end of the message, and can be any objects that present the buffer interface. +Previously, messages were bundled as a single json object and one call to +:func:`socket.send_json`. Since the controller inspects all messages, and doesn't need to +see the content of the messages, which can be large, messages are now serialized and sent in +pieces. All messages are sent in at least 3 parts: the header, the parent header, and the +content. This allows the controller to unpack and inspect the (always small) header, +without spending time unpacking the content unless the message is bound for the +controller. Buffers are added on to the end of the message, and can be any objects that +present the buffer interface. diff --git a/docs/source/whatsnew/development.txt b/docs/source/whatsnew/development.txt index 8a7c4a0..b08118b 100644 --- a/docs/source/whatsnew/development.txt +++ b/docs/source/whatsnew/development.txt @@ -69,11 +69,11 @@ New features :mod:`~IPython.external.argparse` to parse command line options for :command:`ipython`. -* New top level :func:`~IPython.core.embed.embed` function that can be called +* New top level :func:`~IPython.frontend.terminal.embed.embed` function that can be called to embed IPython at any place in user's code. One the first call it will - create an :class:`~IPython.core.embed.InteractiveShellEmbed` instance and + create an :class:`~IPython.frontend.terminal.embed.InteractiveShellEmbed` instance and call it. In later calls, it just calls the previously created - :class:`~IPython.core.embed.InteractiveShellEmbed`. + :class:`~IPython.frontend.terminal.embed.InteractiveShellEmbed`. * Created a component system (:mod:`IPython.core.component`) that is based on :mod:`IPython.utils.traitlets`. Components are arranged into a runtime