##// END OF EJS Templates
Replace links to launchpad bugs in comments/docstrings with equivalent github links.
Replace links to launchpad bugs in comments/docstrings with equivalent github links.

File last commit:

r3873:6549d091
r3917:03c097c7
Show More
scheduler.py
665 lines | 24.7 KiB | text/x-python | PythonLexer
MinRK
general parallel code cleanup
r3556 """The Python scheduler for rich scheduling.
The Pure ZMQ scheduler does not allow routing schemes other than LRU,
nor does it check msg_id DAG dependencies. For those, a slightly slower
Python Scheduler exists.
"""
MinRK
copyright statements
r3660 #-----------------------------------------------------------------------------
# Copyright (C) 2010-2011 The IPython Development Team
#
# Distributed under the terms of the BSD License. The full license is in
# the file COPYING, distributed as part of this software.
#-----------------------------------------------------------------------------
MinRK
general parallel code cleanup
r3556
MinRK
added dependencies & Python scheduler
r3548 #----------------------------------------------------------------------
# Imports
#----------------------------------------------------------------------
MinRK
use print_function
r3553 from __future__ import print_function
MinRK
resort imports in a cleaner order
r3631
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603 import logging
MinRK
resort imports in a cleaner order
r3631 import sys
from datetime import datetime, timedelta
MinRK
add timeout for unmet dependencies in task scheduler
r3611 from random import randint, random
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603 from types import FunctionType
MinRK
resort imports in a cleaner order
r3631
MinRK
added dependencies & Python scheduler
r3548 try:
import numpy
except ImportError:
numpy = None
import zmq
from zmq.eventloop import ioloop, zmqstream
# local imports
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603 from IPython.external.decorator import decorator
MinRK
pass config obj to Scheduler as dict...
r3773 from IPython.config.loader import Config
MinRK
add HWM to TaskScheduler...
r3781 from IPython.utils.traitlets import Instance, Dict, List, Set, Int
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603
MinRK
organize IPython.parallel into subpackages
r3673 from IPython.parallel import error
from IPython.parallel.factory import SessionFactory
from IPython.parallel.util import connect_logger, local_logger
MinRK
added dependencies & Python scheduler
r3548
MinRK
organize IPython.parallel into subpackages
r3673 from .dependency import Dependency
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603
MinRK
added dependencies & Python scheduler
r3548 @decorator
def logged(f,self,*args,**kwargs):
MinRK
propagate iopub to clients
r3602 # print ("#--------------------")
MinRK
rework logging connections
r3610 self.log.debug("scheduler::%s(*%s,**%s)"%(f.func_name, args, kwargs))
MinRK
propagate iopub to clients
r3602 # print ("#--")
MinRK
added dependencies & Python scheduler
r3548 return f(self,*args, **kwargs)
#----------------------------------------------------------------------
# Chooser functions
#----------------------------------------------------------------------
def plainrandom(loads):
"""Plain random pick."""
n = len(loads)
return randint(0,n-1)
def lru(loads):
"""Always pick the front of the line.
MinRK
general parallel code cleanup
r3556 The content of `loads` is ignored.
MinRK
added dependencies & Python scheduler
r3548
Assumes LRU ordering of loads, with oldest first.
"""
return 0
def twobin(loads):
"""Pick two at random, use the LRU of the two.
The content of loads is ignored.
Assumes LRU ordering of loads, with oldest first.
"""
n = len(loads)
a = randint(0,n-1)
b = randint(0,n-1)
return min(a,b)
def weighted(loads):
"""Pick two at random using inverse load as weight.
Return the less loaded of the two.
"""
# weight 0 a million times more than 1:
weights = 1./(1e-6+numpy.array(loads))
sums = weights.cumsum()
t = sums[-1]
x = random()*t
y = random()*t
idx = 0
idy = 0
while sums[idx] < x:
idx += 1
while sums[idy] < y:
idy += 1
if weights[idy] > weights[idx]:
return idy
else:
return idx
def leastload(loads):
"""Always choose the lowest load.
If the lowest load occurs more than once, the first
occurance will be used. If loads has LRU ordering, this means
the LRU of those with the lowest load is chosen.
"""
return loads.index(min(loads))
#---------------------------------------------------------------------
# Classes
#---------------------------------------------------------------------
MinRK
Improvements to dependency handling...
r3607 # store empty default dependency:
MET = Dependency([])
MinRK
add timeout for unmet dependencies in task scheduler
r3611 class TaskScheduler(SessionFactory):
MinRK
scheduler progress
r3551 """Python TaskScheduler object.
MinRK
added dependencies & Python scheduler
r3548
This is the simplest object that supports msg_id based
DAG dependencies. *Only* task msg_ids are checked, not
msg_ids of jobs submitted via the MUX queue.
"""
MinRK
add HWM to TaskScheduler...
r3781 hwm = Int(0, config=True) # limit number of outstanding tasks
MinRK
Refactor newparallel to use Config system...
r3604 # input arguments:
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603 scheme = Instance(FunctionType, default=leastload) # function for determining the destination
client_stream = Instance(zmqstream.ZMQStream) # client-facing stream
engine_stream = Instance(zmqstream.ZMQStream) # engine-facing stream
notifier_stream = Instance(zmqstream.ZMQStream) # hub-facing sub stream
mon_stream = Instance(zmqstream.ZMQStream) # hub-facing pub stream
# internals:
MinRK
dependency tweaks + dependency/scheduler docs
r3624 graph = Dict() # dict by msg_id of [ msg_ids that depend on key ]
MinRK
add retries flag to LoadBalancedView...
r3873 retries = Dict() # dict by msg_id of retries remaining (non-neg ints)
MinRK
add HWM to TaskScheduler...
r3781 # waiting = List() # list of msg_ids ready to run, but haven't due to HWM
MinRK
Refactor newparallel to use Config system...
r3604 depending = Dict() # dict by msg_id of (msg_id, raw_msg, after, follow)
pending = Dict() # dict by engine_uuid of submitted tasks
completed = Dict() # dict by engine_uuid of completed tasks
MinRK
Improvements to dependency handling...
r3607 failed = Dict() # dict by engine_uuid of failed tasks
destinations = Dict() # dict by msg_id of engine_uuids where jobs ran (reverse of completed+failed)
MinRK
Refactor newparallel to use Config system...
r3604 clients = Dict() # dict by msg_id for who submitted the task
targets = List() # list of target IDENTs
loads = List() # list of engine loads
MinRK
add HWM to TaskScheduler...
r3781 # full = Set() # set of IDENTs that have HWM outstanding tasks
MinRK
Improvements to dependency handling...
r3607 all_completed = Set() # set of all completed tasks
all_failed = Set() # set of all failed tasks
all_done = Set() # set of all finished tasks=union(completed,failed)
MinRK
dependency tweaks + dependency/scheduler docs
r3624 all_ids = Set() # set of all submitted task IDs
MinRK
Refactor newparallel to use Config system...
r3604 blacklist = Dict() # dict by msg_id of locations where a job has encountered UnmetDependency
MinRK
add timeout for unmet dependencies in task scheduler
r3611 auditor = Instance('zmq.eventloop.ioloop.PeriodicCallback')
MinRK
added dependencies & Python scheduler
r3548
MinRK
add timeout for unmet dependencies in task scheduler
r3611 def start(self):
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603 self.engine_stream.on_recv(self.dispatch_result, copy=False)
MinRK
added dependencies & Python scheduler
r3548 self._notification_handlers = dict(
registration_notification = self._register_engine,
unregistration_notification = self._unregister_engine
)
self.notifier_stream.on_recv(self.dispatch_notification)
MinRK
tasks on engines when they die fail instead of hang...
r3612 self.auditor = ioloop.PeriodicCallback(self.audit_timeouts, 2e3, self.loop) # 1 Hz
MinRK
add timeout for unmet dependencies in task scheduler
r3611 self.auditor.start()
MinRK
rework logging connections
r3610 self.log.info("Scheduler started...%r"%self)
MinRK
added dependencies & Python scheduler
r3548
def resume_receiving(self):
MinRK
general parallel code cleanup
r3556 """Resume accepting jobs."""
MinRK
added dependencies & Python scheduler
r3548 self.client_stream.on_recv(self.dispatch_submission, copy=False)
def stop_receiving(self):
MinRK
general parallel code cleanup
r3556 """Stop accepting jobs while there are no engines.
Leave them in the ZMQ queue."""
MinRK
added dependencies & Python scheduler
r3548 self.client_stream.on_recv(None)
#-----------------------------------------------------------------------
# [Un]Registration Handling
#-----------------------------------------------------------------------
def dispatch_notification(self, msg):
"""dispatch register/unregister events."""
idents,msg = self.session.feed_identities(msg)
msg = self.session.unpack_message(msg)
msg_type = msg['msg_type']
handler = self._notification_handlers.get(msg_type, None)
if handler is None:
raise Exception("Unhandled message type: %s"%msg_type)
else:
try:
handler(str(msg['content']['queue']))
except KeyError:
MinRK
rework logging connections
r3610 self.log.error("task::Invalid notification msg: %s"%msg)
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603
MinRK
added dependencies & Python scheduler
r3548 @logged
def _register_engine(self, uid):
MinRK
general parallel code cleanup
r3556 """New engine with ident `uid` became available."""
MinRK
added dependencies & Python scheduler
r3548 # head of the line:
self.targets.insert(0,uid)
self.loads.insert(0,0)
# initialize sets
self.completed[uid] = set()
MinRK
Improvements to dependency handling...
r3607 self.failed[uid] = set()
MinRK
added dependencies & Python scheduler
r3548 self.pending[uid] = {}
if len(self.targets) == 1:
self.resume_receiving()
MinRK
add retries flag to LoadBalancedView...
r3873 # rescan the graph:
self.update_graph(None)
MinRK
added dependencies & Python scheduler
r3548
def _unregister_engine(self, uid):
MinRK
general parallel code cleanup
r3556 """Existing engine with ident `uid` became unavailable."""
MinRK
added dependencies & Python scheduler
r3548 if len(self.targets) == 1:
MinRK
general parallel code cleanup
r3556 # this was our only engine
MinRK
added dependencies & Python scheduler
r3548 self.stop_receiving()
MinRK
general parallel code cleanup
r3556
# handle any potentially finished tasks:
MinRK
added dependencies & Python scheduler
r3548 self.engine_stream.flush()
MinRK
add retries flag to LoadBalancedView...
r3873 # don't pop destinations, because they might be used later
MinRK
Improvements to dependency handling...
r3607 # map(self.destinations.pop, self.completed.pop(uid))
# map(self.destinations.pop, self.failed.pop(uid))
MinRK
add retries flag to LoadBalancedView...
r3873
# prevent this engine from receiving work
MinRK
added dependencies & Python scheduler
r3548 idx = self.targets.index(uid)
self.targets.pop(idx)
self.loads.pop(idx)
MinRK
tasks on engines when they die fail instead of hang...
r3612 # wait 5 seconds before cleaning up pending jobs, since the results might
# still be incoming
if self.pending[uid]:
dc = ioloop.DelayedCallback(lambda : self.handle_stranded_tasks(uid), 5000, self.loop)
dc.start()
MinRK
add retries flag to LoadBalancedView...
r3873 else:
self.completed.pop(uid)
self.failed.pop(uid)
MinRK
added dependencies & Python scheduler
r3548
MinRK
tasks on engines when they die fail instead of hang...
r3612 @logged
def handle_stranded_tasks(self, engine):
MinRK
general parallel code cleanup
r3556 """Deal with jobs resident in an engine that died."""
MinRK
add retries flag to LoadBalancedView...
r3873 lost = self.pending[engine]
for msg_id in lost.keys():
if msg_id not in self.pending[engine]:
# prevent double-handling of messages
continue
raw_msg = lost[msg_id][0]
MinRK
tasks on engines when they die fail instead of hang...
r3612 idents,msg = self.session.feed_identities(raw_msg, copy=False)
msg = self.session.unpack_message(msg, copy=False, content=False)
parent = msg['header']
MinRK
add retries flag to LoadBalancedView...
r3873 idents = [engine, idents[0]]
# build fake error reply
MinRK
tasks on engines when they die fail instead of hang...
r3612 try:
raise error.EngineError("Engine %r died while running task %r"%(engine, msg_id))
except:
MinRK
cleanup pass
r3644 content = error.wrap_exception()
MinRK
add retries flag to LoadBalancedView...
r3873 msg = self.session.msg('apply_reply', content, parent=parent, subheader={'status':'error'})
raw_reply = map(zmq.Message, self.session.serialize(msg, ident=idents))
# and dispatch it
self.dispatch_result(raw_reply)
# finally scrub completed/failed lists
self.completed.pop(engine)
self.failed.pop(engine)
MinRK
added dependencies & Python scheduler
r3548
#-----------------------------------------------------------------------
# Job Submission
#-----------------------------------------------------------------------
@logged
def dispatch_submission(self, raw_msg):
MinRK
general parallel code cleanup
r3556 """Dispatch job submission to appropriate handlers."""
MinRK
added dependencies & Python scheduler
r3548 # ensure targets up to date:
self.notifier_stream.flush()
try:
idents, msg = self.session.feed_identities(raw_msg, copy=False)
MinRK
dependency tweaks + dependency/scheduler docs
r3624 msg = self.session.unpack_message(msg, content=False, copy=False)
MinRK
add HWM to TaskScheduler...
r3781 except Exception:
MinRK
dependency tweaks + dependency/scheduler docs
r3624 self.log.error("task::Invaid task: %s"%raw_msg, exc_info=True)
MinRK
added dependencies & Python scheduler
r3548 return
MinRK
quiet down scheduler printing, fix dep_id check in update_dependencies
r3563 # send to monitor
self.mon_stream.send_multipart(['intask']+raw_msg, copy=False)
MinRK
added dependencies & Python scheduler
r3548 header = msg['header']
msg_id = header['msg_id']
MinRK
dependency tweaks + dependency/scheduler docs
r3624 self.all_ids.add(msg_id)
MinRK
general parallel code cleanup
r3556
MinRK
allow load-balancing across subsets of engines
r3625 # targets
targets = set(header.get('targets', []))
MinRK
add retries flag to LoadBalancedView...
r3873 retries = header.get('retries', 0)
self.retries[msg_id] = retries
MinRK
allow load-balancing across subsets of engines
r3625
MinRK
general parallel code cleanup
r3556 # time dependencies
MinRK
added dependencies & Python scheduler
r3548 after = Dependency(header.get('after', []))
MinRK
dependency tweaks + dependency/scheduler docs
r3624 if after.all:
MinRK
update API after sagedays29...
r3664 if after.success:
after.difference_update(self.all_completed)
if after.failure:
MinRK
Improvements to dependency handling...
r3607 after.difference_update(self.all_failed)
if after.check(self.all_completed, self.all_failed):
MinRK
general parallel code cleanup
r3556 # recast as empty set, if `after` already met,
# to prevent unnecessary set comparisons
MinRK
Improvements to dependency handling...
r3607 after = MET
MinRK
added dependencies & Python scheduler
r3548
MinRK
general parallel code cleanup
r3556 # location dependencies
MinRK
added dependencies & Python scheduler
r3548 follow = Dependency(header.get('follow', []))
MinRK
dependency tweaks + dependency/scheduler docs
r3624
MinRK
allow load-balancing across subsets of engines
r3625 # turn timeouts into datetime objects:
timeout = header.get('timeout', None)
if timeout:
timeout = datetime.now() + timedelta(0,timeout,0)
args = [raw_msg, targets, after, follow, timeout]
# validate and reduce dependencies:
MinRK
dependency tweaks + dependency/scheduler docs
r3624 for dep in after,follow:
# check valid:
if msg_id in dep or dep.difference(self.all_ids):
MinRK
allow load-balancing across subsets of engines
r3625 self.depending[msg_id] = args
MinRK
dependency tweaks + dependency/scheduler docs
r3624 return self.fail_unreachable(msg_id, error.InvalidDependency)
# check if unreachable:
MinRK
update API after sagedays29...
r3664 if dep.unreachable(self.all_completed, self.all_failed):
MinRK
allow load-balancing across subsets of engines
r3625 self.depending[msg_id] = args
MinRK
dependency tweaks + dependency/scheduler docs
r3624 return self.fail_unreachable(msg_id)
MinRK
Improvements to dependency handling...
r3607
if after.check(self.all_completed, self.all_failed):
MinRK
added dependencies & Python scheduler
r3548 # time deps already met, try to run
MinRK
allow load-balancing across subsets of engines
r3625 if not self.maybe_run(msg_id, *args):
MinRK
added dependencies & Python scheduler
r3548 # can't run yet
MinRK
add retries flag to LoadBalancedView...
r3873 if msg_id not in self.all_failed:
# could have failed as unreachable
self.save_unmet(msg_id, *args)
MinRK
added dependencies & Python scheduler
r3548 else:
MinRK
allow load-balancing across subsets of engines
r3625 self.save_unmet(msg_id, *args)
MinRK
general parallel code cleanup
r3556
MinRK
newparallel tweaks, fixes...
r3622 # @logged
MinRK
add timeout for unmet dependencies in task scheduler
r3611 def audit_timeouts(self):
"""Audit all waiting tasks for expired timeouts."""
now = datetime.now()
for msg_id in self.depending.keys():
# must recheck, in case one failure cascaded to another:
if msg_id in self.depending:
MinRK
allow load-balancing across subsets of engines
r3625 raw,after,targets,follow,timeout = self.depending[msg_id]
MinRK
add timeout for unmet dependencies in task scheduler
r3611 if timeout and timeout < now:
MinRK
add retries flag to LoadBalancedView...
r3873 self.fail_unreachable(msg_id, error.TaskTimeout)
MinRK
add timeout for unmet dependencies in task scheduler
r3611
@logged
MinRK
dependency tweaks + dependency/scheduler docs
r3624 def fail_unreachable(self, msg_id, why=error.ImpossibleDependency):
MinRK
allow load-balancing across subsets of engines
r3625 """a task has become unreachable, send a reply with an ImpossibleDependency
error."""
MinRK
Improvements to dependency handling...
r3607 if msg_id not in self.depending:
MinRK
rework logging connections
r3610 self.log.error("msg %r already failed!"%msg_id)
MinRK
Improvements to dependency handling...
r3607 return
MinRK
allow load-balancing across subsets of engines
r3625 raw_msg,targets,after,follow,timeout = self.depending.pop(msg_id)
MinRK
Improvements to dependency handling...
r3607 for mid in follow.union(after):
MinRK
dependency tweaks + dependency/scheduler docs
r3624 if mid in self.graph:
self.graph[mid].remove(msg_id)
MinRK
Improvements to dependency handling...
r3607
MinRK
add timeout for unmet dependencies in task scheduler
r3611 # FIXME: unpacking a message I've already unpacked, but didn't save:
MinRK
Improvements to dependency handling...
r3607 idents,msg = self.session.feed_identities(raw_msg, copy=False)
msg = self.session.unpack_message(msg, copy=False, content=False)
header = msg['header']
try:
MinRK
dependency tweaks + dependency/scheduler docs
r3624 raise why()
MinRK
Improvements to dependency handling...
r3607 except:
MinRK
cleanup pass
r3644 content = error.wrap_exception()
MinRK
Improvements to dependency handling...
r3607
self.all_done.add(msg_id)
self.all_failed.add(msg_id)
msg = self.session.send(self.client_stream, 'apply_reply', content,
parent=header, ident=idents)
self.session.send(self.mon_stream, msg, ident=['outtask']+idents)
MinRK
dependency tweaks + dependency/scheduler docs
r3624 self.update_graph(msg_id, success=False)
MinRK
Improvements to dependency handling...
r3607
@logged
MinRK
allow load-balancing across subsets of engines
r3625 def maybe_run(self, msg_id, raw_msg, targets, after, follow, timeout):
MinRK
added dependencies & Python scheduler
r3548 """check location dependencies, and run if they are met."""
MinRK
allow load-balancing across subsets of engines
r3625 blacklist = self.blacklist.setdefault(msg_id, set())
MinRK
add HWM to TaskScheduler...
r3781 if follow or targets or blacklist or self.hwm:
MinRK
allow load-balancing across subsets of engines
r3625 # we need a can_run filter
MinRK
added dependencies & Python scheduler
r3548 def can_run(idx):
MinRK
add HWM to TaskScheduler...
r3781 # check hwm
MinRK
add retries flag to LoadBalancedView...
r3873 if self.hwm and self.loads[idx] == self.hwm:
MinRK
allow load-balancing across subsets of engines
r3625 return False
MinRK
add HWM to TaskScheduler...
r3781 target = self.targets[idx]
MinRK
allow load-balancing across subsets of engines
r3625 # check blacklist
if target in blacklist:
return False
MinRK
add HWM to TaskScheduler...
r3781 # check targets
if targets and target not in targets:
return False
MinRK
allow load-balancing across subsets of engines
r3625 # check follow
return follow.check(self.completed[target], self.failed[target])
MinRK
added dependencies & Python scheduler
r3548
indices = filter(can_run, range(len(self.targets)))
MinRK
add retries flag to LoadBalancedView...
r3873
MinRK
added dependencies & Python scheduler
r3548 if not indices:
MinRK
allow load-balancing across subsets of engines
r3625 # couldn't run
MinRK
dependency tweaks + dependency/scheduler docs
r3624 if follow.all:
MinRK
allow load-balancing across subsets of engines
r3625 # check follow for impossibility
MinRK
Improvements to dependency handling...
r3607 dests = set()
MinRK
update API after sagedays29...
r3664 relevant = set()
if follow.success:
relevant = self.all_completed
if follow.failure:
relevant = relevant.union(self.all_failed)
MinRK
Improvements to dependency handling...
r3607 for m in follow.intersection(relevant):
dests.add(self.destinations[m])
if len(dests) > 1:
MinRK
add retries flag to LoadBalancedView...
r3873 self.depending[msg_id] = (raw_msg, targets, after, follow, timeout)
MinRK
Improvements to dependency handling...
r3607 self.fail_unreachable(msg_id)
MinRK
allow load-balancing across subsets of engines
r3625 return False
if targets:
# check blacklist+targets for impossibility
targets.difference_update(blacklist)
if not targets or not targets.intersection(self.targets):
MinRK
add retries flag to LoadBalancedView...
r3873 self.depending[msg_id] = (raw_msg, targets, after, follow, timeout)
MinRK
allow load-balancing across subsets of engines
r3625 self.fail_unreachable(msg_id)
return False
MinRK
added dependencies & Python scheduler
r3548 return False
else:
indices = None
MinRK
allow load-balancing across subsets of engines
r3625 self.submit_task(msg_id, raw_msg, targets, follow, timeout, indices)
MinRK
added dependencies & Python scheduler
r3548 return True
@logged
MinRK
allow load-balancing across subsets of engines
r3625 def save_unmet(self, msg_id, raw_msg, targets, after, follow, timeout):
MinRK
added dependencies & Python scheduler
r3548 """Save a message for later submission when its dependencies are met."""
MinRK
allow load-balancing across subsets of engines
r3625 self.depending[msg_id] = [raw_msg,targets,after,follow,timeout]
MinRK
Improvements to dependency handling...
r3607 # track the ids in follow or after, but not those already finished
MinRK
added dependencies & Python scheduler
r3548 for dep_id in after.union(follow).difference(self.all_done):
MinRK
dependency tweaks + dependency/scheduler docs
r3624 if dep_id not in self.graph:
self.graph[dep_id] = set()
self.graph[dep_id].add(msg_id)
MinRK
scheduler progress
r3551
MinRK
added dependencies & Python scheduler
r3548 @logged
MinRK
allow load-balancing across subsets of engines
r3625 def submit_task(self, msg_id, raw_msg, targets, follow, timeout, indices=None):
MinRK
general parallel code cleanup
r3556 """Submit a task to any of a subset of our targets."""
MinRK
added dependencies & Python scheduler
r3548 if indices:
loads = [self.loads[i] for i in indices]
else:
loads = self.loads
idx = self.scheme(loads)
if indices:
idx = indices[idx]
target = self.targets[idx]
MinRK
quiet down scheduler printing, fix dep_id check in update_dependencies
r3563 # print (target, map(str, msg[:3]))
MinRK
add HWM to TaskScheduler...
r3781 # send job to the engine
MinRK
scheduler progress
r3551 self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False)
MinRK
tasks on engines when they die fail instead of hang...
r3612 self.engine_stream.send_multipart(raw_msg, copy=False)
MinRK
add HWM to TaskScheduler...
r3781 # update load
MinRK
added dependencies & Python scheduler
r3548 self.add_job(idx)
MinRK
allow load-balancing across subsets of engines
r3625 self.pending[target][msg_id] = (raw_msg, targets, MET, follow, timeout)
MinRK
add HWM to TaskScheduler...
r3781 # notify Hub
MinRK
general parallel code cleanup
r3556 content = dict(msg_id=msg_id, engine_id=target)
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603 self.session.send(self.mon_stream, 'task_destination', content=content,
ident=['tracktask',self.session.session])
MinRK
add HWM to TaskScheduler...
r3781
MinRK
added dependencies & Python scheduler
r3548
#-----------------------------------------------------------------------
# Result Handling
#-----------------------------------------------------------------------
@logged
def dispatch_result(self, raw_msg):
MinRK
allow load-balancing across subsets of engines
r3625 """dispatch method for result replies"""
MinRK
added dependencies & Python scheduler
r3548 try:
idents,msg = self.session.feed_identities(raw_msg, copy=False)
MinRK
dependency tweaks + dependency/scheduler docs
r3624 msg = self.session.unpack_message(msg, content=False, copy=False)
MinRK
add HWM to TaskScheduler...
r3781 engine = idents[0]
MinRK
add retries flag to LoadBalancedView...
r3873 try:
idx = self.targets.index(engine)
except ValueError:
pass # skip load-update for dead engines
else:
self.finish_job(idx)
MinRK
add HWM to TaskScheduler...
r3781 except Exception:
MinRK
dependency tweaks + dependency/scheduler docs
r3624 self.log.error("task::Invaid result: %s"%raw_msg, exc_info=True)
MinRK
added dependencies & Python scheduler
r3548 return
MinRK
add HWM to TaskScheduler...
r3781
MinRK
added dependencies & Python scheduler
r3548 header = msg['header']
MinRK
add retries flag to LoadBalancedView...
r3873 parent = msg['parent_header']
MinRK
added dependencies & Python scheduler
r3548 if header.get('dependencies_met', True):
MinRK
Improvements to dependency handling...
r3607 success = (header['status'] == 'ok')
MinRK
add retries flag to LoadBalancedView...
r3873 msg_id = parent['msg_id']
retries = self.retries[msg_id]
if not success and retries > 0:
# failed
self.retries[msg_id] = retries - 1
self.handle_unmet_dependency(idents, parent)
else:
del self.retries[msg_id]
# relay to client and update graph
self.handle_result(idents, parent, raw_msg, success)
# send to Hub monitor
self.mon_stream.send_multipart(['outtask']+raw_msg, copy=False)
MinRK
added dependencies & Python scheduler
r3548 else:
MinRK
add retries flag to LoadBalancedView...
r3873 self.handle_unmet_dependency(idents, parent)
MinRK
added dependencies & Python scheduler
r3548
@logged
MinRK
Improvements to dependency handling...
r3607 def handle_result(self, idents, parent, raw_msg, success=True):
MinRK
allow load-balancing across subsets of engines
r3625 """handle a real task result, either success or failure"""
MinRK
added dependencies & Python scheduler
r3548 # first, relay result to client
engine = idents[0]
client = idents[1]
# swap_ids for XREP-XREP mirror
raw_msg[:2] = [client,engine]
MinRK
quiet down scheduler printing, fix dep_id check in update_dependencies
r3563 # print (map(str, raw_msg[:4]))
MinRK
added dependencies & Python scheduler
r3548 self.client_stream.send_multipart(raw_msg, copy=False)
# now, update our data structures
msg_id = parent['msg_id']
MinRK
tasks on engines when they die fail instead of hang...
r3612 self.blacklist.pop(msg_id, None)
MinRK
added dependencies & Python scheduler
r3548 self.pending[engine].pop(msg_id)
MinRK
Improvements to dependency handling...
r3607 if success:
self.completed[engine].add(msg_id)
self.all_completed.add(msg_id)
else:
self.failed[engine].add(msg_id)
self.all_failed.add(msg_id)
MinRK
add all completed task IDs to Scheduler.all_done
r3565 self.all_done.add(msg_id)
MinRK
Improvements to dependency handling...
r3607 self.destinations[msg_id] = engine
MinRK
added dependencies & Python scheduler
r3548
MinRK
dependency tweaks + dependency/scheduler docs
r3624 self.update_graph(msg_id, success)
MinRK
added dependencies & Python scheduler
r3548
@logged
def handle_unmet_dependency(self, idents, parent):
MinRK
allow load-balancing across subsets of engines
r3625 """handle an unmet dependency"""
MinRK
added dependencies & Python scheduler
r3548 engine = idents[0]
msg_id = parent['msg_id']
MinRK
allow load-balancing across subsets of engines
r3625
MinRK
added dependencies & Python scheduler
r3548 if msg_id not in self.blacklist:
self.blacklist[msg_id] = set()
self.blacklist[msg_id].add(engine)
MinRK
allow load-balancing across subsets of engines
r3625
args = self.pending[engine].pop(msg_id)
raw,targets,after,follow,timeout = args
if self.blacklist[msg_id] == targets:
self.depending[msg_id] = args
MinRK
add HWM to TaskScheduler...
r3781 self.fail_unreachable(msg_id)
MinRK
allow load-balancing across subsets of engines
r3625 elif not self.maybe_run(msg_id, *args):
MinRK
add retries flag to LoadBalancedView...
r3873 # resubmit failed
if msg_id not in self.all_failed:
# put it back in our dependency tree
self.save_unmet(msg_id, *args)
MinRK
allow load-balancing across subsets of engines
r3625
MinRK
add HWM to TaskScheduler...
r3781 if self.hwm:
MinRK
add retries flag to LoadBalancedView...
r3873 try:
idx = self.targets.index(engine)
except ValueError:
pass # skip load-update for dead engines
else:
if self.loads[idx] == self.hwm-1:
self.update_graph(None)
MinRK
add HWM to TaskScheduler...
r3781
MinRK
Improvements to dependency handling...
r3607
MinRK
added dependencies & Python scheduler
r3548 @logged
MinRK
add HWM to TaskScheduler...
r3781 def update_graph(self, dep_id=None, success=True):
MinRK
added dependencies & Python scheduler
r3548 """dep_id just finished. Update our dependency
MinRK
add HWM to TaskScheduler...
r3781 graph and submit any jobs that just became runable.
MinRK
add retries flag to LoadBalancedView...
r3873 Called with dep_id=None to update entire graph for hwm, but without finishing
MinRK
add HWM to TaskScheduler...
r3781 a task.
"""
MinRK
Improvements to dependency handling...
r3607 # print ("\n\n***********")
# pprint (dep_id)
MinRK
dependency tweaks + dependency/scheduler docs
r3624 # pprint (self.graph)
MinRK
Improvements to dependency handling...
r3607 # pprint (self.depending)
# pprint (self.all_completed)
# pprint (self.all_failed)
# print ("\n\n***********\n\n")
MinRK
add HWM to TaskScheduler...
r3781 # update any jobs that depended on the dependency
jobs = self.graph.pop(dep_id, [])
MinRK
add retries flag to LoadBalancedView...
r3873
# recheck *all* jobs if
# a) we have HWM and an engine just become no longer full
# or b) dep_id was given as None
if dep_id is None or self.hwm and any( [ load==self.hwm-1 for load in self.loads ]):
MinRK
add HWM to TaskScheduler...
r3781 jobs = self.depending.keys()
MinRK
Improvements to dependency handling...
r3607
for msg_id in jobs:
MinRK
allow load-balancing across subsets of engines
r3625 raw_msg, targets, after, follow, timeout = self.depending[msg_id]
MinRK
Improvements to dependency handling...
r3607
MinRK
update API after sagedays29...
r3664 if after.unreachable(self.all_completed, self.all_failed) or follow.unreachable(self.all_completed, self.all_failed):
MinRK
Improvements to dependency handling...
r3607 self.fail_unreachable(msg_id)
elif after.check(self.all_completed, self.all_failed): # time deps met, maybe run
MinRK
allow load-balancing across subsets of engines
r3625 if self.maybe_run(msg_id, raw_msg, targets, MET, follow, timeout):
MinRK
Improvements to dependency handling...
r3607
self.depending.pop(msg_id)
for mid in follow.union(after):
MinRK
dependency tweaks + dependency/scheduler docs
r3624 if mid in self.graph:
self.graph[mid].remove(msg_id)
MinRK
added dependencies & Python scheduler
r3548
#----------------------------------------------------------------------
# methods to be overridden by subclasses
#----------------------------------------------------------------------
def add_job(self, idx):
"""Called after self.targets[idx] just got the job with header.
Override with subclasses. The default ordering is simple LRU.
The default loads are the number of outstanding jobs."""
self.loads[idx] += 1
for lis in (self.targets, self.loads):
lis.append(lis.pop(idx))
def finish_job(self, idx):
"""Called after self.targets[idx] just finished a job.
Override with subclasses."""
self.loads[idx] -= 1
MinRK
dependency tweaks + dependency/scheduler docs
r3624 def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, config=None,logname='ZMQ',
MinRK
update connections and diagrams for reduced sockets
r3658 log_addr=None, loglevel=logging.DEBUG, scheme='lru',
identity=b'task'):
MinRK
added dependencies & Python scheduler
r3548 from zmq.eventloop import ioloop
from zmq.eventloop.zmqstream import ZMQStream
MinRK
pass config obj to Scheduler as dict...
r3773 if config:
# unwrap dict back into Config
config = Config(config)
MinRK
added dependencies & Python scheduler
r3548 ctx = zmq.Context()
loop = ioloop.IOLoop()
ins = ZMQStream(ctx.socket(zmq.XREP),loop)
MinRK
update connections and diagrams for reduced sockets
r3658 ins.setsockopt(zmq.IDENTITY, identity)
MinRK
added dependencies & Python scheduler
r3548 ins.bind(in_addr)
MinRK
update connections and diagrams for reduced sockets
r3658
MinRK
added dependencies & Python scheduler
r3548 outs = ZMQStream(ctx.socket(zmq.XREP),loop)
MinRK
update connections and diagrams for reduced sockets
r3658 outs.setsockopt(zmq.IDENTITY, identity)
MinRK
added dependencies & Python scheduler
r3548 outs.bind(out_addr)
mons = ZMQStream(ctx.socket(zmq.PUB),loop)
mons.connect(mon_addr)
nots = ZMQStream(ctx.socket(zmq.SUB),loop)
nots.setsockopt(zmq.SUBSCRIBE, '')
nots.connect(not_addr)
MinRK
Refactor newparallel to use Config system...
r3604 scheme = globals().get(scheme, None)
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603 # setup logging
if log_addr:
MinRK
rework logging connections
r3610 connect_logger(logname, ctx, log_addr, root="scheduler", loglevel=loglevel)
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603 else:
MinRK
rework logging connections
r3610 local_logger(logname, loglevel)
MinRK
improved logging + Hub,Engine,Scheduler are Configurable
r3603
scheduler = TaskScheduler(client_stream=ins, engine_stream=outs,
MinRK
add timeout for unmet dependencies in task scheduler
r3611 mon_stream=mons, notifier_stream=nots,
MinRK
newparallel tweaks, fixes...
r3622 scheme=scheme, loop=loop, logname=logname,
config=config)
MinRK
add timeout for unmet dependencies in task scheduler
r3611 scheduler.start()
MinRK
Refactor newparallel to use Config system...
r3604 try:
loop.start()
except KeyboardInterrupt:
print ("interrupted, exiting...", file=sys.__stderr__)
MinRK
added dependencies & Python scheduler
r3548