##// END OF EJS Templates
enforce ascii identities in parallel code...
MinRK -
Show More
@@ -1,173 +1,173
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 """
2 """
3 A multi-heart Heartbeat system using PUB and XREP sockets. pings are sent out on the PUB,
3 A multi-heart Heartbeat system using PUB and XREP sockets. pings are sent out on the PUB,
4 and hearts are tracked based on their XREQ identities.
4 and hearts are tracked based on their XREQ identities.
5
5
6 Authors:
6 Authors:
7
7
8 * Min RK
8 * Min RK
9 """
9 """
10 #-----------------------------------------------------------------------------
10 #-----------------------------------------------------------------------------
11 # Copyright (C) 2010-2011 The IPython Development Team
11 # Copyright (C) 2010-2011 The IPython Development Team
12 #
12 #
13 # Distributed under the terms of the BSD License. The full license is in
13 # Distributed under the terms of the BSD License. The full license is in
14 # the file COPYING, distributed as part of this software.
14 # the file COPYING, distributed as part of this software.
15 #-----------------------------------------------------------------------------
15 #-----------------------------------------------------------------------------
16
16
17 from __future__ import print_function
17 from __future__ import print_function
18 import time
18 import time
19 import uuid
19 import uuid
20
20
21 import zmq
21 import zmq
22 from zmq.devices import ThreadDevice
22 from zmq.devices import ThreadDevice
23 from zmq.eventloop import ioloop, zmqstream
23 from zmq.eventloop import ioloop, zmqstream
24
24
25 from IPython.config.configurable import LoggingConfigurable
25 from IPython.config.configurable import LoggingConfigurable
26 from IPython.utils.traitlets import Set, Instance, CFloat
26 from IPython.utils.traitlets import Set, Instance, CFloat
27
27
28 from IPython.parallel.util import ensure_bytes
28 from IPython.parallel.util import ensure_bytes
29
29
30 class Heart(object):
30 class Heart(object):
31 """A basic heart object for responding to a HeartMonitor.
31 """A basic heart object for responding to a HeartMonitor.
32 This is a simple wrapper with defaults for the most common
32 This is a simple wrapper with defaults for the most common
33 Device model for responding to heartbeats.
33 Device model for responding to heartbeats.
34
34
35 It simply builds a threadsafe zmq.FORWARDER Device, defaulting to using
35 It simply builds a threadsafe zmq.FORWARDER Device, defaulting to using
36 SUB/XREQ for in/out.
36 SUB/XREQ for in/out.
37
37
38 You can specify the XREQ's IDENTITY via the optional heart_id argument."""
38 You can specify the XREQ's IDENTITY via the optional heart_id argument."""
39 device=None
39 device=None
40 id=None
40 id=None
41 def __init__(self, in_addr, out_addr, in_type=zmq.SUB, out_type=zmq.XREQ, heart_id=None):
41 def __init__(self, in_addr, out_addr, in_type=zmq.SUB, out_type=zmq.XREQ, heart_id=None):
42 self.device = ThreadDevice(zmq.FORWARDER, in_type, out_type)
42 self.device = ThreadDevice(zmq.FORWARDER, in_type, out_type)
43 self.device.daemon=True
43 self.device.daemon=True
44 self.device.connect_in(in_addr)
44 self.device.connect_in(in_addr)
45 self.device.connect_out(out_addr)
45 self.device.connect_out(out_addr)
46 if in_type == zmq.SUB:
46 if in_type == zmq.SUB:
47 self.device.setsockopt_in(zmq.SUBSCRIBE, b"")
47 self.device.setsockopt_in(zmq.SUBSCRIBE, b"")
48 if heart_id is None:
48 if heart_id is None:
49 heart_id = ensure_bytes(uuid.uuid4())
49 heart_id = uuid.uuid4().bytes
50 self.device.setsockopt_out(zmq.IDENTITY, heart_id)
50 self.device.setsockopt_out(zmq.IDENTITY, heart_id)
51 self.id = heart_id
51 self.id = heart_id
52
52
53 def start(self):
53 def start(self):
54 return self.device.start()
54 return self.device.start()
55
55
56 class HeartMonitor(LoggingConfigurable):
56 class HeartMonitor(LoggingConfigurable):
57 """A basic HeartMonitor class
57 """A basic HeartMonitor class
58 pingstream: a PUB stream
58 pingstream: a PUB stream
59 pongstream: an XREP stream
59 pongstream: an XREP stream
60 period: the period of the heartbeat in milliseconds"""
60 period: the period of the heartbeat in milliseconds"""
61
61
62 period=CFloat(1000, config=True,
62 period=CFloat(1000, config=True,
63 help='The frequency at which the Hub pings the engines for heartbeats '
63 help='The frequency at which the Hub pings the engines for heartbeats '
64 ' (in ms) [default: 100]',
64 ' (in ms) [default: 100]',
65 )
65 )
66
66
67 pingstream=Instance('zmq.eventloop.zmqstream.ZMQStream')
67 pingstream=Instance('zmq.eventloop.zmqstream.ZMQStream')
68 pongstream=Instance('zmq.eventloop.zmqstream.ZMQStream')
68 pongstream=Instance('zmq.eventloop.zmqstream.ZMQStream')
69 loop = Instance('zmq.eventloop.ioloop.IOLoop')
69 loop = Instance('zmq.eventloop.ioloop.IOLoop')
70 def _loop_default(self):
70 def _loop_default(self):
71 return ioloop.IOLoop.instance()
71 return ioloop.IOLoop.instance()
72
72
73 # not settable:
73 # not settable:
74 hearts=Set()
74 hearts=Set()
75 responses=Set()
75 responses=Set()
76 on_probation=Set()
76 on_probation=Set()
77 last_ping=CFloat(0)
77 last_ping=CFloat(0)
78 _new_handlers = Set()
78 _new_handlers = Set()
79 _failure_handlers = Set()
79 _failure_handlers = Set()
80 lifetime = CFloat(0)
80 lifetime = CFloat(0)
81 tic = CFloat(0)
81 tic = CFloat(0)
82
82
83 def __init__(self, **kwargs):
83 def __init__(self, **kwargs):
84 super(HeartMonitor, self).__init__(**kwargs)
84 super(HeartMonitor, self).__init__(**kwargs)
85
85
86 self.pongstream.on_recv(self.handle_pong)
86 self.pongstream.on_recv(self.handle_pong)
87
87
88 def start(self):
88 def start(self):
89 self.caller = ioloop.PeriodicCallback(self.beat, self.period, self.loop)
89 self.caller = ioloop.PeriodicCallback(self.beat, self.period, self.loop)
90 self.caller.start()
90 self.caller.start()
91
91
92 def add_new_heart_handler(self, handler):
92 def add_new_heart_handler(self, handler):
93 """add a new handler for new hearts"""
93 """add a new handler for new hearts"""
94 self.log.debug("heartbeat::new_heart_handler: %s"%handler)
94 self.log.debug("heartbeat::new_heart_handler: %s"%handler)
95 self._new_handlers.add(handler)
95 self._new_handlers.add(handler)
96
96
97 def add_heart_failure_handler(self, handler):
97 def add_heart_failure_handler(self, handler):
98 """add a new handler for heart failure"""
98 """add a new handler for heart failure"""
99 self.log.debug("heartbeat::new heart failure handler: %s"%handler)
99 self.log.debug("heartbeat::new heart failure handler: %s"%handler)
100 self._failure_handlers.add(handler)
100 self._failure_handlers.add(handler)
101
101
102 def beat(self):
102 def beat(self):
103 self.pongstream.flush()
103 self.pongstream.flush()
104 self.last_ping = self.lifetime
104 self.last_ping = self.lifetime
105
105
106 toc = time.time()
106 toc = time.time()
107 self.lifetime += toc-self.tic
107 self.lifetime += toc-self.tic
108 self.tic = toc
108 self.tic = toc
109 # self.log.debug("heartbeat::%s"%self.lifetime)
109 # self.log.debug("heartbeat::%s"%self.lifetime)
110 goodhearts = self.hearts.intersection(self.responses)
110 goodhearts = self.hearts.intersection(self.responses)
111 missed_beats = self.hearts.difference(goodhearts)
111 missed_beats = self.hearts.difference(goodhearts)
112 heartfailures = self.on_probation.intersection(missed_beats)
112 heartfailures = self.on_probation.intersection(missed_beats)
113 newhearts = self.responses.difference(goodhearts)
113 newhearts = self.responses.difference(goodhearts)
114 map(self.handle_new_heart, newhearts)
114 map(self.handle_new_heart, newhearts)
115 map(self.handle_heart_failure, heartfailures)
115 map(self.handle_heart_failure, heartfailures)
116 self.on_probation = missed_beats.intersection(self.hearts)
116 self.on_probation = missed_beats.intersection(self.hearts)
117 self.responses = set()
117 self.responses = set()
118 # print self.on_probation, self.hearts
118 # print self.on_probation, self.hearts
119 # self.log.debug("heartbeat::beat %.3f, %i beating hearts"%(self.lifetime, len(self.hearts)))
119 # self.log.debug("heartbeat::beat %.3f, %i beating hearts"%(self.lifetime, len(self.hearts)))
120 self.pingstream.send(ensure_bytes(str(self.lifetime)))
120 self.pingstream.send(ensure_bytes(str(self.lifetime)))
121
121
122 def handle_new_heart(self, heart):
122 def handle_new_heart(self, heart):
123 if self._new_handlers:
123 if self._new_handlers:
124 for handler in self._new_handlers:
124 for handler in self._new_handlers:
125 handler(heart)
125 handler(heart)
126 else:
126 else:
127 self.log.info("heartbeat::yay, got new heart %s!"%heart)
127 self.log.info("heartbeat::yay, got new heart %s!"%heart)
128 self.hearts.add(heart)
128 self.hearts.add(heart)
129
129
130 def handle_heart_failure(self, heart):
130 def handle_heart_failure(self, heart):
131 if self._failure_handlers:
131 if self._failure_handlers:
132 for handler in self._failure_handlers:
132 for handler in self._failure_handlers:
133 try:
133 try:
134 handler(heart)
134 handler(heart)
135 except Exception as e:
135 except Exception as e:
136 self.log.error("heartbeat::Bad Handler! %s"%handler, exc_info=True)
136 self.log.error("heartbeat::Bad Handler! %s"%handler, exc_info=True)
137 pass
137 pass
138 else:
138 else:
139 self.log.info("heartbeat::Heart %s failed :("%heart)
139 self.log.info("heartbeat::Heart %s failed :("%heart)
140 self.hearts.remove(heart)
140 self.hearts.remove(heart)
141
141
142
142
143 def handle_pong(self, msg):
143 def handle_pong(self, msg):
144 "a heart just beat"
144 "a heart just beat"
145 current = ensure_bytes(str(self.lifetime))
145 current = ensure_bytes(str(self.lifetime))
146 last = ensure_bytes(str(self.last_ping))
146 last = ensure_bytes(str(self.last_ping))
147 if msg[1] == current:
147 if msg[1] == current:
148 delta = time.time()-self.tic
148 delta = time.time()-self.tic
149 # self.log.debug("heartbeat::heart %r took %.2f ms to respond"%(msg[0], 1000*delta))
149 # self.log.debug("heartbeat::heart %r took %.2f ms to respond"%(msg[0], 1000*delta))
150 self.responses.add(msg[0])
150 self.responses.add(msg[0])
151 elif msg[1] == last:
151 elif msg[1] == last:
152 delta = time.time()-self.tic + (self.lifetime-self.last_ping)
152 delta = time.time()-self.tic + (self.lifetime-self.last_ping)
153 self.log.warn("heartbeat::heart %r missed a beat, and took %.2f ms to respond"%(msg[0], 1000*delta))
153 self.log.warn("heartbeat::heart %r missed a beat, and took %.2f ms to respond"%(msg[0], 1000*delta))
154 self.responses.add(msg[0])
154 self.responses.add(msg[0])
155 else:
155 else:
156 self.log.warn("heartbeat::got bad heartbeat (possibly old?): %s (current=%.3f)"%
156 self.log.warn("heartbeat::got bad heartbeat (possibly old?): %s (current=%.3f)"%
157 (msg[1],self.lifetime))
157 (msg[1],self.lifetime))
158
158
159
159
160 if __name__ == '__main__':
160 if __name__ == '__main__':
161 loop = ioloop.IOLoop.instance()
161 loop = ioloop.IOLoop.instance()
162 context = zmq.Context()
162 context = zmq.Context()
163 pub = context.socket(zmq.PUB)
163 pub = context.socket(zmq.PUB)
164 pub.bind('tcp://127.0.0.1:5555')
164 pub.bind('tcp://127.0.0.1:5555')
165 xrep = context.socket(zmq.XREP)
165 xrep = context.socket(zmq.XREP)
166 xrep.bind('tcp://127.0.0.1:5556')
166 xrep.bind('tcp://127.0.0.1:5556')
167
167
168 outstream = zmqstream.ZMQStream(pub, loop)
168 outstream = zmqstream.ZMQStream(pub, loop)
169 instream = zmqstream.ZMQStream(xrep, loop)
169 instream = zmqstream.ZMQStream(xrep, loop)
170
170
171 hb = HeartMonitor(loop, outstream, instream)
171 hb = HeartMonitor(loop, outstream, instream)
172
172
173 loop.start()
173 loop.start()
@@ -1,1291 +1,1291
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 """The IPython Controller Hub with 0MQ
2 """The IPython Controller Hub with 0MQ
3 This is the master object that handles connections from engines and clients,
3 This is the master object that handles connections from engines and clients,
4 and monitors traffic through the various queues.
4 and monitors traffic through the various queues.
5
5
6 Authors:
6 Authors:
7
7
8 * Min RK
8 * Min RK
9 """
9 """
10 #-----------------------------------------------------------------------------
10 #-----------------------------------------------------------------------------
11 # Copyright (C) 2010 The IPython Development Team
11 # Copyright (C) 2010 The IPython Development Team
12 #
12 #
13 # Distributed under the terms of the BSD License. The full license is in
13 # Distributed under the terms of the BSD License. The full license is in
14 # the file COPYING, distributed as part of this software.
14 # the file COPYING, distributed as part of this software.
15 #-----------------------------------------------------------------------------
15 #-----------------------------------------------------------------------------
16
16
17 #-----------------------------------------------------------------------------
17 #-----------------------------------------------------------------------------
18 # Imports
18 # Imports
19 #-----------------------------------------------------------------------------
19 #-----------------------------------------------------------------------------
20 from __future__ import print_function
20 from __future__ import print_function
21
21
22 import sys
22 import sys
23 import time
23 import time
24 from datetime import datetime
24 from datetime import datetime
25
25
26 import zmq
26 import zmq
27 from zmq.eventloop import ioloop
27 from zmq.eventloop import ioloop
28 from zmq.eventloop.zmqstream import ZMQStream
28 from zmq.eventloop.zmqstream import ZMQStream
29
29
30 # internal:
30 # internal:
31 from IPython.utils.importstring import import_item
31 from IPython.utils.importstring import import_item
32 from IPython.utils.traitlets import (
32 from IPython.utils.traitlets import (
33 HasTraits, Instance, Int, Unicode, Dict, Set, Tuple, CBytes, DottedObjectName
33 HasTraits, Instance, Int, Unicode, Dict, Set, Tuple, CBytes, DottedObjectName
34 )
34 )
35
35
36 from IPython.parallel import error, util
36 from IPython.parallel import error, util
37 from IPython.parallel.factory import RegistrationFactory
37 from IPython.parallel.factory import RegistrationFactory
38
38
39 from IPython.zmq.session import SessionFactory
39 from IPython.zmq.session import SessionFactory
40
40
41 from .heartmonitor import HeartMonitor
41 from .heartmonitor import HeartMonitor
42
42
43 #-----------------------------------------------------------------------------
43 #-----------------------------------------------------------------------------
44 # Code
44 # Code
45 #-----------------------------------------------------------------------------
45 #-----------------------------------------------------------------------------
46
46
47 def _passer(*args, **kwargs):
47 def _passer(*args, **kwargs):
48 return
48 return
49
49
50 def _printer(*args, **kwargs):
50 def _printer(*args, **kwargs):
51 print (args)
51 print (args)
52 print (kwargs)
52 print (kwargs)
53
53
54 def empty_record():
54 def empty_record():
55 """Return an empty dict with all record keys."""
55 """Return an empty dict with all record keys."""
56 return {
56 return {
57 'msg_id' : None,
57 'msg_id' : None,
58 'header' : None,
58 'header' : None,
59 'content': None,
59 'content': None,
60 'buffers': None,
60 'buffers': None,
61 'submitted': None,
61 'submitted': None,
62 'client_uuid' : None,
62 'client_uuid' : None,
63 'engine_uuid' : None,
63 'engine_uuid' : None,
64 'started': None,
64 'started': None,
65 'completed': None,
65 'completed': None,
66 'resubmitted': None,
66 'resubmitted': None,
67 'result_header' : None,
67 'result_header' : None,
68 'result_content' : None,
68 'result_content' : None,
69 'result_buffers' : None,
69 'result_buffers' : None,
70 'queue' : None,
70 'queue' : None,
71 'pyin' : None,
71 'pyin' : None,
72 'pyout': None,
72 'pyout': None,
73 'pyerr': None,
73 'pyerr': None,
74 'stdout': '',
74 'stdout': '',
75 'stderr': '',
75 'stderr': '',
76 }
76 }
77
77
78 def init_record(msg):
78 def init_record(msg):
79 """Initialize a TaskRecord based on a request."""
79 """Initialize a TaskRecord based on a request."""
80 header = msg['header']
80 header = msg['header']
81 return {
81 return {
82 'msg_id' : header['msg_id'],
82 'msg_id' : header['msg_id'],
83 'header' : header,
83 'header' : header,
84 'content': msg['content'],
84 'content': msg['content'],
85 'buffers': msg['buffers'],
85 'buffers': msg['buffers'],
86 'submitted': header['date'],
86 'submitted': header['date'],
87 'client_uuid' : None,
87 'client_uuid' : None,
88 'engine_uuid' : None,
88 'engine_uuid' : None,
89 'started': None,
89 'started': None,
90 'completed': None,
90 'completed': None,
91 'resubmitted': None,
91 'resubmitted': None,
92 'result_header' : None,
92 'result_header' : None,
93 'result_content' : None,
93 'result_content' : None,
94 'result_buffers' : None,
94 'result_buffers' : None,
95 'queue' : None,
95 'queue' : None,
96 'pyin' : None,
96 'pyin' : None,
97 'pyout': None,
97 'pyout': None,
98 'pyerr': None,
98 'pyerr': None,
99 'stdout': '',
99 'stdout': '',
100 'stderr': '',
100 'stderr': '',
101 }
101 }
102
102
103
103
104 class EngineConnector(HasTraits):
104 class EngineConnector(HasTraits):
105 """A simple object for accessing the various zmq connections of an object.
105 """A simple object for accessing the various zmq connections of an object.
106 Attributes are:
106 Attributes are:
107 id (int): engine ID
107 id (int): engine ID
108 uuid (str): uuid (unused?)
108 uuid (str): uuid (unused?)
109 queue (str): identity of queue's XREQ socket
109 queue (str): identity of queue's XREQ socket
110 registration (str): identity of registration XREQ socket
110 registration (str): identity of registration XREQ socket
111 heartbeat (str): identity of heartbeat XREQ socket
111 heartbeat (str): identity of heartbeat XREQ socket
112 """
112 """
113 id=Int(0)
113 id=Int(0)
114 queue=CBytes()
114 queue=CBytes()
115 control=CBytes()
115 control=CBytes()
116 registration=CBytes()
116 registration=CBytes()
117 heartbeat=CBytes()
117 heartbeat=CBytes()
118 pending=Set()
118 pending=Set()
119
119
120 class HubFactory(RegistrationFactory):
120 class HubFactory(RegistrationFactory):
121 """The Configurable for setting up a Hub."""
121 """The Configurable for setting up a Hub."""
122
122
123 # port-pairs for monitoredqueues:
123 # port-pairs for monitoredqueues:
124 hb = Tuple(Int,Int,config=True,
124 hb = Tuple(Int,Int,config=True,
125 help="""XREQ/SUB Port pair for Engine heartbeats""")
125 help="""XREQ/SUB Port pair for Engine heartbeats""")
126 def _hb_default(self):
126 def _hb_default(self):
127 return tuple(util.select_random_ports(2))
127 return tuple(util.select_random_ports(2))
128
128
129 mux = Tuple(Int,Int,config=True,
129 mux = Tuple(Int,Int,config=True,
130 help="""Engine/Client Port pair for MUX queue""")
130 help="""Engine/Client Port pair for MUX queue""")
131
131
132 def _mux_default(self):
132 def _mux_default(self):
133 return tuple(util.select_random_ports(2))
133 return tuple(util.select_random_ports(2))
134
134
135 task = Tuple(Int,Int,config=True,
135 task = Tuple(Int,Int,config=True,
136 help="""Engine/Client Port pair for Task queue""")
136 help="""Engine/Client Port pair for Task queue""")
137 def _task_default(self):
137 def _task_default(self):
138 return tuple(util.select_random_ports(2))
138 return tuple(util.select_random_ports(2))
139
139
140 control = Tuple(Int,Int,config=True,
140 control = Tuple(Int,Int,config=True,
141 help="""Engine/Client Port pair for Control queue""")
141 help="""Engine/Client Port pair for Control queue""")
142
142
143 def _control_default(self):
143 def _control_default(self):
144 return tuple(util.select_random_ports(2))
144 return tuple(util.select_random_ports(2))
145
145
146 iopub = Tuple(Int,Int,config=True,
146 iopub = Tuple(Int,Int,config=True,
147 help="""Engine/Client Port pair for IOPub relay""")
147 help="""Engine/Client Port pair for IOPub relay""")
148
148
149 def _iopub_default(self):
149 def _iopub_default(self):
150 return tuple(util.select_random_ports(2))
150 return tuple(util.select_random_ports(2))
151
151
152 # single ports:
152 # single ports:
153 mon_port = Int(config=True,
153 mon_port = Int(config=True,
154 help="""Monitor (SUB) port for queue traffic""")
154 help="""Monitor (SUB) port for queue traffic""")
155
155
156 def _mon_port_default(self):
156 def _mon_port_default(self):
157 return util.select_random_ports(1)[0]
157 return util.select_random_ports(1)[0]
158
158
159 notifier_port = Int(config=True,
159 notifier_port = Int(config=True,
160 help="""PUB port for sending engine status notifications""")
160 help="""PUB port for sending engine status notifications""")
161
161
162 def _notifier_port_default(self):
162 def _notifier_port_default(self):
163 return util.select_random_ports(1)[0]
163 return util.select_random_ports(1)[0]
164
164
165 engine_ip = Unicode('127.0.0.1', config=True,
165 engine_ip = Unicode('127.0.0.1', config=True,
166 help="IP on which to listen for engine connections. [default: loopback]")
166 help="IP on which to listen for engine connections. [default: loopback]")
167 engine_transport = Unicode('tcp', config=True,
167 engine_transport = Unicode('tcp', config=True,
168 help="0MQ transport for engine connections. [default: tcp]")
168 help="0MQ transport for engine connections. [default: tcp]")
169
169
170 client_ip = Unicode('127.0.0.1', config=True,
170 client_ip = Unicode('127.0.0.1', config=True,
171 help="IP on which to listen for client connections. [default: loopback]")
171 help="IP on which to listen for client connections. [default: loopback]")
172 client_transport = Unicode('tcp', config=True,
172 client_transport = Unicode('tcp', config=True,
173 help="0MQ transport for client connections. [default : tcp]")
173 help="0MQ transport for client connections. [default : tcp]")
174
174
175 monitor_ip = Unicode('127.0.0.1', config=True,
175 monitor_ip = Unicode('127.0.0.1', config=True,
176 help="IP on which to listen for monitor messages. [default: loopback]")
176 help="IP on which to listen for monitor messages. [default: loopback]")
177 monitor_transport = Unicode('tcp', config=True,
177 monitor_transport = Unicode('tcp', config=True,
178 help="0MQ transport for monitor messages. [default : tcp]")
178 help="0MQ transport for monitor messages. [default : tcp]")
179
179
180 monitor_url = Unicode('')
180 monitor_url = Unicode('')
181
181
182 db_class = DottedObjectName('IPython.parallel.controller.dictdb.DictDB',
182 db_class = DottedObjectName('IPython.parallel.controller.dictdb.DictDB',
183 config=True, help="""The class to use for the DB backend""")
183 config=True, help="""The class to use for the DB backend""")
184
184
185 # not configurable
185 # not configurable
186 db = Instance('IPython.parallel.controller.dictdb.BaseDB')
186 db = Instance('IPython.parallel.controller.dictdb.BaseDB')
187 heartmonitor = Instance('IPython.parallel.controller.heartmonitor.HeartMonitor')
187 heartmonitor = Instance('IPython.parallel.controller.heartmonitor.HeartMonitor')
188
188
189 def _ip_changed(self, name, old, new):
189 def _ip_changed(self, name, old, new):
190 self.engine_ip = new
190 self.engine_ip = new
191 self.client_ip = new
191 self.client_ip = new
192 self.monitor_ip = new
192 self.monitor_ip = new
193 self._update_monitor_url()
193 self._update_monitor_url()
194
194
195 def _update_monitor_url(self):
195 def _update_monitor_url(self):
196 self.monitor_url = "%s://%s:%i"%(self.monitor_transport, self.monitor_ip, self.mon_port)
196 self.monitor_url = "%s://%s:%i"%(self.monitor_transport, self.monitor_ip, self.mon_port)
197
197
198 def _transport_changed(self, name, old, new):
198 def _transport_changed(self, name, old, new):
199 self.engine_transport = new
199 self.engine_transport = new
200 self.client_transport = new
200 self.client_transport = new
201 self.monitor_transport = new
201 self.monitor_transport = new
202 self._update_monitor_url()
202 self._update_monitor_url()
203
203
204 def __init__(self, **kwargs):
204 def __init__(self, **kwargs):
205 super(HubFactory, self).__init__(**kwargs)
205 super(HubFactory, self).__init__(**kwargs)
206 self._update_monitor_url()
206 self._update_monitor_url()
207
207
208
208
209 def construct(self):
209 def construct(self):
210 self.init_hub()
210 self.init_hub()
211
211
212 def start(self):
212 def start(self):
213 self.heartmonitor.start()
213 self.heartmonitor.start()
214 self.log.info("Heartmonitor started")
214 self.log.info("Heartmonitor started")
215
215
216 def init_hub(self):
216 def init_hub(self):
217 """construct"""
217 """construct"""
218 client_iface = "%s://%s:"%(self.client_transport, self.client_ip) + "%i"
218 client_iface = "%s://%s:"%(self.client_transport, self.client_ip) + "%i"
219 engine_iface = "%s://%s:"%(self.engine_transport, self.engine_ip) + "%i"
219 engine_iface = "%s://%s:"%(self.engine_transport, self.engine_ip) + "%i"
220
220
221 ctx = self.context
221 ctx = self.context
222 loop = self.loop
222 loop = self.loop
223
223
224 # Registrar socket
224 # Registrar socket
225 q = ZMQStream(ctx.socket(zmq.XREP), loop)
225 q = ZMQStream(ctx.socket(zmq.XREP), loop)
226 q.bind(client_iface % self.regport)
226 q.bind(client_iface % self.regport)
227 self.log.info("Hub listening on %s for registration."%(client_iface%self.regport))
227 self.log.info("Hub listening on %s for registration."%(client_iface%self.regport))
228 if self.client_ip != self.engine_ip:
228 if self.client_ip != self.engine_ip:
229 q.bind(engine_iface % self.regport)
229 q.bind(engine_iface % self.regport)
230 self.log.info("Hub listening on %s for registration."%(engine_iface%self.regport))
230 self.log.info("Hub listening on %s for registration."%(engine_iface%self.regport))
231
231
232 ### Engine connections ###
232 ### Engine connections ###
233
233
234 # heartbeat
234 # heartbeat
235 hpub = ctx.socket(zmq.PUB)
235 hpub = ctx.socket(zmq.PUB)
236 hpub.bind(engine_iface % self.hb[0])
236 hpub.bind(engine_iface % self.hb[0])
237 hrep = ctx.socket(zmq.XREP)
237 hrep = ctx.socket(zmq.XREP)
238 hrep.bind(engine_iface % self.hb[1])
238 hrep.bind(engine_iface % self.hb[1])
239 self.heartmonitor = HeartMonitor(loop=loop, config=self.config, log=self.log,
239 self.heartmonitor = HeartMonitor(loop=loop, config=self.config, log=self.log,
240 pingstream=ZMQStream(hpub,loop),
240 pingstream=ZMQStream(hpub,loop),
241 pongstream=ZMQStream(hrep,loop)
241 pongstream=ZMQStream(hrep,loop)
242 )
242 )
243
243
244 ### Client connections ###
244 ### Client connections ###
245 # Notifier socket
245 # Notifier socket
246 n = ZMQStream(ctx.socket(zmq.PUB), loop)
246 n = ZMQStream(ctx.socket(zmq.PUB), loop)
247 n.bind(client_iface%self.notifier_port)
247 n.bind(client_iface%self.notifier_port)
248
248
249 ### build and launch the queues ###
249 ### build and launch the queues ###
250
250
251 # monitor socket
251 # monitor socket
252 sub = ctx.socket(zmq.SUB)
252 sub = ctx.socket(zmq.SUB)
253 sub.setsockopt(zmq.SUBSCRIBE, b"")
253 sub.setsockopt(zmq.SUBSCRIBE, b"")
254 sub.bind(self.monitor_url)
254 sub.bind(self.monitor_url)
255 sub.bind('inproc://monitor')
255 sub.bind('inproc://monitor')
256 sub = ZMQStream(sub, loop)
256 sub = ZMQStream(sub, loop)
257
257
258 # connect the db
258 # connect the db
259 self.log.info('Hub using DB backend: %r'%(self.db_class.split()[-1]))
259 self.log.info('Hub using DB backend: %r'%(self.db_class.split()[-1]))
260 # cdir = self.config.Global.cluster_dir
260 # cdir = self.config.Global.cluster_dir
261 self.db = import_item(str(self.db_class))(session=self.session.session,
261 self.db = import_item(str(self.db_class))(session=self.session.session,
262 config=self.config, log=self.log)
262 config=self.config, log=self.log)
263 time.sleep(.25)
263 time.sleep(.25)
264 try:
264 try:
265 scheme = self.config.TaskScheduler.scheme_name
265 scheme = self.config.TaskScheduler.scheme_name
266 except AttributeError:
266 except AttributeError:
267 from .scheduler import TaskScheduler
267 from .scheduler import TaskScheduler
268 scheme = TaskScheduler.scheme_name.get_default_value()
268 scheme = TaskScheduler.scheme_name.get_default_value()
269 # build connection dicts
269 # build connection dicts
270 self.engine_info = {
270 self.engine_info = {
271 'control' : engine_iface%self.control[1],
271 'control' : engine_iface%self.control[1],
272 'mux': engine_iface%self.mux[1],
272 'mux': engine_iface%self.mux[1],
273 'heartbeat': (engine_iface%self.hb[0], engine_iface%self.hb[1]),
273 'heartbeat': (engine_iface%self.hb[0], engine_iface%self.hb[1]),
274 'task' : engine_iface%self.task[1],
274 'task' : engine_iface%self.task[1],
275 'iopub' : engine_iface%self.iopub[1],
275 'iopub' : engine_iface%self.iopub[1],
276 # 'monitor' : engine_iface%self.mon_port,
276 # 'monitor' : engine_iface%self.mon_port,
277 }
277 }
278
278
279 self.client_info = {
279 self.client_info = {
280 'control' : client_iface%self.control[0],
280 'control' : client_iface%self.control[0],
281 'mux': client_iface%self.mux[0],
281 'mux': client_iface%self.mux[0],
282 'task' : (scheme, client_iface%self.task[0]),
282 'task' : (scheme, client_iface%self.task[0]),
283 'iopub' : client_iface%self.iopub[0],
283 'iopub' : client_iface%self.iopub[0],
284 'notification': client_iface%self.notifier_port
284 'notification': client_iface%self.notifier_port
285 }
285 }
286 self.log.debug("Hub engine addrs: %s"%self.engine_info)
286 self.log.debug("Hub engine addrs: %s"%self.engine_info)
287 self.log.debug("Hub client addrs: %s"%self.client_info)
287 self.log.debug("Hub client addrs: %s"%self.client_info)
288
288
289 # resubmit stream
289 # resubmit stream
290 r = ZMQStream(ctx.socket(zmq.XREQ), loop)
290 r = ZMQStream(ctx.socket(zmq.XREQ), loop)
291 url = util.disambiguate_url(self.client_info['task'][-1])
291 url = util.disambiguate_url(self.client_info['task'][-1])
292 r.setsockopt(zmq.IDENTITY, util.ensure_bytes(self.session.session))
292 r.setsockopt(zmq.IDENTITY, util.ensure_bytes(self.session.session))
293 r.connect(url)
293 r.connect(url)
294
294
295 self.hub = Hub(loop=loop, session=self.session, monitor=sub, heartmonitor=self.heartmonitor,
295 self.hub = Hub(loop=loop, session=self.session, monitor=sub, heartmonitor=self.heartmonitor,
296 query=q, notifier=n, resubmit=r, db=self.db,
296 query=q, notifier=n, resubmit=r, db=self.db,
297 engine_info=self.engine_info, client_info=self.client_info,
297 engine_info=self.engine_info, client_info=self.client_info,
298 log=self.log)
298 log=self.log)
299
299
300
300
301 class Hub(SessionFactory):
301 class Hub(SessionFactory):
302 """The IPython Controller Hub with 0MQ connections
302 """The IPython Controller Hub with 0MQ connections
303
303
304 Parameters
304 Parameters
305 ==========
305 ==========
306 loop: zmq IOLoop instance
306 loop: zmq IOLoop instance
307 session: Session object
307 session: Session object
308 <removed> context: zmq context for creating new connections (?)
308 <removed> context: zmq context for creating new connections (?)
309 queue: ZMQStream for monitoring the command queue (SUB)
309 queue: ZMQStream for monitoring the command queue (SUB)
310 query: ZMQStream for engine registration and client queries requests (XREP)
310 query: ZMQStream for engine registration and client queries requests (XREP)
311 heartbeat: HeartMonitor object checking the pulse of the engines
311 heartbeat: HeartMonitor object checking the pulse of the engines
312 notifier: ZMQStream for broadcasting engine registration changes (PUB)
312 notifier: ZMQStream for broadcasting engine registration changes (PUB)
313 db: connection to db for out of memory logging of commands
313 db: connection to db for out of memory logging of commands
314 NotImplemented
314 NotImplemented
315 engine_info: dict of zmq connection information for engines to connect
315 engine_info: dict of zmq connection information for engines to connect
316 to the queues.
316 to the queues.
317 client_info: dict of zmq connection information for engines to connect
317 client_info: dict of zmq connection information for engines to connect
318 to the queues.
318 to the queues.
319 """
319 """
320 # internal data structures:
320 # internal data structures:
321 ids=Set() # engine IDs
321 ids=Set() # engine IDs
322 keytable=Dict()
322 keytable=Dict()
323 by_ident=Dict()
323 by_ident=Dict()
324 engines=Dict()
324 engines=Dict()
325 clients=Dict()
325 clients=Dict()
326 hearts=Dict()
326 hearts=Dict()
327 pending=Set()
327 pending=Set()
328 queues=Dict() # pending msg_ids keyed by engine_id
328 queues=Dict() # pending msg_ids keyed by engine_id
329 tasks=Dict() # pending msg_ids submitted as tasks, keyed by client_id
329 tasks=Dict() # pending msg_ids submitted as tasks, keyed by client_id
330 completed=Dict() # completed msg_ids keyed by engine_id
330 completed=Dict() # completed msg_ids keyed by engine_id
331 all_completed=Set() # completed msg_ids keyed by engine_id
331 all_completed=Set() # completed msg_ids keyed by engine_id
332 dead_engines=Set() # completed msg_ids keyed by engine_id
332 dead_engines=Set() # completed msg_ids keyed by engine_id
333 unassigned=Set() # set of task msg_ds not yet assigned a destination
333 unassigned=Set() # set of task msg_ds not yet assigned a destination
334 incoming_registrations=Dict()
334 incoming_registrations=Dict()
335 registration_timeout=Int()
335 registration_timeout=Int()
336 _idcounter=Int(0)
336 _idcounter=Int(0)
337
337
338 # objects from constructor:
338 # objects from constructor:
339 query=Instance(ZMQStream)
339 query=Instance(ZMQStream)
340 monitor=Instance(ZMQStream)
340 monitor=Instance(ZMQStream)
341 notifier=Instance(ZMQStream)
341 notifier=Instance(ZMQStream)
342 resubmit=Instance(ZMQStream)
342 resubmit=Instance(ZMQStream)
343 heartmonitor=Instance(HeartMonitor)
343 heartmonitor=Instance(HeartMonitor)
344 db=Instance(object)
344 db=Instance(object)
345 client_info=Dict()
345 client_info=Dict()
346 engine_info=Dict()
346 engine_info=Dict()
347
347
348
348
349 def __init__(self, **kwargs):
349 def __init__(self, **kwargs):
350 """
350 """
351 # universal:
351 # universal:
352 loop: IOLoop for creating future connections
352 loop: IOLoop for creating future connections
353 session: streamsession for sending serialized data
353 session: streamsession for sending serialized data
354 # engine:
354 # engine:
355 queue: ZMQStream for monitoring queue messages
355 queue: ZMQStream for monitoring queue messages
356 query: ZMQStream for engine+client registration and client requests
356 query: ZMQStream for engine+client registration and client requests
357 heartbeat: HeartMonitor object for tracking engines
357 heartbeat: HeartMonitor object for tracking engines
358 # extra:
358 # extra:
359 db: ZMQStream for db connection (NotImplemented)
359 db: ZMQStream for db connection (NotImplemented)
360 engine_info: zmq address/protocol dict for engine connections
360 engine_info: zmq address/protocol dict for engine connections
361 client_info: zmq address/protocol dict for client connections
361 client_info: zmq address/protocol dict for client connections
362 """
362 """
363
363
364 super(Hub, self).__init__(**kwargs)
364 super(Hub, self).__init__(**kwargs)
365 self.registration_timeout = max(5000, 2*self.heartmonitor.period)
365 self.registration_timeout = max(5000, 2*self.heartmonitor.period)
366
366
367 # validate connection dicts:
367 # validate connection dicts:
368 for k,v in self.client_info.iteritems():
368 for k,v in self.client_info.iteritems():
369 if k == 'task':
369 if k == 'task':
370 util.validate_url_container(v[1])
370 util.validate_url_container(v[1])
371 else:
371 else:
372 util.validate_url_container(v)
372 util.validate_url_container(v)
373 # util.validate_url_container(self.client_info)
373 # util.validate_url_container(self.client_info)
374 util.validate_url_container(self.engine_info)
374 util.validate_url_container(self.engine_info)
375
375
376 # register our callbacks
376 # register our callbacks
377 self.query.on_recv(self.dispatch_query)
377 self.query.on_recv(self.dispatch_query)
378 self.monitor.on_recv(self.dispatch_monitor_traffic)
378 self.monitor.on_recv(self.dispatch_monitor_traffic)
379
379
380 self.heartmonitor.add_heart_failure_handler(self.handle_heart_failure)
380 self.heartmonitor.add_heart_failure_handler(self.handle_heart_failure)
381 self.heartmonitor.add_new_heart_handler(self.handle_new_heart)
381 self.heartmonitor.add_new_heart_handler(self.handle_new_heart)
382
382
383 self.monitor_handlers = {b'in' : self.save_queue_request,
383 self.monitor_handlers = {b'in' : self.save_queue_request,
384 b'out': self.save_queue_result,
384 b'out': self.save_queue_result,
385 b'intask': self.save_task_request,
385 b'intask': self.save_task_request,
386 b'outtask': self.save_task_result,
386 b'outtask': self.save_task_result,
387 b'tracktask': self.save_task_destination,
387 b'tracktask': self.save_task_destination,
388 b'incontrol': _passer,
388 b'incontrol': _passer,
389 b'outcontrol': _passer,
389 b'outcontrol': _passer,
390 b'iopub': self.save_iopub_message,
390 b'iopub': self.save_iopub_message,
391 }
391 }
392
392
393 self.query_handlers = {'queue_request': self.queue_status,
393 self.query_handlers = {'queue_request': self.queue_status,
394 'result_request': self.get_results,
394 'result_request': self.get_results,
395 'history_request': self.get_history,
395 'history_request': self.get_history,
396 'db_request': self.db_query,
396 'db_request': self.db_query,
397 'purge_request': self.purge_results,
397 'purge_request': self.purge_results,
398 'load_request': self.check_load,
398 'load_request': self.check_load,
399 'resubmit_request': self.resubmit_task,
399 'resubmit_request': self.resubmit_task,
400 'shutdown_request': self.shutdown_request,
400 'shutdown_request': self.shutdown_request,
401 'registration_request' : self.register_engine,
401 'registration_request' : self.register_engine,
402 'unregistration_request' : self.unregister_engine,
402 'unregistration_request' : self.unregister_engine,
403 'connection_request': self.connection_request,
403 'connection_request': self.connection_request,
404 }
404 }
405
405
406 # ignore resubmit replies
406 # ignore resubmit replies
407 self.resubmit.on_recv(lambda msg: None, copy=False)
407 self.resubmit.on_recv(lambda msg: None, copy=False)
408
408
409 self.log.info("hub::created hub")
409 self.log.info("hub::created hub")
410
410
411 @property
411 @property
412 def _next_id(self):
412 def _next_id(self):
413 """gemerate a new ID.
413 """gemerate a new ID.
414
414
415 No longer reuse old ids, just count from 0."""
415 No longer reuse old ids, just count from 0."""
416 newid = self._idcounter
416 newid = self._idcounter
417 self._idcounter += 1
417 self._idcounter += 1
418 return newid
418 return newid
419 # newid = 0
419 # newid = 0
420 # incoming = [id[0] for id in self.incoming_registrations.itervalues()]
420 # incoming = [id[0] for id in self.incoming_registrations.itervalues()]
421 # # print newid, self.ids, self.incoming_registrations
421 # # print newid, self.ids, self.incoming_registrations
422 # while newid in self.ids or newid in incoming:
422 # while newid in self.ids or newid in incoming:
423 # newid += 1
423 # newid += 1
424 # return newid
424 # return newid
425
425
426 #-----------------------------------------------------------------------------
426 #-----------------------------------------------------------------------------
427 # message validation
427 # message validation
428 #-----------------------------------------------------------------------------
428 #-----------------------------------------------------------------------------
429
429
430 def _validate_targets(self, targets):
430 def _validate_targets(self, targets):
431 """turn any valid targets argument into a list of integer ids"""
431 """turn any valid targets argument into a list of integer ids"""
432 if targets is None:
432 if targets is None:
433 # default to all
433 # default to all
434 targets = self.ids
434 targets = self.ids
435
435
436 if isinstance(targets, (int,str,unicode)):
436 if isinstance(targets, (int,str,unicode)):
437 # only one target specified
437 # only one target specified
438 targets = [targets]
438 targets = [targets]
439 _targets = []
439 _targets = []
440 for t in targets:
440 for t in targets:
441 # map raw identities to ids
441 # map raw identities to ids
442 if isinstance(t, (str,unicode)):
442 if isinstance(t, (str,unicode)):
443 t = self.by_ident.get(t, t)
443 t = self.by_ident.get(t, t)
444 _targets.append(t)
444 _targets.append(t)
445 targets = _targets
445 targets = _targets
446 bad_targets = [ t for t in targets if t not in self.ids ]
446 bad_targets = [ t for t in targets if t not in self.ids ]
447 if bad_targets:
447 if bad_targets:
448 raise IndexError("No Such Engine: %r"%bad_targets)
448 raise IndexError("No Such Engine: %r"%bad_targets)
449 if not targets:
449 if not targets:
450 raise IndexError("No Engines Registered")
450 raise IndexError("No Engines Registered")
451 return targets
451 return targets
452
452
453 #-----------------------------------------------------------------------------
453 #-----------------------------------------------------------------------------
454 # dispatch methods (1 per stream)
454 # dispatch methods (1 per stream)
455 #-----------------------------------------------------------------------------
455 #-----------------------------------------------------------------------------
456
456
457
457
458 def dispatch_monitor_traffic(self, msg):
458 def dispatch_monitor_traffic(self, msg):
459 """all ME and Task queue messages come through here, as well as
459 """all ME and Task queue messages come through here, as well as
460 IOPub traffic."""
460 IOPub traffic."""
461 self.log.debug("monitor traffic: %r"%msg[:2])
461 self.log.debug("monitor traffic: %r"%msg[:2])
462 switch = msg[0]
462 switch = msg[0]
463 try:
463 try:
464 idents, msg = self.session.feed_identities(msg[1:])
464 idents, msg = self.session.feed_identities(msg[1:])
465 except ValueError:
465 except ValueError:
466 idents=[]
466 idents=[]
467 if not idents:
467 if not idents:
468 self.log.error("Bad Monitor Message: %r"%msg)
468 self.log.error("Bad Monitor Message: %r"%msg)
469 return
469 return
470 handler = self.monitor_handlers.get(switch, None)
470 handler = self.monitor_handlers.get(switch, None)
471 if handler is not None:
471 if handler is not None:
472 handler(idents, msg)
472 handler(idents, msg)
473 else:
473 else:
474 self.log.error("Invalid monitor topic: %r"%switch)
474 self.log.error("Invalid monitor topic: %r"%switch)
475
475
476
476
477 def dispatch_query(self, msg):
477 def dispatch_query(self, msg):
478 """Route registration requests and queries from clients."""
478 """Route registration requests and queries from clients."""
479 try:
479 try:
480 idents, msg = self.session.feed_identities(msg)
480 idents, msg = self.session.feed_identities(msg)
481 except ValueError:
481 except ValueError:
482 idents = []
482 idents = []
483 if not idents:
483 if not idents:
484 self.log.error("Bad Query Message: %r"%msg)
484 self.log.error("Bad Query Message: %r"%msg)
485 return
485 return
486 client_id = idents[0]
486 client_id = idents[0]
487 try:
487 try:
488 msg = self.session.unpack_message(msg, content=True)
488 msg = self.session.unpack_message(msg, content=True)
489 except Exception:
489 except Exception:
490 content = error.wrap_exception()
490 content = error.wrap_exception()
491 self.log.error("Bad Query Message: %r"%msg, exc_info=True)
491 self.log.error("Bad Query Message: %r"%msg, exc_info=True)
492 self.session.send(self.query, "hub_error", ident=client_id,
492 self.session.send(self.query, "hub_error", ident=client_id,
493 content=content)
493 content=content)
494 return
494 return
495 # print client_id, header, parent, content
495 # print client_id, header, parent, content
496 #switch on message type:
496 #switch on message type:
497 msg_type = msg['msg_type']
497 msg_type = msg['msg_type']
498 self.log.info("client::client %r requested %r"%(client_id, msg_type))
498 self.log.info("client::client %r requested %r"%(client_id, msg_type))
499 handler = self.query_handlers.get(msg_type, None)
499 handler = self.query_handlers.get(msg_type, None)
500 try:
500 try:
501 assert handler is not None, "Bad Message Type: %r"%msg_type
501 assert handler is not None, "Bad Message Type: %r"%msg_type
502 except:
502 except:
503 content = error.wrap_exception()
503 content = error.wrap_exception()
504 self.log.error("Bad Message Type: %r"%msg_type, exc_info=True)
504 self.log.error("Bad Message Type: %r"%msg_type, exc_info=True)
505 self.session.send(self.query, "hub_error", ident=client_id,
505 self.session.send(self.query, "hub_error", ident=client_id,
506 content=content)
506 content=content)
507 return
507 return
508
508
509 else:
509 else:
510 handler(idents, msg)
510 handler(idents, msg)
511
511
512 def dispatch_db(self, msg):
512 def dispatch_db(self, msg):
513 """"""
513 """"""
514 raise NotImplementedError
514 raise NotImplementedError
515
515
516 #---------------------------------------------------------------------------
516 #---------------------------------------------------------------------------
517 # handler methods (1 per event)
517 # handler methods (1 per event)
518 #---------------------------------------------------------------------------
518 #---------------------------------------------------------------------------
519
519
520 #----------------------- Heartbeat --------------------------------------
520 #----------------------- Heartbeat --------------------------------------
521
521
522 def handle_new_heart(self, heart):
522 def handle_new_heart(self, heart):
523 """handler to attach to heartbeater.
523 """handler to attach to heartbeater.
524 Called when a new heart starts to beat.
524 Called when a new heart starts to beat.
525 Triggers completion of registration."""
525 Triggers completion of registration."""
526 self.log.debug("heartbeat::handle_new_heart(%r)"%heart)
526 self.log.debug("heartbeat::handle_new_heart(%r)"%heart)
527 if heart not in self.incoming_registrations:
527 if heart not in self.incoming_registrations:
528 self.log.info("heartbeat::ignoring new heart: %r"%heart)
528 self.log.info("heartbeat::ignoring new heart: %r"%heart)
529 else:
529 else:
530 self.finish_registration(heart)
530 self.finish_registration(heart)
531
531
532
532
533 def handle_heart_failure(self, heart):
533 def handle_heart_failure(self, heart):
534 """handler to attach to heartbeater.
534 """handler to attach to heartbeater.
535 called when a previously registered heart fails to respond to beat request.
535 called when a previously registered heart fails to respond to beat request.
536 triggers unregistration"""
536 triggers unregistration"""
537 self.log.debug("heartbeat::handle_heart_failure(%r)"%heart)
537 self.log.debug("heartbeat::handle_heart_failure(%r)"%heart)
538 eid = self.hearts.get(heart, None)
538 eid = self.hearts.get(heart, None)
539 queue = self.engines[eid].queue
539 queue = self.engines[eid].queue
540 if eid is None:
540 if eid is None:
541 self.log.info("heartbeat::ignoring heart failure %r"%heart)
541 self.log.info("heartbeat::ignoring heart failure %r"%heart)
542 else:
542 else:
543 self.unregister_engine(heart, dict(content=dict(id=eid, queue=queue)))
543 self.unregister_engine(heart, dict(content=dict(id=eid, queue=queue)))
544
544
545 #----------------------- MUX Queue Traffic ------------------------------
545 #----------------------- MUX Queue Traffic ------------------------------
546
546
547 def save_queue_request(self, idents, msg):
547 def save_queue_request(self, idents, msg):
548 if len(idents) < 2:
548 if len(idents) < 2:
549 self.log.error("invalid identity prefix: %r"%idents)
549 self.log.error("invalid identity prefix: %r"%idents)
550 return
550 return
551 queue_id, client_id = idents[:2]
551 queue_id, client_id = idents[:2]
552 try:
552 try:
553 msg = self.session.unpack_message(msg)
553 msg = self.session.unpack_message(msg)
554 except Exception:
554 except Exception:
555 self.log.error("queue::client %r sent invalid message to %r: %r"%(client_id, queue_id, msg), exc_info=True)
555 self.log.error("queue::client %r sent invalid message to %r: %r"%(client_id, queue_id, msg), exc_info=True)
556 return
556 return
557
557
558 eid = self.by_ident.get(queue_id, None)
558 eid = self.by_ident.get(queue_id, None)
559 if eid is None:
559 if eid is None:
560 self.log.error("queue::target %r not registered"%queue_id)
560 self.log.error("queue::target %r not registered"%queue_id)
561 self.log.debug("queue:: valid are: %r"%(self.by_ident.keys()))
561 self.log.debug("queue:: valid are: %r"%(self.by_ident.keys()))
562 return
562 return
563 record = init_record(msg)
563 record = init_record(msg)
564 msg_id = record['msg_id']
564 msg_id = record['msg_id']
565 # Unicode in records
565 # Unicode in records
566 record['engine_uuid'] = queue_id.decode('utf8', 'replace')
566 record['engine_uuid'] = queue_id.decode('ascii')
567 record['client_uuid'] = client_id.decode('utf8', 'replace')
567 record['client_uuid'] = client_id.decode('ascii')
568 record['queue'] = 'mux'
568 record['queue'] = 'mux'
569
569
570 try:
570 try:
571 # it's posible iopub arrived first:
571 # it's posible iopub arrived first:
572 existing = self.db.get_record(msg_id)
572 existing = self.db.get_record(msg_id)
573 for key,evalue in existing.iteritems():
573 for key,evalue in existing.iteritems():
574 rvalue = record.get(key, None)
574 rvalue = record.get(key, None)
575 if evalue and rvalue and evalue != rvalue:
575 if evalue and rvalue and evalue != rvalue:
576 self.log.warn("conflicting initial state for record: %r:%r <%r> %r"%(msg_id, rvalue, key, evalue))
576 self.log.warn("conflicting initial state for record: %r:%r <%r> %r"%(msg_id, rvalue, key, evalue))
577 elif evalue and not rvalue:
577 elif evalue and not rvalue:
578 record[key] = evalue
578 record[key] = evalue
579 try:
579 try:
580 self.db.update_record(msg_id, record)
580 self.db.update_record(msg_id, record)
581 except Exception:
581 except Exception:
582 self.log.error("DB Error updating record %r"%msg_id, exc_info=True)
582 self.log.error("DB Error updating record %r"%msg_id, exc_info=True)
583 except KeyError:
583 except KeyError:
584 try:
584 try:
585 self.db.add_record(msg_id, record)
585 self.db.add_record(msg_id, record)
586 except Exception:
586 except Exception:
587 self.log.error("DB Error adding record %r"%msg_id, exc_info=True)
587 self.log.error("DB Error adding record %r"%msg_id, exc_info=True)
588
588
589
589
590 self.pending.add(msg_id)
590 self.pending.add(msg_id)
591 self.queues[eid].append(msg_id)
591 self.queues[eid].append(msg_id)
592
592
593 def save_queue_result(self, idents, msg):
593 def save_queue_result(self, idents, msg):
594 if len(idents) < 2:
594 if len(idents) < 2:
595 self.log.error("invalid identity prefix: %r"%idents)
595 self.log.error("invalid identity prefix: %r"%idents)
596 return
596 return
597
597
598 client_id, queue_id = idents[:2]
598 client_id, queue_id = idents[:2]
599 try:
599 try:
600 msg = self.session.unpack_message(msg)
600 msg = self.session.unpack_message(msg)
601 except Exception:
601 except Exception:
602 self.log.error("queue::engine %r sent invalid message to %r: %r"%(
602 self.log.error("queue::engine %r sent invalid message to %r: %r"%(
603 queue_id,client_id, msg), exc_info=True)
603 queue_id,client_id, msg), exc_info=True)
604 return
604 return
605
605
606 eid = self.by_ident.get(queue_id, None)
606 eid = self.by_ident.get(queue_id, None)
607 if eid is None:
607 if eid is None:
608 self.log.error("queue::unknown engine %r is sending a reply: "%queue_id)
608 self.log.error("queue::unknown engine %r is sending a reply: "%queue_id)
609 return
609 return
610
610
611 parent = msg['parent_header']
611 parent = msg['parent_header']
612 if not parent:
612 if not parent:
613 return
613 return
614 msg_id = parent['msg_id']
614 msg_id = parent['msg_id']
615 if msg_id in self.pending:
615 if msg_id in self.pending:
616 self.pending.remove(msg_id)
616 self.pending.remove(msg_id)
617 self.all_completed.add(msg_id)
617 self.all_completed.add(msg_id)
618 self.queues[eid].remove(msg_id)
618 self.queues[eid].remove(msg_id)
619 self.completed[eid].append(msg_id)
619 self.completed[eid].append(msg_id)
620 elif msg_id not in self.all_completed:
620 elif msg_id not in self.all_completed:
621 # it could be a result from a dead engine that died before delivering the
621 # it could be a result from a dead engine that died before delivering the
622 # result
622 # result
623 self.log.warn("queue:: unknown msg finished %r"%msg_id)
623 self.log.warn("queue:: unknown msg finished %r"%msg_id)
624 return
624 return
625 # update record anyway, because the unregistration could have been premature
625 # update record anyway, because the unregistration could have been premature
626 rheader = msg['header']
626 rheader = msg['header']
627 completed = rheader['date']
627 completed = rheader['date']
628 started = rheader.get('started', None)
628 started = rheader.get('started', None)
629 result = {
629 result = {
630 'result_header' : rheader,
630 'result_header' : rheader,
631 'result_content': msg['content'],
631 'result_content': msg['content'],
632 'started' : started,
632 'started' : started,
633 'completed' : completed
633 'completed' : completed
634 }
634 }
635
635
636 result['result_buffers'] = msg['buffers']
636 result['result_buffers'] = msg['buffers']
637 try:
637 try:
638 self.db.update_record(msg_id, result)
638 self.db.update_record(msg_id, result)
639 except Exception:
639 except Exception:
640 self.log.error("DB Error updating record %r"%msg_id, exc_info=True)
640 self.log.error("DB Error updating record %r"%msg_id, exc_info=True)
641
641
642
642
643 #--------------------- Task Queue Traffic ------------------------------
643 #--------------------- Task Queue Traffic ------------------------------
644
644
645 def save_task_request(self, idents, msg):
645 def save_task_request(self, idents, msg):
646 """Save the submission of a task."""
646 """Save the submission of a task."""
647 client_id = idents[0]
647 client_id = idents[0]
648
648
649 try:
649 try:
650 msg = self.session.unpack_message(msg)
650 msg = self.session.unpack_message(msg)
651 except Exception:
651 except Exception:
652 self.log.error("task::client %r sent invalid task message: %r"%(
652 self.log.error("task::client %r sent invalid task message: %r"%(
653 client_id, msg), exc_info=True)
653 client_id, msg), exc_info=True)
654 return
654 return
655 record = init_record(msg)
655 record = init_record(msg)
656
656
657 record['client_uuid'] = client_id
657 record['client_uuid'] = client_id
658 record['queue'] = 'task'
658 record['queue'] = 'task'
659 header = msg['header']
659 header = msg['header']
660 msg_id = header['msg_id']
660 msg_id = header['msg_id']
661 self.pending.add(msg_id)
661 self.pending.add(msg_id)
662 self.unassigned.add(msg_id)
662 self.unassigned.add(msg_id)
663 try:
663 try:
664 # it's posible iopub arrived first:
664 # it's posible iopub arrived first:
665 existing = self.db.get_record(msg_id)
665 existing = self.db.get_record(msg_id)
666 if existing['resubmitted']:
666 if existing['resubmitted']:
667 for key in ('submitted', 'client_uuid', 'buffers'):
667 for key in ('submitted', 'client_uuid', 'buffers'):
668 # don't clobber these keys on resubmit
668 # don't clobber these keys on resubmit
669 # submitted and client_uuid should be different
669 # submitted and client_uuid should be different
670 # and buffers might be big, and shouldn't have changed
670 # and buffers might be big, and shouldn't have changed
671 record.pop(key)
671 record.pop(key)
672 # still check content,header which should not change
672 # still check content,header which should not change
673 # but are not expensive to compare as buffers
673 # but are not expensive to compare as buffers
674
674
675 for key,evalue in existing.iteritems():
675 for key,evalue in existing.iteritems():
676 if key.endswith('buffers'):
676 if key.endswith('buffers'):
677 # don't compare buffers
677 # don't compare buffers
678 continue
678 continue
679 rvalue = record.get(key, None)
679 rvalue = record.get(key, None)
680 if evalue and rvalue and evalue != rvalue:
680 if evalue and rvalue and evalue != rvalue:
681 self.log.warn("conflicting initial state for record: %r:%r <%r> %r"%(msg_id, rvalue, key, evalue))
681 self.log.warn("conflicting initial state for record: %r:%r <%r> %r"%(msg_id, rvalue, key, evalue))
682 elif evalue and not rvalue:
682 elif evalue and not rvalue:
683 record[key] = evalue
683 record[key] = evalue
684 try:
684 try:
685 self.db.update_record(msg_id, record)
685 self.db.update_record(msg_id, record)
686 except Exception:
686 except Exception:
687 self.log.error("DB Error updating record %r"%msg_id, exc_info=True)
687 self.log.error("DB Error updating record %r"%msg_id, exc_info=True)
688 except KeyError:
688 except KeyError:
689 try:
689 try:
690 self.db.add_record(msg_id, record)
690 self.db.add_record(msg_id, record)
691 except Exception:
691 except Exception:
692 self.log.error("DB Error adding record %r"%msg_id, exc_info=True)
692 self.log.error("DB Error adding record %r"%msg_id, exc_info=True)
693 except Exception:
693 except Exception:
694 self.log.error("DB Error saving task request %r"%msg_id, exc_info=True)
694 self.log.error("DB Error saving task request %r"%msg_id, exc_info=True)
695
695
696 def save_task_result(self, idents, msg):
696 def save_task_result(self, idents, msg):
697 """save the result of a completed task."""
697 """save the result of a completed task."""
698 client_id = idents[0]
698 client_id = idents[0]
699 try:
699 try:
700 msg = self.session.unpack_message(msg)
700 msg = self.session.unpack_message(msg)
701 except Exception:
701 except Exception:
702 self.log.error("task::invalid task result message send to %r: %r"%(
702 self.log.error("task::invalid task result message send to %r: %r"%(
703 client_id, msg), exc_info=True)
703 client_id, msg), exc_info=True)
704 return
704 return
705
705
706 parent = msg['parent_header']
706 parent = msg['parent_header']
707 if not parent:
707 if not parent:
708 # print msg
708 # print msg
709 self.log.warn("Task %r had no parent!"%msg)
709 self.log.warn("Task %r had no parent!"%msg)
710 return
710 return
711 msg_id = parent['msg_id']
711 msg_id = parent['msg_id']
712 if msg_id in self.unassigned:
712 if msg_id in self.unassigned:
713 self.unassigned.remove(msg_id)
713 self.unassigned.remove(msg_id)
714
714
715 header = msg['header']
715 header = msg['header']
716 engine_uuid = header.get('engine', None)
716 engine_uuid = header.get('engine', None)
717 eid = self.by_ident.get(engine_uuid, None)
717 eid = self.by_ident.get(engine_uuid, None)
718
718
719 if msg_id in self.pending:
719 if msg_id in self.pending:
720 self.pending.remove(msg_id)
720 self.pending.remove(msg_id)
721 self.all_completed.add(msg_id)
721 self.all_completed.add(msg_id)
722 if eid is not None:
722 if eid is not None:
723 self.completed[eid].append(msg_id)
723 self.completed[eid].append(msg_id)
724 if msg_id in self.tasks[eid]:
724 if msg_id in self.tasks[eid]:
725 self.tasks[eid].remove(msg_id)
725 self.tasks[eid].remove(msg_id)
726 completed = header['date']
726 completed = header['date']
727 started = header.get('started', None)
727 started = header.get('started', None)
728 result = {
728 result = {
729 'result_header' : header,
729 'result_header' : header,
730 'result_content': msg['content'],
730 'result_content': msg['content'],
731 'started' : started,
731 'started' : started,
732 'completed' : completed,
732 'completed' : completed,
733 'engine_uuid': engine_uuid
733 'engine_uuid': engine_uuid
734 }
734 }
735
735
736 result['result_buffers'] = msg['buffers']
736 result['result_buffers'] = msg['buffers']
737 try:
737 try:
738 self.db.update_record(msg_id, result)
738 self.db.update_record(msg_id, result)
739 except Exception:
739 except Exception:
740 self.log.error("DB Error saving task request %r"%msg_id, exc_info=True)
740 self.log.error("DB Error saving task request %r"%msg_id, exc_info=True)
741
741
742 else:
742 else:
743 self.log.debug("task::unknown task %r finished"%msg_id)
743 self.log.debug("task::unknown task %r finished"%msg_id)
744
744
745 def save_task_destination(self, idents, msg):
745 def save_task_destination(self, idents, msg):
746 try:
746 try:
747 msg = self.session.unpack_message(msg, content=True)
747 msg = self.session.unpack_message(msg, content=True)
748 except Exception:
748 except Exception:
749 self.log.error("task::invalid task tracking message", exc_info=True)
749 self.log.error("task::invalid task tracking message", exc_info=True)
750 return
750 return
751 content = msg['content']
751 content = msg['content']
752 # print (content)
752 # print (content)
753 msg_id = content['msg_id']
753 msg_id = content['msg_id']
754 engine_uuid = content['engine_id']
754 engine_uuid = content['engine_id']
755 eid = self.by_ident[util.ensure_bytes(engine_uuid)]
755 eid = self.by_ident[util.ensure_bytes(engine_uuid)]
756
756
757 self.log.info("task::task %r arrived on %r"%(msg_id, eid))
757 self.log.info("task::task %r arrived on %r"%(msg_id, eid))
758 if msg_id in self.unassigned:
758 if msg_id in self.unassigned:
759 self.unassigned.remove(msg_id)
759 self.unassigned.remove(msg_id)
760 # else:
760 # else:
761 # self.log.debug("task::task %r not listed as MIA?!"%(msg_id))
761 # self.log.debug("task::task %r not listed as MIA?!"%(msg_id))
762
762
763 self.tasks[eid].append(msg_id)
763 self.tasks[eid].append(msg_id)
764 # self.pending[msg_id][1].update(received=datetime.now(),engine=(eid,engine_uuid))
764 # self.pending[msg_id][1].update(received=datetime.now(),engine=(eid,engine_uuid))
765 try:
765 try:
766 self.db.update_record(msg_id, dict(engine_uuid=engine_uuid))
766 self.db.update_record(msg_id, dict(engine_uuid=engine_uuid))
767 except Exception:
767 except Exception:
768 self.log.error("DB Error saving task destination %r"%msg_id, exc_info=True)
768 self.log.error("DB Error saving task destination %r"%msg_id, exc_info=True)
769
769
770
770
771 def mia_task_request(self, idents, msg):
771 def mia_task_request(self, idents, msg):
772 raise NotImplementedError
772 raise NotImplementedError
773 client_id = idents[0]
773 client_id = idents[0]
774 # content = dict(mia=self.mia,status='ok')
774 # content = dict(mia=self.mia,status='ok')
775 # self.session.send('mia_reply', content=content, idents=client_id)
775 # self.session.send('mia_reply', content=content, idents=client_id)
776
776
777
777
778 #--------------------- IOPub Traffic ------------------------------
778 #--------------------- IOPub Traffic ------------------------------
779
779
780 def save_iopub_message(self, topics, msg):
780 def save_iopub_message(self, topics, msg):
781 """save an iopub message into the db"""
781 """save an iopub message into the db"""
782 # print (topics)
782 # print (topics)
783 try:
783 try:
784 msg = self.session.unpack_message(msg, content=True)
784 msg = self.session.unpack_message(msg, content=True)
785 except Exception:
785 except Exception:
786 self.log.error("iopub::invalid IOPub message", exc_info=True)
786 self.log.error("iopub::invalid IOPub message", exc_info=True)
787 return
787 return
788
788
789 parent = msg['parent_header']
789 parent = msg['parent_header']
790 if not parent:
790 if not parent:
791 self.log.error("iopub::invalid IOPub message: %r"%msg)
791 self.log.error("iopub::invalid IOPub message: %r"%msg)
792 return
792 return
793 msg_id = parent['msg_id']
793 msg_id = parent['msg_id']
794 msg_type = msg['msg_type']
794 msg_type = msg['msg_type']
795 content = msg['content']
795 content = msg['content']
796
796
797 # ensure msg_id is in db
797 # ensure msg_id is in db
798 try:
798 try:
799 rec = self.db.get_record(msg_id)
799 rec = self.db.get_record(msg_id)
800 except KeyError:
800 except KeyError:
801 rec = empty_record()
801 rec = empty_record()
802 rec['msg_id'] = msg_id
802 rec['msg_id'] = msg_id
803 self.db.add_record(msg_id, rec)
803 self.db.add_record(msg_id, rec)
804 # stream
804 # stream
805 d = {}
805 d = {}
806 if msg_type == 'stream':
806 if msg_type == 'stream':
807 name = content['name']
807 name = content['name']
808 s = rec[name] or ''
808 s = rec[name] or ''
809 d[name] = s + content['data']
809 d[name] = s + content['data']
810
810
811 elif msg_type == 'pyerr':
811 elif msg_type == 'pyerr':
812 d['pyerr'] = content
812 d['pyerr'] = content
813 elif msg_type == 'pyin':
813 elif msg_type == 'pyin':
814 d['pyin'] = content['code']
814 d['pyin'] = content['code']
815 else:
815 else:
816 d[msg_type] = content.get('data', '')
816 d[msg_type] = content.get('data', '')
817
817
818 try:
818 try:
819 self.db.update_record(msg_id, d)
819 self.db.update_record(msg_id, d)
820 except Exception:
820 except Exception:
821 self.log.error("DB Error saving iopub message %r"%msg_id, exc_info=True)
821 self.log.error("DB Error saving iopub message %r"%msg_id, exc_info=True)
822
822
823
823
824
824
825 #-------------------------------------------------------------------------
825 #-------------------------------------------------------------------------
826 # Registration requests
826 # Registration requests
827 #-------------------------------------------------------------------------
827 #-------------------------------------------------------------------------
828
828
829 def connection_request(self, client_id, msg):
829 def connection_request(self, client_id, msg):
830 """Reply with connection addresses for clients."""
830 """Reply with connection addresses for clients."""
831 self.log.info("client::client %r connected"%client_id)
831 self.log.info("client::client %r connected"%client_id)
832 content = dict(status='ok')
832 content = dict(status='ok')
833 content.update(self.client_info)
833 content.update(self.client_info)
834 jsonable = {}
834 jsonable = {}
835 for k,v in self.keytable.iteritems():
835 for k,v in self.keytable.iteritems():
836 if v not in self.dead_engines:
836 if v not in self.dead_engines:
837 jsonable[str(k)] = v.decode()
837 jsonable[str(k)] = v.decode('ascii')
838 content['engines'] = jsonable
838 content['engines'] = jsonable
839 self.session.send(self.query, 'connection_reply', content, parent=msg, ident=client_id)
839 self.session.send(self.query, 'connection_reply', content, parent=msg, ident=client_id)
840
840
841 def register_engine(self, reg, msg):
841 def register_engine(self, reg, msg):
842 """Register a new engine."""
842 """Register a new engine."""
843 content = msg['content']
843 content = msg['content']
844 try:
844 try:
845 queue = util.ensure_bytes(content['queue'])
845 queue = util.ensure_bytes(content['queue'])
846 except KeyError:
846 except KeyError:
847 self.log.error("registration::queue not specified", exc_info=True)
847 self.log.error("registration::queue not specified", exc_info=True)
848 return
848 return
849 heart = content.get('heartbeat', None)
849 heart = content.get('heartbeat', None)
850 if heart:
850 if heart:
851 heart = util.ensure_bytes(heart)
851 heart = util.ensure_bytes(heart)
852 """register a new engine, and create the socket(s) necessary"""
852 """register a new engine, and create the socket(s) necessary"""
853 eid = self._next_id
853 eid = self._next_id
854 # print (eid, queue, reg, heart)
854 # print (eid, queue, reg, heart)
855
855
856 self.log.debug("registration::register_engine(%i, %r, %r, %r)"%(eid, queue, reg, heart))
856 self.log.debug("registration::register_engine(%i, %r, %r, %r)"%(eid, queue, reg, heart))
857
857
858 content = dict(id=eid,status='ok')
858 content = dict(id=eid,status='ok')
859 content.update(self.engine_info)
859 content.update(self.engine_info)
860 # check if requesting available IDs:
860 # check if requesting available IDs:
861 if queue in self.by_ident:
861 if queue in self.by_ident:
862 try:
862 try:
863 raise KeyError("queue_id %r in use"%queue)
863 raise KeyError("queue_id %r in use"%queue)
864 except:
864 except:
865 content = error.wrap_exception()
865 content = error.wrap_exception()
866 self.log.error("queue_id %r in use"%queue, exc_info=True)
866 self.log.error("queue_id %r in use"%queue, exc_info=True)
867 elif heart in self.hearts: # need to check unique hearts?
867 elif heart in self.hearts: # need to check unique hearts?
868 try:
868 try:
869 raise KeyError("heart_id %r in use"%heart)
869 raise KeyError("heart_id %r in use"%heart)
870 except:
870 except:
871 self.log.error("heart_id %r in use"%heart, exc_info=True)
871 self.log.error("heart_id %r in use"%heart, exc_info=True)
872 content = error.wrap_exception()
872 content = error.wrap_exception()
873 else:
873 else:
874 for h, pack in self.incoming_registrations.iteritems():
874 for h, pack in self.incoming_registrations.iteritems():
875 if heart == h:
875 if heart == h:
876 try:
876 try:
877 raise KeyError("heart_id %r in use"%heart)
877 raise KeyError("heart_id %r in use"%heart)
878 except:
878 except:
879 self.log.error("heart_id %r in use"%heart, exc_info=True)
879 self.log.error("heart_id %r in use"%heart, exc_info=True)
880 content = error.wrap_exception()
880 content = error.wrap_exception()
881 break
881 break
882 elif queue == pack[1]:
882 elif queue == pack[1]:
883 try:
883 try:
884 raise KeyError("queue_id %r in use"%queue)
884 raise KeyError("queue_id %r in use"%queue)
885 except:
885 except:
886 self.log.error("queue_id %r in use"%queue, exc_info=True)
886 self.log.error("queue_id %r in use"%queue, exc_info=True)
887 content = error.wrap_exception()
887 content = error.wrap_exception()
888 break
888 break
889
889
890 msg = self.session.send(self.query, "registration_reply",
890 msg = self.session.send(self.query, "registration_reply",
891 content=content,
891 content=content,
892 ident=reg)
892 ident=reg)
893
893
894 if content['status'] == 'ok':
894 if content['status'] == 'ok':
895 if heart in self.heartmonitor.hearts:
895 if heart in self.heartmonitor.hearts:
896 # already beating
896 # already beating
897 self.incoming_registrations[heart] = (eid,queue,reg[0],None)
897 self.incoming_registrations[heart] = (eid,queue,reg[0],None)
898 self.finish_registration(heart)
898 self.finish_registration(heart)
899 else:
899 else:
900 purge = lambda : self._purge_stalled_registration(heart)
900 purge = lambda : self._purge_stalled_registration(heart)
901 dc = ioloop.DelayedCallback(purge, self.registration_timeout, self.loop)
901 dc = ioloop.DelayedCallback(purge, self.registration_timeout, self.loop)
902 dc.start()
902 dc.start()
903 self.incoming_registrations[heart] = (eid,queue,reg[0],dc)
903 self.incoming_registrations[heart] = (eid,queue,reg[0],dc)
904 else:
904 else:
905 self.log.error("registration::registration %i failed: %r"%(eid, content['evalue']))
905 self.log.error("registration::registration %i failed: %r"%(eid, content['evalue']))
906 return eid
906 return eid
907
907
908 def unregister_engine(self, ident, msg):
908 def unregister_engine(self, ident, msg):
909 """Unregister an engine that explicitly requested to leave."""
909 """Unregister an engine that explicitly requested to leave."""
910 try:
910 try:
911 eid = msg['content']['id']
911 eid = msg['content']['id']
912 except:
912 except:
913 self.log.error("registration::bad engine id for unregistration: %r"%ident, exc_info=True)
913 self.log.error("registration::bad engine id for unregistration: %r"%ident, exc_info=True)
914 return
914 return
915 self.log.info("registration::unregister_engine(%r)"%eid)
915 self.log.info("registration::unregister_engine(%r)"%eid)
916 # print (eid)
916 # print (eid)
917 uuid = self.keytable[eid]
917 uuid = self.keytable[eid]
918 content=dict(id=eid, queue=uuid.decode())
918 content=dict(id=eid, queue=uuid.decode())
919 self.dead_engines.add(uuid)
919 self.dead_engines.add(uuid)
920 # self.ids.remove(eid)
920 # self.ids.remove(eid)
921 # uuid = self.keytable.pop(eid)
921 # uuid = self.keytable.pop(eid)
922 #
922 #
923 # ec = self.engines.pop(eid)
923 # ec = self.engines.pop(eid)
924 # self.hearts.pop(ec.heartbeat)
924 # self.hearts.pop(ec.heartbeat)
925 # self.by_ident.pop(ec.queue)
925 # self.by_ident.pop(ec.queue)
926 # self.completed.pop(eid)
926 # self.completed.pop(eid)
927 handleit = lambda : self._handle_stranded_msgs(eid, uuid)
927 handleit = lambda : self._handle_stranded_msgs(eid, uuid)
928 dc = ioloop.DelayedCallback(handleit, self.registration_timeout, self.loop)
928 dc = ioloop.DelayedCallback(handleit, self.registration_timeout, self.loop)
929 dc.start()
929 dc.start()
930 ############## TODO: HANDLE IT ################
930 ############## TODO: HANDLE IT ################
931
931
932 if self.notifier:
932 if self.notifier:
933 self.session.send(self.notifier, "unregistration_notification", content=content)
933 self.session.send(self.notifier, "unregistration_notification", content=content)
934
934
935 def _handle_stranded_msgs(self, eid, uuid):
935 def _handle_stranded_msgs(self, eid, uuid):
936 """Handle messages known to be on an engine when the engine unregisters.
936 """Handle messages known to be on an engine when the engine unregisters.
937
937
938 It is possible that this will fire prematurely - that is, an engine will
938 It is possible that this will fire prematurely - that is, an engine will
939 go down after completing a result, and the client will be notified
939 go down after completing a result, and the client will be notified
940 that the result failed and later receive the actual result.
940 that the result failed and later receive the actual result.
941 """
941 """
942
942
943 outstanding = self.queues[eid]
943 outstanding = self.queues[eid]
944
944
945 for msg_id in outstanding:
945 for msg_id in outstanding:
946 self.pending.remove(msg_id)
946 self.pending.remove(msg_id)
947 self.all_completed.add(msg_id)
947 self.all_completed.add(msg_id)
948 try:
948 try:
949 raise error.EngineError("Engine %r died while running task %r"%(eid, msg_id))
949 raise error.EngineError("Engine %r died while running task %r"%(eid, msg_id))
950 except:
950 except:
951 content = error.wrap_exception()
951 content = error.wrap_exception()
952 # build a fake header:
952 # build a fake header:
953 header = {}
953 header = {}
954 header['engine'] = uuid
954 header['engine'] = uuid
955 header['date'] = datetime.now()
955 header['date'] = datetime.now()
956 rec = dict(result_content=content, result_header=header, result_buffers=[])
956 rec = dict(result_content=content, result_header=header, result_buffers=[])
957 rec['completed'] = header['date']
957 rec['completed'] = header['date']
958 rec['engine_uuid'] = uuid
958 rec['engine_uuid'] = uuid
959 try:
959 try:
960 self.db.update_record(msg_id, rec)
960 self.db.update_record(msg_id, rec)
961 except Exception:
961 except Exception:
962 self.log.error("DB Error handling stranded msg %r"%msg_id, exc_info=True)
962 self.log.error("DB Error handling stranded msg %r"%msg_id, exc_info=True)
963
963
964
964
965 def finish_registration(self, heart):
965 def finish_registration(self, heart):
966 """Second half of engine registration, called after our HeartMonitor
966 """Second half of engine registration, called after our HeartMonitor
967 has received a beat from the Engine's Heart."""
967 has received a beat from the Engine's Heart."""
968 try:
968 try:
969 (eid,queue,reg,purge) = self.incoming_registrations.pop(heart)
969 (eid,queue,reg,purge) = self.incoming_registrations.pop(heart)
970 except KeyError:
970 except KeyError:
971 self.log.error("registration::tried to finish nonexistant registration", exc_info=True)
971 self.log.error("registration::tried to finish nonexistant registration", exc_info=True)
972 return
972 return
973 self.log.info("registration::finished registering engine %i:%r"%(eid,queue))
973 self.log.info("registration::finished registering engine %i:%r"%(eid,queue))
974 if purge is not None:
974 if purge is not None:
975 purge.stop()
975 purge.stop()
976 control = queue
976 control = queue
977 self.ids.add(eid)
977 self.ids.add(eid)
978 self.keytable[eid] = queue
978 self.keytable[eid] = queue
979 self.engines[eid] = EngineConnector(id=eid, queue=queue, registration=reg,
979 self.engines[eid] = EngineConnector(id=eid, queue=queue, registration=reg,
980 control=control, heartbeat=heart)
980 control=control, heartbeat=heart)
981 self.by_ident[queue] = eid
981 self.by_ident[queue] = eid
982 self.queues[eid] = list()
982 self.queues[eid] = list()
983 self.tasks[eid] = list()
983 self.tasks[eid] = list()
984 self.completed[eid] = list()
984 self.completed[eid] = list()
985 self.hearts[heart] = eid
985 self.hearts[heart] = eid
986 content = dict(id=eid, queue=self.engines[eid].queue.decode())
986 content = dict(id=eid, queue=self.engines[eid].queue.decode())
987 if self.notifier:
987 if self.notifier:
988 self.session.send(self.notifier, "registration_notification", content=content)
988 self.session.send(self.notifier, "registration_notification", content=content)
989 self.log.info("engine::Engine Connected: %i"%eid)
989 self.log.info("engine::Engine Connected: %i"%eid)
990
990
991 def _purge_stalled_registration(self, heart):
991 def _purge_stalled_registration(self, heart):
992 if heart in self.incoming_registrations:
992 if heart in self.incoming_registrations:
993 eid = self.incoming_registrations.pop(heart)[0]
993 eid = self.incoming_registrations.pop(heart)[0]
994 self.log.info("registration::purging stalled registration: %i"%eid)
994 self.log.info("registration::purging stalled registration: %i"%eid)
995 else:
995 else:
996 pass
996 pass
997
997
998 #-------------------------------------------------------------------------
998 #-------------------------------------------------------------------------
999 # Client Requests
999 # Client Requests
1000 #-------------------------------------------------------------------------
1000 #-------------------------------------------------------------------------
1001
1001
1002 def shutdown_request(self, client_id, msg):
1002 def shutdown_request(self, client_id, msg):
1003 """handle shutdown request."""
1003 """handle shutdown request."""
1004 self.session.send(self.query, 'shutdown_reply', content={'status': 'ok'}, ident=client_id)
1004 self.session.send(self.query, 'shutdown_reply', content={'status': 'ok'}, ident=client_id)
1005 # also notify other clients of shutdown
1005 # also notify other clients of shutdown
1006 self.session.send(self.notifier, 'shutdown_notice', content={'status': 'ok'})
1006 self.session.send(self.notifier, 'shutdown_notice', content={'status': 'ok'})
1007 dc = ioloop.DelayedCallback(lambda : self._shutdown(), 1000, self.loop)
1007 dc = ioloop.DelayedCallback(lambda : self._shutdown(), 1000, self.loop)
1008 dc.start()
1008 dc.start()
1009
1009
1010 def _shutdown(self):
1010 def _shutdown(self):
1011 self.log.info("hub::hub shutting down.")
1011 self.log.info("hub::hub shutting down.")
1012 time.sleep(0.1)
1012 time.sleep(0.1)
1013 sys.exit(0)
1013 sys.exit(0)
1014
1014
1015
1015
1016 def check_load(self, client_id, msg):
1016 def check_load(self, client_id, msg):
1017 content = msg['content']
1017 content = msg['content']
1018 try:
1018 try:
1019 targets = content['targets']
1019 targets = content['targets']
1020 targets = self._validate_targets(targets)
1020 targets = self._validate_targets(targets)
1021 except:
1021 except:
1022 content = error.wrap_exception()
1022 content = error.wrap_exception()
1023 self.session.send(self.query, "hub_error",
1023 self.session.send(self.query, "hub_error",
1024 content=content, ident=client_id)
1024 content=content, ident=client_id)
1025 return
1025 return
1026
1026
1027 content = dict(status='ok')
1027 content = dict(status='ok')
1028 # loads = {}
1028 # loads = {}
1029 for t in targets:
1029 for t in targets:
1030 content[bytes(t)] = len(self.queues[t])+len(self.tasks[t])
1030 content[bytes(t)] = len(self.queues[t])+len(self.tasks[t])
1031 self.session.send(self.query, "load_reply", content=content, ident=client_id)
1031 self.session.send(self.query, "load_reply", content=content, ident=client_id)
1032
1032
1033
1033
1034 def queue_status(self, client_id, msg):
1034 def queue_status(self, client_id, msg):
1035 """Return the Queue status of one or more targets.
1035 """Return the Queue status of one or more targets.
1036 if verbose: return the msg_ids
1036 if verbose: return the msg_ids
1037 else: return len of each type.
1037 else: return len of each type.
1038 keys: queue (pending MUX jobs)
1038 keys: queue (pending MUX jobs)
1039 tasks (pending Task jobs)
1039 tasks (pending Task jobs)
1040 completed (finished jobs from both queues)"""
1040 completed (finished jobs from both queues)"""
1041 content = msg['content']
1041 content = msg['content']
1042 targets = content['targets']
1042 targets = content['targets']
1043 try:
1043 try:
1044 targets = self._validate_targets(targets)
1044 targets = self._validate_targets(targets)
1045 except:
1045 except:
1046 content = error.wrap_exception()
1046 content = error.wrap_exception()
1047 self.session.send(self.query, "hub_error",
1047 self.session.send(self.query, "hub_error",
1048 content=content, ident=client_id)
1048 content=content, ident=client_id)
1049 return
1049 return
1050 verbose = content.get('verbose', False)
1050 verbose = content.get('verbose', False)
1051 content = dict(status='ok')
1051 content = dict(status='ok')
1052 for t in targets:
1052 for t in targets:
1053 queue = self.queues[t]
1053 queue = self.queues[t]
1054 completed = self.completed[t]
1054 completed = self.completed[t]
1055 tasks = self.tasks[t]
1055 tasks = self.tasks[t]
1056 if not verbose:
1056 if not verbose:
1057 queue = len(queue)
1057 queue = len(queue)
1058 completed = len(completed)
1058 completed = len(completed)
1059 tasks = len(tasks)
1059 tasks = len(tasks)
1060 content[str(t)] = {'queue': queue, 'completed': completed , 'tasks': tasks}
1060 content[str(t)] = {'queue': queue, 'completed': completed , 'tasks': tasks}
1061 content['unassigned'] = list(self.unassigned) if verbose else len(self.unassigned)
1061 content['unassigned'] = list(self.unassigned) if verbose else len(self.unassigned)
1062 # print (content)
1062 # print (content)
1063 self.session.send(self.query, "queue_reply", content=content, ident=client_id)
1063 self.session.send(self.query, "queue_reply", content=content, ident=client_id)
1064
1064
1065 def purge_results(self, client_id, msg):
1065 def purge_results(self, client_id, msg):
1066 """Purge results from memory. This method is more valuable before we move
1066 """Purge results from memory. This method is more valuable before we move
1067 to a DB based message storage mechanism."""
1067 to a DB based message storage mechanism."""
1068 content = msg['content']
1068 content = msg['content']
1069 self.log.info("Dropping records with %s", content)
1069 self.log.info("Dropping records with %s", content)
1070 msg_ids = content.get('msg_ids', [])
1070 msg_ids = content.get('msg_ids', [])
1071 reply = dict(status='ok')
1071 reply = dict(status='ok')
1072 if msg_ids == 'all':
1072 if msg_ids == 'all':
1073 try:
1073 try:
1074 self.db.drop_matching_records(dict(completed={'$ne':None}))
1074 self.db.drop_matching_records(dict(completed={'$ne':None}))
1075 except Exception:
1075 except Exception:
1076 reply = error.wrap_exception()
1076 reply = error.wrap_exception()
1077 else:
1077 else:
1078 pending = filter(lambda m: m in self.pending, msg_ids)
1078 pending = filter(lambda m: m in self.pending, msg_ids)
1079 if pending:
1079 if pending:
1080 try:
1080 try:
1081 raise IndexError("msg pending: %r"%pending[0])
1081 raise IndexError("msg pending: %r"%pending[0])
1082 except:
1082 except:
1083 reply = error.wrap_exception()
1083 reply = error.wrap_exception()
1084 else:
1084 else:
1085 try:
1085 try:
1086 self.db.drop_matching_records(dict(msg_id={'$in':msg_ids}))
1086 self.db.drop_matching_records(dict(msg_id={'$in':msg_ids}))
1087 except Exception:
1087 except Exception:
1088 reply = error.wrap_exception()
1088 reply = error.wrap_exception()
1089
1089
1090 if reply['status'] == 'ok':
1090 if reply['status'] == 'ok':
1091 eids = content.get('engine_ids', [])
1091 eids = content.get('engine_ids', [])
1092 for eid in eids:
1092 for eid in eids:
1093 if eid not in self.engines:
1093 if eid not in self.engines:
1094 try:
1094 try:
1095 raise IndexError("No such engine: %i"%eid)
1095 raise IndexError("No such engine: %i"%eid)
1096 except:
1096 except:
1097 reply = error.wrap_exception()
1097 reply = error.wrap_exception()
1098 break
1098 break
1099 uid = self.engines[eid].queue
1099 uid = self.engines[eid].queue
1100 try:
1100 try:
1101 self.db.drop_matching_records(dict(engine_uuid=uid, completed={'$ne':None}))
1101 self.db.drop_matching_records(dict(engine_uuid=uid, completed={'$ne':None}))
1102 except Exception:
1102 except Exception:
1103 reply = error.wrap_exception()
1103 reply = error.wrap_exception()
1104 break
1104 break
1105
1105
1106 self.session.send(self.query, 'purge_reply', content=reply, ident=client_id)
1106 self.session.send(self.query, 'purge_reply', content=reply, ident=client_id)
1107
1107
1108 def resubmit_task(self, client_id, msg):
1108 def resubmit_task(self, client_id, msg):
1109 """Resubmit one or more tasks."""
1109 """Resubmit one or more tasks."""
1110 def finish(reply):
1110 def finish(reply):
1111 self.session.send(self.query, 'resubmit_reply', content=reply, ident=client_id)
1111 self.session.send(self.query, 'resubmit_reply', content=reply, ident=client_id)
1112
1112
1113 content = msg['content']
1113 content = msg['content']
1114 msg_ids = content['msg_ids']
1114 msg_ids = content['msg_ids']
1115 reply = dict(status='ok')
1115 reply = dict(status='ok')
1116 try:
1116 try:
1117 records = self.db.find_records({'msg_id' : {'$in' : msg_ids}}, keys=[
1117 records = self.db.find_records({'msg_id' : {'$in' : msg_ids}}, keys=[
1118 'header', 'content', 'buffers'])
1118 'header', 'content', 'buffers'])
1119 except Exception:
1119 except Exception:
1120 self.log.error('db::db error finding tasks to resubmit', exc_info=True)
1120 self.log.error('db::db error finding tasks to resubmit', exc_info=True)
1121 return finish(error.wrap_exception())
1121 return finish(error.wrap_exception())
1122
1122
1123 # validate msg_ids
1123 # validate msg_ids
1124 found_ids = [ rec['msg_id'] for rec in records ]
1124 found_ids = [ rec['msg_id'] for rec in records ]
1125 invalid_ids = filter(lambda m: m in self.pending, found_ids)
1125 invalid_ids = filter(lambda m: m in self.pending, found_ids)
1126 if len(records) > len(msg_ids):
1126 if len(records) > len(msg_ids):
1127 try:
1127 try:
1128 raise RuntimeError("DB appears to be in an inconsistent state."
1128 raise RuntimeError("DB appears to be in an inconsistent state."
1129 "More matching records were found than should exist")
1129 "More matching records were found than should exist")
1130 except Exception:
1130 except Exception:
1131 return finish(error.wrap_exception())
1131 return finish(error.wrap_exception())
1132 elif len(records) < len(msg_ids):
1132 elif len(records) < len(msg_ids):
1133 missing = [ m for m in msg_ids if m not in found_ids ]
1133 missing = [ m for m in msg_ids if m not in found_ids ]
1134 try:
1134 try:
1135 raise KeyError("No such msg(s): %r"%missing)
1135 raise KeyError("No such msg(s): %r"%missing)
1136 except KeyError:
1136 except KeyError:
1137 return finish(error.wrap_exception())
1137 return finish(error.wrap_exception())
1138 elif invalid_ids:
1138 elif invalid_ids:
1139 msg_id = invalid_ids[0]
1139 msg_id = invalid_ids[0]
1140 try:
1140 try:
1141 raise ValueError("Task %r appears to be inflight"%(msg_id))
1141 raise ValueError("Task %r appears to be inflight"%(msg_id))
1142 except Exception:
1142 except Exception:
1143 return finish(error.wrap_exception())
1143 return finish(error.wrap_exception())
1144
1144
1145 # clear the existing records
1145 # clear the existing records
1146 now = datetime.now()
1146 now = datetime.now()
1147 rec = empty_record()
1147 rec = empty_record()
1148 map(rec.pop, ['msg_id', 'header', 'content', 'buffers', 'submitted'])
1148 map(rec.pop, ['msg_id', 'header', 'content', 'buffers', 'submitted'])
1149 rec['resubmitted'] = now
1149 rec['resubmitted'] = now
1150 rec['queue'] = 'task'
1150 rec['queue'] = 'task'
1151 rec['client_uuid'] = client_id[0]
1151 rec['client_uuid'] = client_id[0]
1152 try:
1152 try:
1153 for msg_id in msg_ids:
1153 for msg_id in msg_ids:
1154 self.all_completed.discard(msg_id)
1154 self.all_completed.discard(msg_id)
1155 self.db.update_record(msg_id, rec)
1155 self.db.update_record(msg_id, rec)
1156 except Exception:
1156 except Exception:
1157 self.log.error('db::db error upating record', exc_info=True)
1157 self.log.error('db::db error upating record', exc_info=True)
1158 reply = error.wrap_exception()
1158 reply = error.wrap_exception()
1159 else:
1159 else:
1160 # send the messages
1160 # send the messages
1161 for rec in records:
1161 for rec in records:
1162 header = rec['header']
1162 header = rec['header']
1163 # include resubmitted in header to prevent digest collision
1163 # include resubmitted in header to prevent digest collision
1164 header['resubmitted'] = now
1164 header['resubmitted'] = now
1165 msg = self.session.msg(header['msg_type'])
1165 msg = self.session.msg(header['msg_type'])
1166 msg['content'] = rec['content']
1166 msg['content'] = rec['content']
1167 msg['header'] = header
1167 msg['header'] = header
1168 msg['msg_id'] = rec['msg_id']
1168 msg['msg_id'] = rec['msg_id']
1169 self.session.send(self.resubmit, msg, buffers=rec['buffers'])
1169 self.session.send(self.resubmit, msg, buffers=rec['buffers'])
1170
1170
1171 finish(dict(status='ok'))
1171 finish(dict(status='ok'))
1172
1172
1173
1173
1174 def _extract_record(self, rec):
1174 def _extract_record(self, rec):
1175 """decompose a TaskRecord dict into subsection of reply for get_result"""
1175 """decompose a TaskRecord dict into subsection of reply for get_result"""
1176 io_dict = {}
1176 io_dict = {}
1177 for key in 'pyin pyout pyerr stdout stderr'.split():
1177 for key in 'pyin pyout pyerr stdout stderr'.split():
1178 io_dict[key] = rec[key]
1178 io_dict[key] = rec[key]
1179 content = { 'result_content': rec['result_content'],
1179 content = { 'result_content': rec['result_content'],
1180 'header': rec['header'],
1180 'header': rec['header'],
1181 'result_header' : rec['result_header'],
1181 'result_header' : rec['result_header'],
1182 'io' : io_dict,
1182 'io' : io_dict,
1183 }
1183 }
1184 if rec['result_buffers']:
1184 if rec['result_buffers']:
1185 buffers = map(bytes, rec['result_buffers'])
1185 buffers = map(bytes, rec['result_buffers'])
1186 else:
1186 else:
1187 buffers = []
1187 buffers = []
1188
1188
1189 return content, buffers
1189 return content, buffers
1190
1190
1191 def get_results(self, client_id, msg):
1191 def get_results(self, client_id, msg):
1192 """Get the result of 1 or more messages."""
1192 """Get the result of 1 or more messages."""
1193 content = msg['content']
1193 content = msg['content']
1194 msg_ids = sorted(set(content['msg_ids']))
1194 msg_ids = sorted(set(content['msg_ids']))
1195 statusonly = content.get('status_only', False)
1195 statusonly = content.get('status_only', False)
1196 pending = []
1196 pending = []
1197 completed = []
1197 completed = []
1198 content = dict(status='ok')
1198 content = dict(status='ok')
1199 content['pending'] = pending
1199 content['pending'] = pending
1200 content['completed'] = completed
1200 content['completed'] = completed
1201 buffers = []
1201 buffers = []
1202 if not statusonly:
1202 if not statusonly:
1203 try:
1203 try:
1204 matches = self.db.find_records(dict(msg_id={'$in':msg_ids}))
1204 matches = self.db.find_records(dict(msg_id={'$in':msg_ids}))
1205 # turn match list into dict, for faster lookup
1205 # turn match list into dict, for faster lookup
1206 records = {}
1206 records = {}
1207 for rec in matches:
1207 for rec in matches:
1208 records[rec['msg_id']] = rec
1208 records[rec['msg_id']] = rec
1209 except Exception:
1209 except Exception:
1210 content = error.wrap_exception()
1210 content = error.wrap_exception()
1211 self.session.send(self.query, "result_reply", content=content,
1211 self.session.send(self.query, "result_reply", content=content,
1212 parent=msg, ident=client_id)
1212 parent=msg, ident=client_id)
1213 return
1213 return
1214 else:
1214 else:
1215 records = {}
1215 records = {}
1216 for msg_id in msg_ids:
1216 for msg_id in msg_ids:
1217 if msg_id in self.pending:
1217 if msg_id in self.pending:
1218 pending.append(msg_id)
1218 pending.append(msg_id)
1219 elif msg_id in self.all_completed:
1219 elif msg_id in self.all_completed:
1220 completed.append(msg_id)
1220 completed.append(msg_id)
1221 if not statusonly:
1221 if not statusonly:
1222 c,bufs = self._extract_record(records[msg_id])
1222 c,bufs = self._extract_record(records[msg_id])
1223 content[msg_id] = c
1223 content[msg_id] = c
1224 buffers.extend(bufs)
1224 buffers.extend(bufs)
1225 elif msg_id in records:
1225 elif msg_id in records:
1226 if rec['completed']:
1226 if rec['completed']:
1227 completed.append(msg_id)
1227 completed.append(msg_id)
1228 c,bufs = self._extract_record(records[msg_id])
1228 c,bufs = self._extract_record(records[msg_id])
1229 content[msg_id] = c
1229 content[msg_id] = c
1230 buffers.extend(bufs)
1230 buffers.extend(bufs)
1231 else:
1231 else:
1232 pending.append(msg_id)
1232 pending.append(msg_id)
1233 else:
1233 else:
1234 try:
1234 try:
1235 raise KeyError('No such message: '+msg_id)
1235 raise KeyError('No such message: '+msg_id)
1236 except:
1236 except:
1237 content = error.wrap_exception()
1237 content = error.wrap_exception()
1238 break
1238 break
1239 self.session.send(self.query, "result_reply", content=content,
1239 self.session.send(self.query, "result_reply", content=content,
1240 parent=msg, ident=client_id,
1240 parent=msg, ident=client_id,
1241 buffers=buffers)
1241 buffers=buffers)
1242
1242
1243 def get_history(self, client_id, msg):
1243 def get_history(self, client_id, msg):
1244 """Get a list of all msg_ids in our DB records"""
1244 """Get a list of all msg_ids in our DB records"""
1245 try:
1245 try:
1246 msg_ids = self.db.get_history()
1246 msg_ids = self.db.get_history()
1247 except Exception as e:
1247 except Exception as e:
1248 content = error.wrap_exception()
1248 content = error.wrap_exception()
1249 else:
1249 else:
1250 content = dict(status='ok', history=msg_ids)
1250 content = dict(status='ok', history=msg_ids)
1251
1251
1252 self.session.send(self.query, "history_reply", content=content,
1252 self.session.send(self.query, "history_reply", content=content,
1253 parent=msg, ident=client_id)
1253 parent=msg, ident=client_id)
1254
1254
1255 def db_query(self, client_id, msg):
1255 def db_query(self, client_id, msg):
1256 """Perform a raw query on the task record database."""
1256 """Perform a raw query on the task record database."""
1257 content = msg['content']
1257 content = msg['content']
1258 query = content.get('query', {})
1258 query = content.get('query', {})
1259 keys = content.get('keys', None)
1259 keys = content.get('keys', None)
1260 buffers = []
1260 buffers = []
1261 empty = list()
1261 empty = list()
1262 try:
1262 try:
1263 records = self.db.find_records(query, keys)
1263 records = self.db.find_records(query, keys)
1264 except Exception as e:
1264 except Exception as e:
1265 content = error.wrap_exception()
1265 content = error.wrap_exception()
1266 else:
1266 else:
1267 # extract buffers from reply content:
1267 # extract buffers from reply content:
1268 if keys is not None:
1268 if keys is not None:
1269 buffer_lens = [] if 'buffers' in keys else None
1269 buffer_lens = [] if 'buffers' in keys else None
1270 result_buffer_lens = [] if 'result_buffers' in keys else None
1270 result_buffer_lens = [] if 'result_buffers' in keys else None
1271 else:
1271 else:
1272 buffer_lens = []
1272 buffer_lens = []
1273 result_buffer_lens = []
1273 result_buffer_lens = []
1274
1274
1275 for rec in records:
1275 for rec in records:
1276 # buffers may be None, so double check
1276 # buffers may be None, so double check
1277 if buffer_lens is not None:
1277 if buffer_lens is not None:
1278 b = rec.pop('buffers', empty) or empty
1278 b = rec.pop('buffers', empty) or empty
1279 buffer_lens.append(len(b))
1279 buffer_lens.append(len(b))
1280 buffers.extend(b)
1280 buffers.extend(b)
1281 if result_buffer_lens is not None:
1281 if result_buffer_lens is not None:
1282 rb = rec.pop('result_buffers', empty) or empty
1282 rb = rec.pop('result_buffers', empty) or empty
1283 result_buffer_lens.append(len(rb))
1283 result_buffer_lens.append(len(rb))
1284 buffers.extend(rb)
1284 buffers.extend(rb)
1285 content = dict(status='ok', records=records, buffer_lens=buffer_lens,
1285 content = dict(status='ok', records=records, buffer_lens=buffer_lens,
1286 result_buffer_lens=result_buffer_lens)
1286 result_buffer_lens=result_buffer_lens)
1287 # self.log.debug (content)
1287 # self.log.debug (content)
1288 self.session.send(self.query, "db_reply", content=content,
1288 self.session.send(self.query, "db_reply", content=content,
1289 parent=msg, ident=client_id,
1289 parent=msg, ident=client_id,
1290 buffers=buffers)
1290 buffers=buffers)
1291
1291
@@ -1,714 +1,714
1 """The Python scheduler for rich scheduling.
1 """The Python scheduler for rich scheduling.
2
2
3 The Pure ZMQ scheduler does not allow routing schemes other than LRU,
3 The Pure ZMQ scheduler does not allow routing schemes other than LRU,
4 nor does it check msg_id DAG dependencies. For those, a slightly slower
4 nor does it check msg_id DAG dependencies. For those, a slightly slower
5 Python Scheduler exists.
5 Python Scheduler exists.
6
6
7 Authors:
7 Authors:
8
8
9 * Min RK
9 * Min RK
10 """
10 """
11 #-----------------------------------------------------------------------------
11 #-----------------------------------------------------------------------------
12 # Copyright (C) 2010-2011 The IPython Development Team
12 # Copyright (C) 2010-2011 The IPython Development Team
13 #
13 #
14 # Distributed under the terms of the BSD License. The full license is in
14 # Distributed under the terms of the BSD License. The full license is in
15 # the file COPYING, distributed as part of this software.
15 # the file COPYING, distributed as part of this software.
16 #-----------------------------------------------------------------------------
16 #-----------------------------------------------------------------------------
17
17
18 #----------------------------------------------------------------------
18 #----------------------------------------------------------------------
19 # Imports
19 # Imports
20 #----------------------------------------------------------------------
20 #----------------------------------------------------------------------
21
21
22 from __future__ import print_function
22 from __future__ import print_function
23
23
24 import logging
24 import logging
25 import sys
25 import sys
26
26
27 from datetime import datetime, timedelta
27 from datetime import datetime, timedelta
28 from random import randint, random
28 from random import randint, random
29 from types import FunctionType
29 from types import FunctionType
30
30
31 try:
31 try:
32 import numpy
32 import numpy
33 except ImportError:
33 except ImportError:
34 numpy = None
34 numpy = None
35
35
36 import zmq
36 import zmq
37 from zmq.eventloop import ioloop, zmqstream
37 from zmq.eventloop import ioloop, zmqstream
38
38
39 # local imports
39 # local imports
40 from IPython.external.decorator import decorator
40 from IPython.external.decorator import decorator
41 from IPython.config.application import Application
41 from IPython.config.application import Application
42 from IPython.config.loader import Config
42 from IPython.config.loader import Config
43 from IPython.utils.traitlets import Instance, Dict, List, Set, Int, Enum, CBytes
43 from IPython.utils.traitlets import Instance, Dict, List, Set, Int, Enum, CBytes
44
44
45 from IPython.parallel import error
45 from IPython.parallel import error
46 from IPython.parallel.factory import SessionFactory
46 from IPython.parallel.factory import SessionFactory
47 from IPython.parallel.util import connect_logger, local_logger, ensure_bytes
47 from IPython.parallel.util import connect_logger, local_logger, ensure_bytes
48
48
49 from .dependency import Dependency
49 from .dependency import Dependency
50
50
51 @decorator
51 @decorator
52 def logged(f,self,*args,**kwargs):
52 def logged(f,self,*args,**kwargs):
53 # print ("#--------------------")
53 # print ("#--------------------")
54 self.log.debug("scheduler::%s(*%s,**%s)", f.func_name, args, kwargs)
54 self.log.debug("scheduler::%s(*%s,**%s)", f.func_name, args, kwargs)
55 # print ("#--")
55 # print ("#--")
56 return f(self,*args, **kwargs)
56 return f(self,*args, **kwargs)
57
57
58 #----------------------------------------------------------------------
58 #----------------------------------------------------------------------
59 # Chooser functions
59 # Chooser functions
60 #----------------------------------------------------------------------
60 #----------------------------------------------------------------------
61
61
62 def plainrandom(loads):
62 def plainrandom(loads):
63 """Plain random pick."""
63 """Plain random pick."""
64 n = len(loads)
64 n = len(loads)
65 return randint(0,n-1)
65 return randint(0,n-1)
66
66
67 def lru(loads):
67 def lru(loads):
68 """Always pick the front of the line.
68 """Always pick the front of the line.
69
69
70 The content of `loads` is ignored.
70 The content of `loads` is ignored.
71
71
72 Assumes LRU ordering of loads, with oldest first.
72 Assumes LRU ordering of loads, with oldest first.
73 """
73 """
74 return 0
74 return 0
75
75
76 def twobin(loads):
76 def twobin(loads):
77 """Pick two at random, use the LRU of the two.
77 """Pick two at random, use the LRU of the two.
78
78
79 The content of loads is ignored.
79 The content of loads is ignored.
80
80
81 Assumes LRU ordering of loads, with oldest first.
81 Assumes LRU ordering of loads, with oldest first.
82 """
82 """
83 n = len(loads)
83 n = len(loads)
84 a = randint(0,n-1)
84 a = randint(0,n-1)
85 b = randint(0,n-1)
85 b = randint(0,n-1)
86 return min(a,b)
86 return min(a,b)
87
87
88 def weighted(loads):
88 def weighted(loads):
89 """Pick two at random using inverse load as weight.
89 """Pick two at random using inverse load as weight.
90
90
91 Return the less loaded of the two.
91 Return the less loaded of the two.
92 """
92 """
93 # weight 0 a million times more than 1:
93 # weight 0 a million times more than 1:
94 weights = 1./(1e-6+numpy.array(loads))
94 weights = 1./(1e-6+numpy.array(loads))
95 sums = weights.cumsum()
95 sums = weights.cumsum()
96 t = sums[-1]
96 t = sums[-1]
97 x = random()*t
97 x = random()*t
98 y = random()*t
98 y = random()*t
99 idx = 0
99 idx = 0
100 idy = 0
100 idy = 0
101 while sums[idx] < x:
101 while sums[idx] < x:
102 idx += 1
102 idx += 1
103 while sums[idy] < y:
103 while sums[idy] < y:
104 idy += 1
104 idy += 1
105 if weights[idy] > weights[idx]:
105 if weights[idy] > weights[idx]:
106 return idy
106 return idy
107 else:
107 else:
108 return idx
108 return idx
109
109
110 def leastload(loads):
110 def leastload(loads):
111 """Always choose the lowest load.
111 """Always choose the lowest load.
112
112
113 If the lowest load occurs more than once, the first
113 If the lowest load occurs more than once, the first
114 occurance will be used. If loads has LRU ordering, this means
114 occurance will be used. If loads has LRU ordering, this means
115 the LRU of those with the lowest load is chosen.
115 the LRU of those with the lowest load is chosen.
116 """
116 """
117 return loads.index(min(loads))
117 return loads.index(min(loads))
118
118
119 #---------------------------------------------------------------------
119 #---------------------------------------------------------------------
120 # Classes
120 # Classes
121 #---------------------------------------------------------------------
121 #---------------------------------------------------------------------
122 # store empty default dependency:
122 # store empty default dependency:
123 MET = Dependency([])
123 MET = Dependency([])
124
124
125 class TaskScheduler(SessionFactory):
125 class TaskScheduler(SessionFactory):
126 """Python TaskScheduler object.
126 """Python TaskScheduler object.
127
127
128 This is the simplest object that supports msg_id based
128 This is the simplest object that supports msg_id based
129 DAG dependencies. *Only* task msg_ids are checked, not
129 DAG dependencies. *Only* task msg_ids are checked, not
130 msg_ids of jobs submitted via the MUX queue.
130 msg_ids of jobs submitted via the MUX queue.
131
131
132 """
132 """
133
133
134 hwm = Int(0, config=True, shortname='hwm',
134 hwm = Int(0, config=True, shortname='hwm',
135 help="""specify the High Water Mark (HWM) for the downstream
135 help="""specify the High Water Mark (HWM) for the downstream
136 socket in the Task scheduler. This is the maximum number
136 socket in the Task scheduler. This is the maximum number
137 of allowed outstanding tasks on each engine."""
137 of allowed outstanding tasks on each engine."""
138 )
138 )
139 scheme_name = Enum(('leastload', 'pure', 'lru', 'plainrandom', 'weighted', 'twobin'),
139 scheme_name = Enum(('leastload', 'pure', 'lru', 'plainrandom', 'weighted', 'twobin'),
140 'leastload', config=True, shortname='scheme', allow_none=False,
140 'leastload', config=True, shortname='scheme', allow_none=False,
141 help="""select the task scheduler scheme [default: Python LRU]
141 help="""select the task scheduler scheme [default: Python LRU]
142 Options are: 'pure', 'lru', 'plainrandom', 'weighted', 'twobin','leastload'"""
142 Options are: 'pure', 'lru', 'plainrandom', 'weighted', 'twobin','leastload'"""
143 )
143 )
144 def _scheme_name_changed(self, old, new):
144 def _scheme_name_changed(self, old, new):
145 self.log.debug("Using scheme %r"%new)
145 self.log.debug("Using scheme %r"%new)
146 self.scheme = globals()[new]
146 self.scheme = globals()[new]
147
147
148 # input arguments:
148 # input arguments:
149 scheme = Instance(FunctionType) # function for determining the destination
149 scheme = Instance(FunctionType) # function for determining the destination
150 def _scheme_default(self):
150 def _scheme_default(self):
151 return leastload
151 return leastload
152 client_stream = Instance(zmqstream.ZMQStream) # client-facing stream
152 client_stream = Instance(zmqstream.ZMQStream) # client-facing stream
153 engine_stream = Instance(zmqstream.ZMQStream) # engine-facing stream
153 engine_stream = Instance(zmqstream.ZMQStream) # engine-facing stream
154 notifier_stream = Instance(zmqstream.ZMQStream) # hub-facing sub stream
154 notifier_stream = Instance(zmqstream.ZMQStream) # hub-facing sub stream
155 mon_stream = Instance(zmqstream.ZMQStream) # hub-facing pub stream
155 mon_stream = Instance(zmqstream.ZMQStream) # hub-facing pub stream
156
156
157 # internals:
157 # internals:
158 graph = Dict() # dict by msg_id of [ msg_ids that depend on key ]
158 graph = Dict() # dict by msg_id of [ msg_ids that depend on key ]
159 retries = Dict() # dict by msg_id of retries remaining (non-neg ints)
159 retries = Dict() # dict by msg_id of retries remaining (non-neg ints)
160 # waiting = List() # list of msg_ids ready to run, but haven't due to HWM
160 # waiting = List() # list of msg_ids ready to run, but haven't due to HWM
161 depending = Dict() # dict by msg_id of (msg_id, raw_msg, after, follow)
161 depending = Dict() # dict by msg_id of (msg_id, raw_msg, after, follow)
162 pending = Dict() # dict by engine_uuid of submitted tasks
162 pending = Dict() # dict by engine_uuid of submitted tasks
163 completed = Dict() # dict by engine_uuid of completed tasks
163 completed = Dict() # dict by engine_uuid of completed tasks
164 failed = Dict() # dict by engine_uuid of failed tasks
164 failed = Dict() # dict by engine_uuid of failed tasks
165 destinations = Dict() # dict by msg_id of engine_uuids where jobs ran (reverse of completed+failed)
165 destinations = Dict() # dict by msg_id of engine_uuids where jobs ran (reverse of completed+failed)
166 clients = Dict() # dict by msg_id for who submitted the task
166 clients = Dict() # dict by msg_id for who submitted the task
167 targets = List() # list of target IDENTs
167 targets = List() # list of target IDENTs
168 loads = List() # list of engine loads
168 loads = List() # list of engine loads
169 # full = Set() # set of IDENTs that have HWM outstanding tasks
169 # full = Set() # set of IDENTs that have HWM outstanding tasks
170 all_completed = Set() # set of all completed tasks
170 all_completed = Set() # set of all completed tasks
171 all_failed = Set() # set of all failed tasks
171 all_failed = Set() # set of all failed tasks
172 all_done = Set() # set of all finished tasks=union(completed,failed)
172 all_done = Set() # set of all finished tasks=union(completed,failed)
173 all_ids = Set() # set of all submitted task IDs
173 all_ids = Set() # set of all submitted task IDs
174 blacklist = Dict() # dict by msg_id of locations where a job has encountered UnmetDependency
174 blacklist = Dict() # dict by msg_id of locations where a job has encountered UnmetDependency
175 auditor = Instance('zmq.eventloop.ioloop.PeriodicCallback')
175 auditor = Instance('zmq.eventloop.ioloop.PeriodicCallback')
176
176
177 ident = CBytes() # ZMQ identity. This should just be self.session.session
177 ident = CBytes() # ZMQ identity. This should just be self.session.session
178 # but ensure Bytes
178 # but ensure Bytes
179 def _ident_default(self):
179 def _ident_default(self):
180 return ensure_bytes(self.session.session)
180 return ensure_bytes(self.session.session)
181
181
182 def start(self):
182 def start(self):
183 self.engine_stream.on_recv(self.dispatch_result, copy=False)
183 self.engine_stream.on_recv(self.dispatch_result, copy=False)
184 self._notification_handlers = dict(
184 self._notification_handlers = dict(
185 registration_notification = self._register_engine,
185 registration_notification = self._register_engine,
186 unregistration_notification = self._unregister_engine
186 unregistration_notification = self._unregister_engine
187 )
187 )
188 self.notifier_stream.on_recv(self.dispatch_notification)
188 self.notifier_stream.on_recv(self.dispatch_notification)
189 self.auditor = ioloop.PeriodicCallback(self.audit_timeouts, 2e3, self.loop) # 1 Hz
189 self.auditor = ioloop.PeriodicCallback(self.audit_timeouts, 2e3, self.loop) # 1 Hz
190 self.auditor.start()
190 self.auditor.start()
191 self.log.info("Scheduler started [%s]"%self.scheme_name)
191 self.log.info("Scheduler started [%s]"%self.scheme_name)
192
192
193 def resume_receiving(self):
193 def resume_receiving(self):
194 """Resume accepting jobs."""
194 """Resume accepting jobs."""
195 self.client_stream.on_recv(self.dispatch_submission, copy=False)
195 self.client_stream.on_recv(self.dispatch_submission, copy=False)
196
196
197 def stop_receiving(self):
197 def stop_receiving(self):
198 """Stop accepting jobs while there are no engines.
198 """Stop accepting jobs while there are no engines.
199 Leave them in the ZMQ queue."""
199 Leave them in the ZMQ queue."""
200 self.client_stream.on_recv(None)
200 self.client_stream.on_recv(None)
201
201
202 #-----------------------------------------------------------------------
202 #-----------------------------------------------------------------------
203 # [Un]Registration Handling
203 # [Un]Registration Handling
204 #-----------------------------------------------------------------------
204 #-----------------------------------------------------------------------
205
205
206 def dispatch_notification(self, msg):
206 def dispatch_notification(self, msg):
207 """dispatch register/unregister events."""
207 """dispatch register/unregister events."""
208 try:
208 try:
209 idents,msg = self.session.feed_identities(msg)
209 idents,msg = self.session.feed_identities(msg)
210 except ValueError:
210 except ValueError:
211 self.log.warn("task::Invalid Message: %r",msg)
211 self.log.warn("task::Invalid Message: %r",msg)
212 return
212 return
213 try:
213 try:
214 msg = self.session.unpack_message(msg)
214 msg = self.session.unpack_message(msg)
215 except ValueError:
215 except ValueError:
216 self.log.warn("task::Unauthorized message from: %r"%idents)
216 self.log.warn("task::Unauthorized message from: %r"%idents)
217 return
217 return
218
218
219 msg_type = msg['msg_type']
219 msg_type = msg['msg_type']
220
220
221 handler = self._notification_handlers.get(msg_type, None)
221 handler = self._notification_handlers.get(msg_type, None)
222 if handler is None:
222 if handler is None:
223 self.log.error("Unhandled message type: %r"%msg_type)
223 self.log.error("Unhandled message type: %r"%msg_type)
224 else:
224 else:
225 try:
225 try:
226 handler(ensure_bytes(msg['content']['queue']))
226 handler(ensure_bytes(msg['content']['queue']))
227 except Exception:
227 except Exception:
228 self.log.error("task::Invalid notification msg: %r",msg)
228 self.log.error("task::Invalid notification msg: %r",msg)
229
229
230 def _register_engine(self, uid):
230 def _register_engine(self, uid):
231 """New engine with ident `uid` became available."""
231 """New engine with ident `uid` became available."""
232 # head of the line:
232 # head of the line:
233 self.targets.insert(0,uid)
233 self.targets.insert(0,uid)
234 self.loads.insert(0,0)
234 self.loads.insert(0,0)
235
235
236 # initialize sets
236 # initialize sets
237 self.completed[uid] = set()
237 self.completed[uid] = set()
238 self.failed[uid] = set()
238 self.failed[uid] = set()
239 self.pending[uid] = {}
239 self.pending[uid] = {}
240 if len(self.targets) == 1:
240 if len(self.targets) == 1:
241 self.resume_receiving()
241 self.resume_receiving()
242 # rescan the graph:
242 # rescan the graph:
243 self.update_graph(None)
243 self.update_graph(None)
244
244
245 def _unregister_engine(self, uid):
245 def _unregister_engine(self, uid):
246 """Existing engine with ident `uid` became unavailable."""
246 """Existing engine with ident `uid` became unavailable."""
247 if len(self.targets) == 1:
247 if len(self.targets) == 1:
248 # this was our only engine
248 # this was our only engine
249 self.stop_receiving()
249 self.stop_receiving()
250
250
251 # handle any potentially finished tasks:
251 # handle any potentially finished tasks:
252 self.engine_stream.flush()
252 self.engine_stream.flush()
253
253
254 # don't pop destinations, because they might be used later
254 # don't pop destinations, because they might be used later
255 # map(self.destinations.pop, self.completed.pop(uid))
255 # map(self.destinations.pop, self.completed.pop(uid))
256 # map(self.destinations.pop, self.failed.pop(uid))
256 # map(self.destinations.pop, self.failed.pop(uid))
257
257
258 # prevent this engine from receiving work
258 # prevent this engine from receiving work
259 idx = self.targets.index(uid)
259 idx = self.targets.index(uid)
260 self.targets.pop(idx)
260 self.targets.pop(idx)
261 self.loads.pop(idx)
261 self.loads.pop(idx)
262
262
263 # wait 5 seconds before cleaning up pending jobs, since the results might
263 # wait 5 seconds before cleaning up pending jobs, since the results might
264 # still be incoming
264 # still be incoming
265 if self.pending[uid]:
265 if self.pending[uid]:
266 dc = ioloop.DelayedCallback(lambda : self.handle_stranded_tasks(uid), 5000, self.loop)
266 dc = ioloop.DelayedCallback(lambda : self.handle_stranded_tasks(uid), 5000, self.loop)
267 dc.start()
267 dc.start()
268 else:
268 else:
269 self.completed.pop(uid)
269 self.completed.pop(uid)
270 self.failed.pop(uid)
270 self.failed.pop(uid)
271
271
272
272
273 def handle_stranded_tasks(self, engine):
273 def handle_stranded_tasks(self, engine):
274 """Deal with jobs resident in an engine that died."""
274 """Deal with jobs resident in an engine that died."""
275 lost = self.pending[engine]
275 lost = self.pending[engine]
276 for msg_id in lost.keys():
276 for msg_id in lost.keys():
277 if msg_id not in self.pending[engine]:
277 if msg_id not in self.pending[engine]:
278 # prevent double-handling of messages
278 # prevent double-handling of messages
279 continue
279 continue
280
280
281 raw_msg = lost[msg_id][0]
281 raw_msg = lost[msg_id][0]
282 idents,msg = self.session.feed_identities(raw_msg, copy=False)
282 idents,msg = self.session.feed_identities(raw_msg, copy=False)
283 parent = self.session.unpack(msg[1].bytes)
283 parent = self.session.unpack(msg[1].bytes)
284 idents = [engine, idents[0]]
284 idents = [engine, idents[0]]
285
285
286 # build fake error reply
286 # build fake error reply
287 try:
287 try:
288 raise error.EngineError("Engine %r died while running task %r"%(engine, msg_id))
288 raise error.EngineError("Engine %r died while running task %r"%(engine, msg_id))
289 except:
289 except:
290 content = error.wrap_exception()
290 content = error.wrap_exception()
291 msg = self.session.msg('apply_reply', content, parent=parent, subheader={'status':'error'})
291 msg = self.session.msg('apply_reply', content, parent=parent, subheader={'status':'error'})
292 raw_reply = map(zmq.Message, self.session.serialize(msg, ident=idents))
292 raw_reply = map(zmq.Message, self.session.serialize(msg, ident=idents))
293 # and dispatch it
293 # and dispatch it
294 self.dispatch_result(raw_reply)
294 self.dispatch_result(raw_reply)
295
295
296 # finally scrub completed/failed lists
296 # finally scrub completed/failed lists
297 self.completed.pop(engine)
297 self.completed.pop(engine)
298 self.failed.pop(engine)
298 self.failed.pop(engine)
299
299
300
300
301 #-----------------------------------------------------------------------
301 #-----------------------------------------------------------------------
302 # Job Submission
302 # Job Submission
303 #-----------------------------------------------------------------------
303 #-----------------------------------------------------------------------
304 def dispatch_submission(self, raw_msg):
304 def dispatch_submission(self, raw_msg):
305 """Dispatch job submission to appropriate handlers."""
305 """Dispatch job submission to appropriate handlers."""
306 # ensure targets up to date:
306 # ensure targets up to date:
307 self.notifier_stream.flush()
307 self.notifier_stream.flush()
308 try:
308 try:
309 idents, msg = self.session.feed_identities(raw_msg, copy=False)
309 idents, msg = self.session.feed_identities(raw_msg, copy=False)
310 msg = self.session.unpack_message(msg, content=False, copy=False)
310 msg = self.session.unpack_message(msg, content=False, copy=False)
311 except Exception:
311 except Exception:
312 self.log.error("task::Invaid task msg: %r"%raw_msg, exc_info=True)
312 self.log.error("task::Invaid task msg: %r"%raw_msg, exc_info=True)
313 return
313 return
314
314
315
315
316 # send to monitor
316 # send to monitor
317 self.mon_stream.send_multipart([b'intask']+raw_msg, copy=False)
317 self.mon_stream.send_multipart([b'intask']+raw_msg, copy=False)
318
318
319 header = msg['header']
319 header = msg['header']
320 msg_id = header['msg_id']
320 msg_id = header['msg_id']
321 self.all_ids.add(msg_id)
321 self.all_ids.add(msg_id)
322
322
323 # get targets as a set of bytes objects
323 # get targets as a set of bytes objects
324 # from a list of unicode objects
324 # from a list of unicode objects
325 targets = header.get('targets', [])
325 targets = header.get('targets', [])
326 targets = map(ensure_bytes, targets)
326 targets = map(ensure_bytes, targets)
327 targets = set(targets)
327 targets = set(targets)
328
328
329 retries = header.get('retries', 0)
329 retries = header.get('retries', 0)
330 self.retries[msg_id] = retries
330 self.retries[msg_id] = retries
331
331
332 # time dependencies
332 # time dependencies
333 after = header.get('after', None)
333 after = header.get('after', None)
334 if after:
334 if after:
335 after = Dependency(after)
335 after = Dependency(after)
336 if after.all:
336 if after.all:
337 if after.success:
337 if after.success:
338 after = Dependency(after.difference(self.all_completed),
338 after = Dependency(after.difference(self.all_completed),
339 success=after.success,
339 success=after.success,
340 failure=after.failure,
340 failure=after.failure,
341 all=after.all,
341 all=after.all,
342 )
342 )
343 if after.failure:
343 if after.failure:
344 after = Dependency(after.difference(self.all_failed),
344 after = Dependency(after.difference(self.all_failed),
345 success=after.success,
345 success=after.success,
346 failure=after.failure,
346 failure=after.failure,
347 all=after.all,
347 all=after.all,
348 )
348 )
349 if after.check(self.all_completed, self.all_failed):
349 if after.check(self.all_completed, self.all_failed):
350 # recast as empty set, if `after` already met,
350 # recast as empty set, if `after` already met,
351 # to prevent unnecessary set comparisons
351 # to prevent unnecessary set comparisons
352 after = MET
352 after = MET
353 else:
353 else:
354 after = MET
354 after = MET
355
355
356 # location dependencies
356 # location dependencies
357 follow = Dependency(header.get('follow', []))
357 follow = Dependency(header.get('follow', []))
358
358
359 # turn timeouts into datetime objects:
359 # turn timeouts into datetime objects:
360 timeout = header.get('timeout', None)
360 timeout = header.get('timeout', None)
361 if timeout:
361 if timeout:
362 timeout = datetime.now() + timedelta(0,timeout,0)
362 timeout = datetime.now() + timedelta(0,timeout,0)
363
363
364 args = [raw_msg, targets, after, follow, timeout]
364 args = [raw_msg, targets, after, follow, timeout]
365
365
366 # validate and reduce dependencies:
366 # validate and reduce dependencies:
367 for dep in after,follow:
367 for dep in after,follow:
368 if not dep: # empty dependency
368 if not dep: # empty dependency
369 continue
369 continue
370 # check valid:
370 # check valid:
371 if msg_id in dep or dep.difference(self.all_ids):
371 if msg_id in dep or dep.difference(self.all_ids):
372 self.depending[msg_id] = args
372 self.depending[msg_id] = args
373 return self.fail_unreachable(msg_id, error.InvalidDependency)
373 return self.fail_unreachable(msg_id, error.InvalidDependency)
374 # check if unreachable:
374 # check if unreachable:
375 if dep.unreachable(self.all_completed, self.all_failed):
375 if dep.unreachable(self.all_completed, self.all_failed):
376 self.depending[msg_id] = args
376 self.depending[msg_id] = args
377 return self.fail_unreachable(msg_id)
377 return self.fail_unreachable(msg_id)
378
378
379 if after.check(self.all_completed, self.all_failed):
379 if after.check(self.all_completed, self.all_failed):
380 # time deps already met, try to run
380 # time deps already met, try to run
381 if not self.maybe_run(msg_id, *args):
381 if not self.maybe_run(msg_id, *args):
382 # can't run yet
382 # can't run yet
383 if msg_id not in self.all_failed:
383 if msg_id not in self.all_failed:
384 # could have failed as unreachable
384 # could have failed as unreachable
385 self.save_unmet(msg_id, *args)
385 self.save_unmet(msg_id, *args)
386 else:
386 else:
387 self.save_unmet(msg_id, *args)
387 self.save_unmet(msg_id, *args)
388
388
389 def audit_timeouts(self):
389 def audit_timeouts(self):
390 """Audit all waiting tasks for expired timeouts."""
390 """Audit all waiting tasks for expired timeouts."""
391 now = datetime.now()
391 now = datetime.now()
392 for msg_id in self.depending.keys():
392 for msg_id in self.depending.keys():
393 # must recheck, in case one failure cascaded to another:
393 # must recheck, in case one failure cascaded to another:
394 if msg_id in self.depending:
394 if msg_id in self.depending:
395 raw,after,targets,follow,timeout = self.depending[msg_id]
395 raw,after,targets,follow,timeout = self.depending[msg_id]
396 if timeout and timeout < now:
396 if timeout and timeout < now:
397 self.fail_unreachable(msg_id, error.TaskTimeout)
397 self.fail_unreachable(msg_id, error.TaskTimeout)
398
398
399 def fail_unreachable(self, msg_id, why=error.ImpossibleDependency):
399 def fail_unreachable(self, msg_id, why=error.ImpossibleDependency):
400 """a task has become unreachable, send a reply with an ImpossibleDependency
400 """a task has become unreachable, send a reply with an ImpossibleDependency
401 error."""
401 error."""
402 if msg_id not in self.depending:
402 if msg_id not in self.depending:
403 self.log.error("msg %r already failed!", msg_id)
403 self.log.error("msg %r already failed!", msg_id)
404 return
404 return
405 raw_msg,targets,after,follow,timeout = self.depending.pop(msg_id)
405 raw_msg,targets,after,follow,timeout = self.depending.pop(msg_id)
406 for mid in follow.union(after):
406 for mid in follow.union(after):
407 if mid in self.graph:
407 if mid in self.graph:
408 self.graph[mid].remove(msg_id)
408 self.graph[mid].remove(msg_id)
409
409
410 # FIXME: unpacking a message I've already unpacked, but didn't save:
410 # FIXME: unpacking a message I've already unpacked, but didn't save:
411 idents,msg = self.session.feed_identities(raw_msg, copy=False)
411 idents,msg = self.session.feed_identities(raw_msg, copy=False)
412 header = self.session.unpack(msg[1].bytes)
412 header = self.session.unpack(msg[1].bytes)
413
413
414 try:
414 try:
415 raise why()
415 raise why()
416 except:
416 except:
417 content = error.wrap_exception()
417 content = error.wrap_exception()
418
418
419 self.all_done.add(msg_id)
419 self.all_done.add(msg_id)
420 self.all_failed.add(msg_id)
420 self.all_failed.add(msg_id)
421
421
422 msg = self.session.send(self.client_stream, 'apply_reply', content,
422 msg = self.session.send(self.client_stream, 'apply_reply', content,
423 parent=header, ident=idents)
423 parent=header, ident=idents)
424 self.session.send(self.mon_stream, msg, ident=[b'outtask']+idents)
424 self.session.send(self.mon_stream, msg, ident=[b'outtask']+idents)
425
425
426 self.update_graph(msg_id, success=False)
426 self.update_graph(msg_id, success=False)
427
427
428 def maybe_run(self, msg_id, raw_msg, targets, after, follow, timeout):
428 def maybe_run(self, msg_id, raw_msg, targets, after, follow, timeout):
429 """check location dependencies, and run if they are met."""
429 """check location dependencies, and run if they are met."""
430 blacklist = self.blacklist.setdefault(msg_id, set())
430 blacklist = self.blacklist.setdefault(msg_id, set())
431 if follow or targets or blacklist or self.hwm:
431 if follow or targets or blacklist or self.hwm:
432 # we need a can_run filter
432 # we need a can_run filter
433 def can_run(idx):
433 def can_run(idx):
434 # check hwm
434 # check hwm
435 if self.hwm and self.loads[idx] == self.hwm:
435 if self.hwm and self.loads[idx] == self.hwm:
436 return False
436 return False
437 target = self.targets[idx]
437 target = self.targets[idx]
438 # check blacklist
438 # check blacklist
439 if target in blacklist:
439 if target in blacklist:
440 return False
440 return False
441 # check targets
441 # check targets
442 if targets and target not in targets:
442 if targets and target not in targets:
443 return False
443 return False
444 # check follow
444 # check follow
445 return follow.check(self.completed[target], self.failed[target])
445 return follow.check(self.completed[target], self.failed[target])
446
446
447 indices = filter(can_run, range(len(self.targets)))
447 indices = filter(can_run, range(len(self.targets)))
448
448
449 if not indices:
449 if not indices:
450 # couldn't run
450 # couldn't run
451 if follow.all:
451 if follow.all:
452 # check follow for impossibility
452 # check follow for impossibility
453 dests = set()
453 dests = set()
454 relevant = set()
454 relevant = set()
455 if follow.success:
455 if follow.success:
456 relevant = self.all_completed
456 relevant = self.all_completed
457 if follow.failure:
457 if follow.failure:
458 relevant = relevant.union(self.all_failed)
458 relevant = relevant.union(self.all_failed)
459 for m in follow.intersection(relevant):
459 for m in follow.intersection(relevant):
460 dests.add(self.destinations[m])
460 dests.add(self.destinations[m])
461 if len(dests) > 1:
461 if len(dests) > 1:
462 self.depending[msg_id] = (raw_msg, targets, after, follow, timeout)
462 self.depending[msg_id] = (raw_msg, targets, after, follow, timeout)
463 self.fail_unreachable(msg_id)
463 self.fail_unreachable(msg_id)
464 return False
464 return False
465 if targets:
465 if targets:
466 # check blacklist+targets for impossibility
466 # check blacklist+targets for impossibility
467 targets.difference_update(blacklist)
467 targets.difference_update(blacklist)
468 if not targets or not targets.intersection(self.targets):
468 if not targets or not targets.intersection(self.targets):
469 self.depending[msg_id] = (raw_msg, targets, after, follow, timeout)
469 self.depending[msg_id] = (raw_msg, targets, after, follow, timeout)
470 self.fail_unreachable(msg_id)
470 self.fail_unreachable(msg_id)
471 return False
471 return False
472 return False
472 return False
473 else:
473 else:
474 indices = None
474 indices = None
475
475
476 self.submit_task(msg_id, raw_msg, targets, follow, timeout, indices)
476 self.submit_task(msg_id, raw_msg, targets, follow, timeout, indices)
477 return True
477 return True
478
478
479 def save_unmet(self, msg_id, raw_msg, targets, after, follow, timeout):
479 def save_unmet(self, msg_id, raw_msg, targets, after, follow, timeout):
480 """Save a message for later submission when its dependencies are met."""
480 """Save a message for later submission when its dependencies are met."""
481 self.depending[msg_id] = [raw_msg,targets,after,follow,timeout]
481 self.depending[msg_id] = [raw_msg,targets,after,follow,timeout]
482 # track the ids in follow or after, but not those already finished
482 # track the ids in follow or after, but not those already finished
483 for dep_id in after.union(follow).difference(self.all_done):
483 for dep_id in after.union(follow).difference(self.all_done):
484 if dep_id not in self.graph:
484 if dep_id not in self.graph:
485 self.graph[dep_id] = set()
485 self.graph[dep_id] = set()
486 self.graph[dep_id].add(msg_id)
486 self.graph[dep_id].add(msg_id)
487
487
488 def submit_task(self, msg_id, raw_msg, targets, follow, timeout, indices=None):
488 def submit_task(self, msg_id, raw_msg, targets, follow, timeout, indices=None):
489 """Submit a task to any of a subset of our targets."""
489 """Submit a task to any of a subset of our targets."""
490 if indices:
490 if indices:
491 loads = [self.loads[i] for i in indices]
491 loads = [self.loads[i] for i in indices]
492 else:
492 else:
493 loads = self.loads
493 loads = self.loads
494 idx = self.scheme(loads)
494 idx = self.scheme(loads)
495 if indices:
495 if indices:
496 idx = indices[idx]
496 idx = indices[idx]
497 target = self.targets[idx]
497 target = self.targets[idx]
498 # print (target, map(str, msg[:3]))
498 # print (target, map(str, msg[:3]))
499 # send job to the engine
499 # send job to the engine
500 self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False)
500 self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False)
501 self.engine_stream.send_multipart(raw_msg, copy=False)
501 self.engine_stream.send_multipart(raw_msg, copy=False)
502 # update load
502 # update load
503 self.add_job(idx)
503 self.add_job(idx)
504 self.pending[target][msg_id] = (raw_msg, targets, MET, follow, timeout)
504 self.pending[target][msg_id] = (raw_msg, targets, MET, follow, timeout)
505 # notify Hub
505 # notify Hub
506 content = dict(msg_id=msg_id, engine_id=target.decode())
506 content = dict(msg_id=msg_id, engine_id=target.decode('ascii'))
507 self.session.send(self.mon_stream, 'task_destination', content=content,
507 self.session.send(self.mon_stream, 'task_destination', content=content,
508 ident=[b'tracktask',self.ident])
508 ident=[b'tracktask',self.ident])
509
509
510
510
511 #-----------------------------------------------------------------------
511 #-----------------------------------------------------------------------
512 # Result Handling
512 # Result Handling
513 #-----------------------------------------------------------------------
513 #-----------------------------------------------------------------------
514 def dispatch_result(self, raw_msg):
514 def dispatch_result(self, raw_msg):
515 """dispatch method for result replies"""
515 """dispatch method for result replies"""
516 try:
516 try:
517 idents,msg = self.session.feed_identities(raw_msg, copy=False)
517 idents,msg = self.session.feed_identities(raw_msg, copy=False)
518 msg = self.session.unpack_message(msg, content=False, copy=False)
518 msg = self.session.unpack_message(msg, content=False, copy=False)
519 engine = idents[0]
519 engine = idents[0]
520 try:
520 try:
521 idx = self.targets.index(engine)
521 idx = self.targets.index(engine)
522 except ValueError:
522 except ValueError:
523 pass # skip load-update for dead engines
523 pass # skip load-update for dead engines
524 else:
524 else:
525 self.finish_job(idx)
525 self.finish_job(idx)
526 except Exception:
526 except Exception:
527 self.log.error("task::Invaid result: %r", raw_msg, exc_info=True)
527 self.log.error("task::Invaid result: %r", raw_msg, exc_info=True)
528 return
528 return
529
529
530 header = msg['header']
530 header = msg['header']
531 parent = msg['parent_header']
531 parent = msg['parent_header']
532 if header.get('dependencies_met', True):
532 if header.get('dependencies_met', True):
533 success = (header['status'] == 'ok')
533 success = (header['status'] == 'ok')
534 msg_id = parent['msg_id']
534 msg_id = parent['msg_id']
535 retries = self.retries[msg_id]
535 retries = self.retries[msg_id]
536 if not success and retries > 0:
536 if not success and retries > 0:
537 # failed
537 # failed
538 self.retries[msg_id] = retries - 1
538 self.retries[msg_id] = retries - 1
539 self.handle_unmet_dependency(idents, parent)
539 self.handle_unmet_dependency(idents, parent)
540 else:
540 else:
541 del self.retries[msg_id]
541 del self.retries[msg_id]
542 # relay to client and update graph
542 # relay to client and update graph
543 self.handle_result(idents, parent, raw_msg, success)
543 self.handle_result(idents, parent, raw_msg, success)
544 # send to Hub monitor
544 # send to Hub monitor
545 self.mon_stream.send_multipart([b'outtask']+raw_msg, copy=False)
545 self.mon_stream.send_multipart([b'outtask']+raw_msg, copy=False)
546 else:
546 else:
547 self.handle_unmet_dependency(idents, parent)
547 self.handle_unmet_dependency(idents, parent)
548
548
549 def handle_result(self, idents, parent, raw_msg, success=True):
549 def handle_result(self, idents, parent, raw_msg, success=True):
550 """handle a real task result, either success or failure"""
550 """handle a real task result, either success or failure"""
551 # first, relay result to client
551 # first, relay result to client
552 engine = idents[0]
552 engine = idents[0]
553 client = idents[1]
553 client = idents[1]
554 # swap_ids for XREP-XREP mirror
554 # swap_ids for XREP-XREP mirror
555 raw_msg[:2] = [client,engine]
555 raw_msg[:2] = [client,engine]
556 # print (map(str, raw_msg[:4]))
556 # print (map(str, raw_msg[:4]))
557 self.client_stream.send_multipart(raw_msg, copy=False)
557 self.client_stream.send_multipart(raw_msg, copy=False)
558 # now, update our data structures
558 # now, update our data structures
559 msg_id = parent['msg_id']
559 msg_id = parent['msg_id']
560 self.blacklist.pop(msg_id, None)
560 self.blacklist.pop(msg_id, None)
561 self.pending[engine].pop(msg_id)
561 self.pending[engine].pop(msg_id)
562 if success:
562 if success:
563 self.completed[engine].add(msg_id)
563 self.completed[engine].add(msg_id)
564 self.all_completed.add(msg_id)
564 self.all_completed.add(msg_id)
565 else:
565 else:
566 self.failed[engine].add(msg_id)
566 self.failed[engine].add(msg_id)
567 self.all_failed.add(msg_id)
567 self.all_failed.add(msg_id)
568 self.all_done.add(msg_id)
568 self.all_done.add(msg_id)
569 self.destinations[msg_id] = engine
569 self.destinations[msg_id] = engine
570
570
571 self.update_graph(msg_id, success)
571 self.update_graph(msg_id, success)
572
572
573 def handle_unmet_dependency(self, idents, parent):
573 def handle_unmet_dependency(self, idents, parent):
574 """handle an unmet dependency"""
574 """handle an unmet dependency"""
575 engine = idents[0]
575 engine = idents[0]
576 msg_id = parent['msg_id']
576 msg_id = parent['msg_id']
577
577
578 if msg_id not in self.blacklist:
578 if msg_id not in self.blacklist:
579 self.blacklist[msg_id] = set()
579 self.blacklist[msg_id] = set()
580 self.blacklist[msg_id].add(engine)
580 self.blacklist[msg_id].add(engine)
581
581
582 args = self.pending[engine].pop(msg_id)
582 args = self.pending[engine].pop(msg_id)
583 raw,targets,after,follow,timeout = args
583 raw,targets,after,follow,timeout = args
584
584
585 if self.blacklist[msg_id] == targets:
585 if self.blacklist[msg_id] == targets:
586 self.depending[msg_id] = args
586 self.depending[msg_id] = args
587 self.fail_unreachable(msg_id)
587 self.fail_unreachable(msg_id)
588 elif not self.maybe_run(msg_id, *args):
588 elif not self.maybe_run(msg_id, *args):
589 # resubmit failed
589 # resubmit failed
590 if msg_id not in self.all_failed:
590 if msg_id not in self.all_failed:
591 # put it back in our dependency tree
591 # put it back in our dependency tree
592 self.save_unmet(msg_id, *args)
592 self.save_unmet(msg_id, *args)
593
593
594 if self.hwm:
594 if self.hwm:
595 try:
595 try:
596 idx = self.targets.index(engine)
596 idx = self.targets.index(engine)
597 except ValueError:
597 except ValueError:
598 pass # skip load-update for dead engines
598 pass # skip load-update for dead engines
599 else:
599 else:
600 if self.loads[idx] == self.hwm-1:
600 if self.loads[idx] == self.hwm-1:
601 self.update_graph(None)
601 self.update_graph(None)
602
602
603
603
604
604
605 def update_graph(self, dep_id=None, success=True):
605 def update_graph(self, dep_id=None, success=True):
606 """dep_id just finished. Update our dependency
606 """dep_id just finished. Update our dependency
607 graph and submit any jobs that just became runable.
607 graph and submit any jobs that just became runable.
608
608
609 Called with dep_id=None to update entire graph for hwm, but without finishing
609 Called with dep_id=None to update entire graph for hwm, but without finishing
610 a task.
610 a task.
611 """
611 """
612 # print ("\n\n***********")
612 # print ("\n\n***********")
613 # pprint (dep_id)
613 # pprint (dep_id)
614 # pprint (self.graph)
614 # pprint (self.graph)
615 # pprint (self.depending)
615 # pprint (self.depending)
616 # pprint (self.all_completed)
616 # pprint (self.all_completed)
617 # pprint (self.all_failed)
617 # pprint (self.all_failed)
618 # print ("\n\n***********\n\n")
618 # print ("\n\n***********\n\n")
619 # update any jobs that depended on the dependency
619 # update any jobs that depended on the dependency
620 jobs = self.graph.pop(dep_id, [])
620 jobs = self.graph.pop(dep_id, [])
621
621
622 # recheck *all* jobs if
622 # recheck *all* jobs if
623 # a) we have HWM and an engine just become no longer full
623 # a) we have HWM and an engine just become no longer full
624 # or b) dep_id was given as None
624 # or b) dep_id was given as None
625 if dep_id is None or self.hwm and any( [ load==self.hwm-1 for load in self.loads ]):
625 if dep_id is None or self.hwm and any( [ load==self.hwm-1 for load in self.loads ]):
626 jobs = self.depending.keys()
626 jobs = self.depending.keys()
627
627
628 for msg_id in jobs:
628 for msg_id in jobs:
629 raw_msg, targets, after, follow, timeout = self.depending[msg_id]
629 raw_msg, targets, after, follow, timeout = self.depending[msg_id]
630
630
631 if after.unreachable(self.all_completed, self.all_failed)\
631 if after.unreachable(self.all_completed, self.all_failed)\
632 or follow.unreachable(self.all_completed, self.all_failed):
632 or follow.unreachable(self.all_completed, self.all_failed):
633 self.fail_unreachable(msg_id)
633 self.fail_unreachable(msg_id)
634
634
635 elif after.check(self.all_completed, self.all_failed): # time deps met, maybe run
635 elif after.check(self.all_completed, self.all_failed): # time deps met, maybe run
636 if self.maybe_run(msg_id, raw_msg, targets, MET, follow, timeout):
636 if self.maybe_run(msg_id, raw_msg, targets, MET, follow, timeout):
637
637
638 self.depending.pop(msg_id)
638 self.depending.pop(msg_id)
639 for mid in follow.union(after):
639 for mid in follow.union(after):
640 if mid in self.graph:
640 if mid in self.graph:
641 self.graph[mid].remove(msg_id)
641 self.graph[mid].remove(msg_id)
642
642
643 #----------------------------------------------------------------------
643 #----------------------------------------------------------------------
644 # methods to be overridden by subclasses
644 # methods to be overridden by subclasses
645 #----------------------------------------------------------------------
645 #----------------------------------------------------------------------
646
646
647 def add_job(self, idx):
647 def add_job(self, idx):
648 """Called after self.targets[idx] just got the job with header.
648 """Called after self.targets[idx] just got the job with header.
649 Override with subclasses. The default ordering is simple LRU.
649 Override with subclasses. The default ordering is simple LRU.
650 The default loads are the number of outstanding jobs."""
650 The default loads are the number of outstanding jobs."""
651 self.loads[idx] += 1
651 self.loads[idx] += 1
652 for lis in (self.targets, self.loads):
652 for lis in (self.targets, self.loads):
653 lis.append(lis.pop(idx))
653 lis.append(lis.pop(idx))
654
654
655
655
656 def finish_job(self, idx):
656 def finish_job(self, idx):
657 """Called after self.targets[idx] just finished a job.
657 """Called after self.targets[idx] just finished a job.
658 Override with subclasses."""
658 Override with subclasses."""
659 self.loads[idx] -= 1
659 self.loads[idx] -= 1
660
660
661
661
662
662
663 def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, config=None,
663 def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, config=None,
664 logname='root', log_url=None, loglevel=logging.DEBUG,
664 logname='root', log_url=None, loglevel=logging.DEBUG,
665 identity=b'task', in_thread=False):
665 identity=b'task', in_thread=False):
666
666
667 ZMQStream = zmqstream.ZMQStream
667 ZMQStream = zmqstream.ZMQStream
668
668
669 if config:
669 if config:
670 # unwrap dict back into Config
670 # unwrap dict back into Config
671 config = Config(config)
671 config = Config(config)
672
672
673 if in_thread:
673 if in_thread:
674 # use instance() to get the same Context/Loop as our parent
674 # use instance() to get the same Context/Loop as our parent
675 ctx = zmq.Context.instance()
675 ctx = zmq.Context.instance()
676 loop = ioloop.IOLoop.instance()
676 loop = ioloop.IOLoop.instance()
677 else:
677 else:
678 # in a process, don't use instance()
678 # in a process, don't use instance()
679 # for safety with multiprocessing
679 # for safety with multiprocessing
680 ctx = zmq.Context()
680 ctx = zmq.Context()
681 loop = ioloop.IOLoop()
681 loop = ioloop.IOLoop()
682 ins = ZMQStream(ctx.socket(zmq.XREP),loop)
682 ins = ZMQStream(ctx.socket(zmq.XREP),loop)
683 ins.setsockopt(zmq.IDENTITY, identity)
683 ins.setsockopt(zmq.IDENTITY, identity)
684 ins.bind(in_addr)
684 ins.bind(in_addr)
685
685
686 outs = ZMQStream(ctx.socket(zmq.XREP),loop)
686 outs = ZMQStream(ctx.socket(zmq.XREP),loop)
687 outs.setsockopt(zmq.IDENTITY, identity)
687 outs.setsockopt(zmq.IDENTITY, identity)
688 outs.bind(out_addr)
688 outs.bind(out_addr)
689 mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB),loop)
689 mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB),loop)
690 mons.connect(mon_addr)
690 mons.connect(mon_addr)
691 nots = zmqstream.ZMQStream(ctx.socket(zmq.SUB),loop)
691 nots = zmqstream.ZMQStream(ctx.socket(zmq.SUB),loop)
692 nots.setsockopt(zmq.SUBSCRIBE, b'')
692 nots.setsockopt(zmq.SUBSCRIBE, b'')
693 nots.connect(not_addr)
693 nots.connect(not_addr)
694
694
695 # setup logging.
695 # setup logging.
696 if in_thread:
696 if in_thread:
697 log = Application.instance().log
697 log = Application.instance().log
698 else:
698 else:
699 if log_url:
699 if log_url:
700 log = connect_logger(logname, ctx, log_url, root="scheduler", loglevel=loglevel)
700 log = connect_logger(logname, ctx, log_url, root="scheduler", loglevel=loglevel)
701 else:
701 else:
702 log = local_logger(logname, loglevel)
702 log = local_logger(logname, loglevel)
703
703
704 scheduler = TaskScheduler(client_stream=ins, engine_stream=outs,
704 scheduler = TaskScheduler(client_stream=ins, engine_stream=outs,
705 mon_stream=mons, notifier_stream=nots,
705 mon_stream=mons, notifier_stream=nots,
706 loop=loop, log=log,
706 loop=loop, log=log,
707 config=config)
707 config=config)
708 scheduler.start()
708 scheduler.start()
709 if not in_thread:
709 if not in_thread:
710 try:
710 try:
711 loop.start()
711 loop.start()
712 except KeyboardInterrupt:
712 except KeyboardInterrupt:
713 print ("interrupted, exiting...", file=sys.__stderr__)
713 print ("interrupted, exiting...", file=sys.__stderr__)
714
714
@@ -1,170 +1,174
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 """A simple engine that talks to a controller over 0MQ.
2 """A simple engine that talks to a controller over 0MQ.
3 it handles registration, etc. and launches a kernel
3 it handles registration, etc. and launches a kernel
4 connected to the Controller's Schedulers.
4 connected to the Controller's Schedulers.
5
5
6 Authors:
6 Authors:
7
7
8 * Min RK
8 * Min RK
9 """
9 """
10 #-----------------------------------------------------------------------------
10 #-----------------------------------------------------------------------------
11 # Copyright (C) 2010-2011 The IPython Development Team
11 # Copyright (C) 2010-2011 The IPython Development Team
12 #
12 #
13 # Distributed under the terms of the BSD License. The full license is in
13 # Distributed under the terms of the BSD License. The full license is in
14 # the file COPYING, distributed as part of this software.
14 # the file COPYING, distributed as part of this software.
15 #-----------------------------------------------------------------------------
15 #-----------------------------------------------------------------------------
16
16
17 from __future__ import print_function
17 from __future__ import print_function
18
18
19 import sys
19 import sys
20 import time
20 import time
21
21
22 import zmq
22 import zmq
23 from zmq.eventloop import ioloop, zmqstream
23 from zmq.eventloop import ioloop, zmqstream
24
24
25 # internal
25 # internal
26 from IPython.utils.traitlets import Instance, Dict, Int, Type, CFloat, Unicode
26 from IPython.utils.traitlets import Instance, Dict, Int, Type, CFloat, Unicode, CBytes
27 # from IPython.utils.localinterfaces import LOCALHOST
27 # from IPython.utils.localinterfaces import LOCALHOST
28
28
29 from IPython.parallel.controller.heartmonitor import Heart
29 from IPython.parallel.controller.heartmonitor import Heart
30 from IPython.parallel.factory import RegistrationFactory
30 from IPython.parallel.factory import RegistrationFactory
31 from IPython.parallel.util import disambiguate_url, ensure_bytes
31 from IPython.parallel.util import disambiguate_url, ensure_bytes
32
32
33 from IPython.zmq.session import Message
33 from IPython.zmq.session import Message
34
34
35 from .streamkernel import Kernel
35 from .streamkernel import Kernel
36
36
37 class EngineFactory(RegistrationFactory):
37 class EngineFactory(RegistrationFactory):
38 """IPython engine"""
38 """IPython engine"""
39
39
40 # configurables:
40 # configurables:
41 out_stream_factory=Type('IPython.zmq.iostream.OutStream', config=True,
41 out_stream_factory=Type('IPython.zmq.iostream.OutStream', config=True,
42 help="""The OutStream for handling stdout/err.
42 help="""The OutStream for handling stdout/err.
43 Typically 'IPython.zmq.iostream.OutStream'""")
43 Typically 'IPython.zmq.iostream.OutStream'""")
44 display_hook_factory=Type('IPython.zmq.displayhook.ZMQDisplayHook', config=True,
44 display_hook_factory=Type('IPython.zmq.displayhook.ZMQDisplayHook', config=True,
45 help="""The class for handling displayhook.
45 help="""The class for handling displayhook.
46 Typically 'IPython.zmq.displayhook.ZMQDisplayHook'""")
46 Typically 'IPython.zmq.displayhook.ZMQDisplayHook'""")
47 location=Unicode(config=True,
47 location=Unicode(config=True,
48 help="""The location (an IP address) of the controller. This is
48 help="""The location (an IP address) of the controller. This is
49 used for disambiguating URLs, to determine whether
49 used for disambiguating URLs, to determine whether
50 loopback should be used to connect or the public address.""")
50 loopback should be used to connect or the public address.""")
51 timeout=CFloat(2,config=True,
51 timeout=CFloat(2,config=True,
52 help="""The time (in seconds) to wait for the Controller to respond
52 help="""The time (in seconds) to wait for the Controller to respond
53 to registration requests before giving up.""")
53 to registration requests before giving up.""")
54
54
55 # not configurable:
55 # not configurable:
56 user_ns=Dict()
56 user_ns=Dict()
57 id=Int(allow_none=True)
57 id=Int(allow_none=True)
58 registrar=Instance('zmq.eventloop.zmqstream.ZMQStream')
58 registrar=Instance('zmq.eventloop.zmqstream.ZMQStream')
59 kernel=Instance(Kernel)
59 kernel=Instance(Kernel)
60
60
61 bident = CBytes()
62 ident = Unicode()
63 def _ident_changed(self, name, old, new):
64 self.bident = ensure_bytes(new)
65
61
66
62 def __init__(self, **kwargs):
67 def __init__(self, **kwargs):
63 super(EngineFactory, self).__init__(**kwargs)
68 super(EngineFactory, self).__init__(**kwargs)
64 self.ident = self.session.session
69 self.ident = self.session.session
65 ctx = self.context
70 ctx = self.context
66
71
67 reg = ctx.socket(zmq.XREQ)
72 reg = ctx.socket(zmq.XREQ)
68 reg.setsockopt(zmq.IDENTITY, ensure_bytes(self.ident))
73 reg.setsockopt(zmq.IDENTITY, self.bident)
69 reg.connect(self.url)
74 reg.connect(self.url)
70 self.registrar = zmqstream.ZMQStream(reg, self.loop)
75 self.registrar = zmqstream.ZMQStream(reg, self.loop)
71
76
72 def register(self):
77 def register(self):
73 """send the registration_request"""
78 """send the registration_request"""
74
79
75 self.log.info("Registering with controller at %s"%self.url)
80 self.log.info("Registering with controller at %s"%self.url)
76 content = dict(queue=self.ident, heartbeat=self.ident, control=self.ident)
81 content = dict(queue=self.ident, heartbeat=self.ident, control=self.ident)
77 self.registrar.on_recv(self.complete_registration)
82 self.registrar.on_recv(self.complete_registration)
78 # print (self.session.key)
83 # print (self.session.key)
79 self.session.send(self.registrar, "registration_request",content=content)
84 self.session.send(self.registrar, "registration_request",content=content)
80
85
81 def complete_registration(self, msg):
86 def complete_registration(self, msg):
82 # print msg
87 # print msg
83 self._abort_dc.stop()
88 self._abort_dc.stop()
84 ctx = self.context
89 ctx = self.context
85 loop = self.loop
90 loop = self.loop
86 identity = ensure_bytes(self.ident)
91 identity = self.bident
87
88 idents,msg = self.session.feed_identities(msg)
92 idents,msg = self.session.feed_identities(msg)
89 msg = Message(self.session.unpack_message(msg))
93 msg = Message(self.session.unpack_message(msg))
90
94
91 if msg.content.status == 'ok':
95 if msg.content.status == 'ok':
92 self.id = int(msg.content.id)
96 self.id = int(msg.content.id)
93
97
94 # create Shell Streams (MUX, Task, etc.):
98 # create Shell Streams (MUX, Task, etc.):
95 queue_addr = msg.content.mux
99 queue_addr = msg.content.mux
96 shell_addrs = [ str(queue_addr) ]
100 shell_addrs = [ str(queue_addr) ]
97 task_addr = msg.content.task
101 task_addr = msg.content.task
98 if task_addr:
102 if task_addr:
99 shell_addrs.append(str(task_addr))
103 shell_addrs.append(str(task_addr))
100
104
101 # Uncomment this to go back to two-socket model
105 # Uncomment this to go back to two-socket model
102 # shell_streams = []
106 # shell_streams = []
103 # for addr in shell_addrs:
107 # for addr in shell_addrs:
104 # stream = zmqstream.ZMQStream(ctx.socket(zmq.XREP), loop)
108 # stream = zmqstream.ZMQStream(ctx.socket(zmq.XREP), loop)
105 # stream.setsockopt(zmq.IDENTITY, identity)
109 # stream.setsockopt(zmq.IDENTITY, identity)
106 # stream.connect(disambiguate_url(addr, self.location))
110 # stream.connect(disambiguate_url(addr, self.location))
107 # shell_streams.append(stream)
111 # shell_streams.append(stream)
108
112
109 # Now use only one shell stream for mux and tasks
113 # Now use only one shell stream for mux and tasks
110 stream = zmqstream.ZMQStream(ctx.socket(zmq.XREP), loop)
114 stream = zmqstream.ZMQStream(ctx.socket(zmq.XREP), loop)
111 stream.setsockopt(zmq.IDENTITY, identity)
115 stream.setsockopt(zmq.IDENTITY, identity)
112 shell_streams = [stream]
116 shell_streams = [stream]
113 for addr in shell_addrs:
117 for addr in shell_addrs:
114 stream.connect(disambiguate_url(addr, self.location))
118 stream.connect(disambiguate_url(addr, self.location))
115 # end single stream-socket
119 # end single stream-socket
116
120
117 # control stream:
121 # control stream:
118 control_addr = str(msg.content.control)
122 control_addr = str(msg.content.control)
119 control_stream = zmqstream.ZMQStream(ctx.socket(zmq.XREP), loop)
123 control_stream = zmqstream.ZMQStream(ctx.socket(zmq.XREP), loop)
120 control_stream.setsockopt(zmq.IDENTITY, identity)
124 control_stream.setsockopt(zmq.IDENTITY, identity)
121 control_stream.connect(disambiguate_url(control_addr, self.location))
125 control_stream.connect(disambiguate_url(control_addr, self.location))
122
126
123 # create iopub stream:
127 # create iopub stream:
124 iopub_addr = msg.content.iopub
128 iopub_addr = msg.content.iopub
125 iopub_stream = zmqstream.ZMQStream(ctx.socket(zmq.PUB), loop)
129 iopub_stream = zmqstream.ZMQStream(ctx.socket(zmq.PUB), loop)
126 iopub_stream.setsockopt(zmq.IDENTITY, identity)
130 iopub_stream.setsockopt(zmq.IDENTITY, identity)
127 iopub_stream.connect(disambiguate_url(iopub_addr, self.location))
131 iopub_stream.connect(disambiguate_url(iopub_addr, self.location))
128
132
129 # launch heartbeat
133 # launch heartbeat
130 hb_addrs = msg.content.heartbeat
134 hb_addrs = msg.content.heartbeat
131 # print (hb_addrs)
135 # print (hb_addrs)
132
136
133 # # Redirect input streams and set a display hook.
137 # # Redirect input streams and set a display hook.
134 if self.out_stream_factory:
138 if self.out_stream_factory:
135 sys.stdout = self.out_stream_factory(self.session, iopub_stream, u'stdout')
139 sys.stdout = self.out_stream_factory(self.session, iopub_stream, u'stdout')
136 sys.stdout.topic = 'engine.%i.stdout'%self.id
140 sys.stdout.topic = 'engine.%i.stdout'%self.id
137 sys.stderr = self.out_stream_factory(self.session, iopub_stream, u'stderr')
141 sys.stderr = self.out_stream_factory(self.session, iopub_stream, u'stderr')
138 sys.stderr.topic = 'engine.%i.stderr'%self.id
142 sys.stderr.topic = 'engine.%i.stderr'%self.id
139 if self.display_hook_factory:
143 if self.display_hook_factory:
140 sys.displayhook = self.display_hook_factory(self.session, iopub_stream)
144 sys.displayhook = self.display_hook_factory(self.session, iopub_stream)
141 sys.displayhook.topic = 'engine.%i.pyout'%self.id
145 sys.displayhook.topic = 'engine.%i.pyout'%self.id
142
146
143 self.kernel = Kernel(config=self.config, int_id=self.id, ident=self.ident, session=self.session,
147 self.kernel = Kernel(config=self.config, int_id=self.id, ident=self.ident, session=self.session,
144 control_stream=control_stream, shell_streams=shell_streams, iopub_stream=iopub_stream,
148 control_stream=control_stream, shell_streams=shell_streams, iopub_stream=iopub_stream,
145 loop=loop, user_ns = self.user_ns, log=self.log)
149 loop=loop, user_ns = self.user_ns, log=self.log)
146 self.kernel.start()
150 self.kernel.start()
147 hb_addrs = [ disambiguate_url(addr, self.location) for addr in hb_addrs ]
151 hb_addrs = [ disambiguate_url(addr, self.location) for addr in hb_addrs ]
148 heart = Heart(*map(str, hb_addrs), heart_id=identity)
152 heart = Heart(*map(str, hb_addrs), heart_id=identity)
149 heart.start()
153 heart.start()
150
154
151
155
152 else:
156 else:
153 self.log.fatal("Registration Failed: %s"%msg)
157 self.log.fatal("Registration Failed: %s"%msg)
154 raise Exception("Registration Failed: %s"%msg)
158 raise Exception("Registration Failed: %s"%msg)
155
159
156 self.log.info("Completed registration with id %i"%self.id)
160 self.log.info("Completed registration with id %i"%self.id)
157
161
158
162
159 def abort(self):
163 def abort(self):
160 self.log.fatal("Registration timed out after %.1f seconds"%self.timeout)
164 self.log.fatal("Registration timed out after %.1f seconds"%self.timeout)
161 self.session.send(self.registrar, "unregistration_request", content=dict(id=self.id))
165 self.session.send(self.registrar, "unregistration_request", content=dict(id=self.id))
162 time.sleep(1)
166 time.sleep(1)
163 sys.exit(255)
167 sys.exit(255)
164
168
165 def start(self):
169 def start(self):
166 dc = ioloop.DelayedCallback(self.register, 0, self.loop)
170 dc = ioloop.DelayedCallback(self.register, 0, self.loop)
167 dc.start()
171 dc.start()
168 self._abort_dc = ioloop.DelayedCallback(self.abort, self.timeout*1000, self.loop)
172 self._abort_dc = ioloop.DelayedCallback(self.abort, self.timeout*1000, self.loop)
169 self._abort_dc.start()
173 self._abort_dc.start()
170
174
@@ -1,456 +1,456
1 """some generic utilities for dealing with classes, urls, and serialization
1 """some generic utilities for dealing with classes, urls, and serialization
2
2
3 Authors:
3 Authors:
4
4
5 * Min RK
5 * Min RK
6 """
6 """
7 #-----------------------------------------------------------------------------
7 #-----------------------------------------------------------------------------
8 # Copyright (C) 2010-2011 The IPython Development Team
8 # Copyright (C) 2010-2011 The IPython Development Team
9 #
9 #
10 # Distributed under the terms of the BSD License. The full license is in
10 # Distributed under the terms of the BSD License. The full license is in
11 # the file COPYING, distributed as part of this software.
11 # the file COPYING, distributed as part of this software.
12 #-----------------------------------------------------------------------------
12 #-----------------------------------------------------------------------------
13
13
14 #-----------------------------------------------------------------------------
14 #-----------------------------------------------------------------------------
15 # Imports
15 # Imports
16 #-----------------------------------------------------------------------------
16 #-----------------------------------------------------------------------------
17
17
18 # Standard library imports.
18 # Standard library imports.
19 import logging
19 import logging
20 import os
20 import os
21 import re
21 import re
22 import stat
22 import stat
23 import socket
23 import socket
24 import sys
24 import sys
25 from signal import signal, SIGINT, SIGABRT, SIGTERM
25 from signal import signal, SIGINT, SIGABRT, SIGTERM
26 try:
26 try:
27 from signal import SIGKILL
27 from signal import SIGKILL
28 except ImportError:
28 except ImportError:
29 SIGKILL=None
29 SIGKILL=None
30
30
31 try:
31 try:
32 import cPickle
32 import cPickle
33 pickle = cPickle
33 pickle = cPickle
34 except:
34 except:
35 cPickle = None
35 cPickle = None
36 import pickle
36 import pickle
37
37
38 # System library imports
38 # System library imports
39 import zmq
39 import zmq
40 from zmq.log import handlers
40 from zmq.log import handlers
41
41
42 # IPython imports
42 # IPython imports
43 from IPython.utils.pickleutil import can, uncan, canSequence, uncanSequence
43 from IPython.utils.pickleutil import can, uncan, canSequence, uncanSequence
44 from IPython.utils.newserialized import serialize, unserialize
44 from IPython.utils.newserialized import serialize, unserialize
45 from IPython.zmq.log import EnginePUBHandler
45 from IPython.zmq.log import EnginePUBHandler
46
46
47 #-----------------------------------------------------------------------------
47 #-----------------------------------------------------------------------------
48 # Classes
48 # Classes
49 #-----------------------------------------------------------------------------
49 #-----------------------------------------------------------------------------
50
50
51 class Namespace(dict):
51 class Namespace(dict):
52 """Subclass of dict for attribute access to keys."""
52 """Subclass of dict for attribute access to keys."""
53
53
54 def __getattr__(self, key):
54 def __getattr__(self, key):
55 """getattr aliased to getitem"""
55 """getattr aliased to getitem"""
56 if key in self.iterkeys():
56 if key in self.iterkeys():
57 return self[key]
57 return self[key]
58 else:
58 else:
59 raise NameError(key)
59 raise NameError(key)
60
60
61 def __setattr__(self, key, value):
61 def __setattr__(self, key, value):
62 """setattr aliased to setitem, with strict"""
62 """setattr aliased to setitem, with strict"""
63 if hasattr(dict, key):
63 if hasattr(dict, key):
64 raise KeyError("Cannot override dict keys %r"%key)
64 raise KeyError("Cannot override dict keys %r"%key)
65 self[key] = value
65 self[key] = value
66
66
67
67
68 class ReverseDict(dict):
68 class ReverseDict(dict):
69 """simple double-keyed subset of dict methods."""
69 """simple double-keyed subset of dict methods."""
70
70
71 def __init__(self, *args, **kwargs):
71 def __init__(self, *args, **kwargs):
72 dict.__init__(self, *args, **kwargs)
72 dict.__init__(self, *args, **kwargs)
73 self._reverse = dict()
73 self._reverse = dict()
74 for key, value in self.iteritems():
74 for key, value in self.iteritems():
75 self._reverse[value] = key
75 self._reverse[value] = key
76
76
77 def __getitem__(self, key):
77 def __getitem__(self, key):
78 try:
78 try:
79 return dict.__getitem__(self, key)
79 return dict.__getitem__(self, key)
80 except KeyError:
80 except KeyError:
81 return self._reverse[key]
81 return self._reverse[key]
82
82
83 def __setitem__(self, key, value):
83 def __setitem__(self, key, value):
84 if key in self._reverse:
84 if key in self._reverse:
85 raise KeyError("Can't have key %r on both sides!"%key)
85 raise KeyError("Can't have key %r on both sides!"%key)
86 dict.__setitem__(self, key, value)
86 dict.__setitem__(self, key, value)
87 self._reverse[value] = key
87 self._reverse[value] = key
88
88
89 def pop(self, key):
89 def pop(self, key):
90 value = dict.pop(self, key)
90 value = dict.pop(self, key)
91 self._reverse.pop(value)
91 self._reverse.pop(value)
92 return value
92 return value
93
93
94 def get(self, key, default=None):
94 def get(self, key, default=None):
95 try:
95 try:
96 return self[key]
96 return self[key]
97 except KeyError:
97 except KeyError:
98 return default
98 return default
99
99
100 #-----------------------------------------------------------------------------
100 #-----------------------------------------------------------------------------
101 # Functions
101 # Functions
102 #-----------------------------------------------------------------------------
102 #-----------------------------------------------------------------------------
103
103
104 def ensure_bytes(s):
104 def ensure_bytes(s):
105 """ensure that an object is bytes"""
105 """ensure that an object is ascii bytes"""
106 if isinstance(s, unicode):
106 if isinstance(s, unicode):
107 s = s.encode(sys.getdefaultencoding(), 'replace')
107 s = s.encode('ascii')
108 return s
108 return s
109
109
110 def validate_url(url):
110 def validate_url(url):
111 """validate a url for zeromq"""
111 """validate a url for zeromq"""
112 if not isinstance(url, basestring):
112 if not isinstance(url, basestring):
113 raise TypeError("url must be a string, not %r"%type(url))
113 raise TypeError("url must be a string, not %r"%type(url))
114 url = url.lower()
114 url = url.lower()
115
115
116 proto_addr = url.split('://')
116 proto_addr = url.split('://')
117 assert len(proto_addr) == 2, 'Invalid url: %r'%url
117 assert len(proto_addr) == 2, 'Invalid url: %r'%url
118 proto, addr = proto_addr
118 proto, addr = proto_addr
119 assert proto in ['tcp','pgm','epgm','ipc','inproc'], "Invalid protocol: %r"%proto
119 assert proto in ['tcp','pgm','epgm','ipc','inproc'], "Invalid protocol: %r"%proto
120
120
121 # domain pattern adapted from http://www.regexlib.com/REDetails.aspx?regexp_id=391
121 # domain pattern adapted from http://www.regexlib.com/REDetails.aspx?regexp_id=391
122 # author: Remi Sabourin
122 # author: Remi Sabourin
123 pat = re.compile(r'^([\w\d]([\w\d\-]{0,61}[\w\d])?\.)*[\w\d]([\w\d\-]{0,61}[\w\d])?$')
123 pat = re.compile(r'^([\w\d]([\w\d\-]{0,61}[\w\d])?\.)*[\w\d]([\w\d\-]{0,61}[\w\d])?$')
124
124
125 if proto == 'tcp':
125 if proto == 'tcp':
126 lis = addr.split(':')
126 lis = addr.split(':')
127 assert len(lis) == 2, 'Invalid url: %r'%url
127 assert len(lis) == 2, 'Invalid url: %r'%url
128 addr,s_port = lis
128 addr,s_port = lis
129 try:
129 try:
130 port = int(s_port)
130 port = int(s_port)
131 except ValueError:
131 except ValueError:
132 raise AssertionError("Invalid port %r in url: %r"%(port, url))
132 raise AssertionError("Invalid port %r in url: %r"%(port, url))
133
133
134 assert addr == '*' or pat.match(addr) is not None, 'Invalid url: %r'%url
134 assert addr == '*' or pat.match(addr) is not None, 'Invalid url: %r'%url
135
135
136 else:
136 else:
137 # only validate tcp urls currently
137 # only validate tcp urls currently
138 pass
138 pass
139
139
140 return True
140 return True
141
141
142
142
143 def validate_url_container(container):
143 def validate_url_container(container):
144 """validate a potentially nested collection of urls."""
144 """validate a potentially nested collection of urls."""
145 if isinstance(container, basestring):
145 if isinstance(container, basestring):
146 url = container
146 url = container
147 return validate_url(url)
147 return validate_url(url)
148 elif isinstance(container, dict):
148 elif isinstance(container, dict):
149 container = container.itervalues()
149 container = container.itervalues()
150
150
151 for element in container:
151 for element in container:
152 validate_url_container(element)
152 validate_url_container(element)
153
153
154
154
155 def split_url(url):
155 def split_url(url):
156 """split a zmq url (tcp://ip:port) into ('tcp','ip','port')."""
156 """split a zmq url (tcp://ip:port) into ('tcp','ip','port')."""
157 proto_addr = url.split('://')
157 proto_addr = url.split('://')
158 assert len(proto_addr) == 2, 'Invalid url: %r'%url
158 assert len(proto_addr) == 2, 'Invalid url: %r'%url
159 proto, addr = proto_addr
159 proto, addr = proto_addr
160 lis = addr.split(':')
160 lis = addr.split(':')
161 assert len(lis) == 2, 'Invalid url: %r'%url
161 assert len(lis) == 2, 'Invalid url: %r'%url
162 addr,s_port = lis
162 addr,s_port = lis
163 return proto,addr,s_port
163 return proto,addr,s_port
164
164
165 def disambiguate_ip_address(ip, location=None):
165 def disambiguate_ip_address(ip, location=None):
166 """turn multi-ip interfaces '0.0.0.0' and '*' into connectable
166 """turn multi-ip interfaces '0.0.0.0' and '*' into connectable
167 ones, based on the location (default interpretation of location is localhost)."""
167 ones, based on the location (default interpretation of location is localhost)."""
168 if ip in ('0.0.0.0', '*'):
168 if ip in ('0.0.0.0', '*'):
169 external_ips = socket.gethostbyname_ex(socket.gethostname())[2]
169 external_ips = socket.gethostbyname_ex(socket.gethostname())[2]
170 if location is None or location in external_ips:
170 if location is None or location in external_ips:
171 ip='127.0.0.1'
171 ip='127.0.0.1'
172 elif location:
172 elif location:
173 return location
173 return location
174 return ip
174 return ip
175
175
176 def disambiguate_url(url, location=None):
176 def disambiguate_url(url, location=None):
177 """turn multi-ip interfaces '0.0.0.0' and '*' into connectable
177 """turn multi-ip interfaces '0.0.0.0' and '*' into connectable
178 ones, based on the location (default interpretation is localhost).
178 ones, based on the location (default interpretation is localhost).
179
179
180 This is for zeromq urls, such as tcp://*:10101."""
180 This is for zeromq urls, such as tcp://*:10101."""
181 try:
181 try:
182 proto,ip,port = split_url(url)
182 proto,ip,port = split_url(url)
183 except AssertionError:
183 except AssertionError:
184 # probably not tcp url; could be ipc, etc.
184 # probably not tcp url; could be ipc, etc.
185 return url
185 return url
186
186
187 ip = disambiguate_ip_address(ip,location)
187 ip = disambiguate_ip_address(ip,location)
188
188
189 return "%s://%s:%s"%(proto,ip,port)
189 return "%s://%s:%s"%(proto,ip,port)
190
190
191 def serialize_object(obj, threshold=64e-6):
191 def serialize_object(obj, threshold=64e-6):
192 """Serialize an object into a list of sendable buffers.
192 """Serialize an object into a list of sendable buffers.
193
193
194 Parameters
194 Parameters
195 ----------
195 ----------
196
196
197 obj : object
197 obj : object
198 The object to be serialized
198 The object to be serialized
199 threshold : float
199 threshold : float
200 The threshold for not double-pickling the content.
200 The threshold for not double-pickling the content.
201
201
202
202
203 Returns
203 Returns
204 -------
204 -------
205 ('pmd', [bufs]) :
205 ('pmd', [bufs]) :
206 where pmd is the pickled metadata wrapper,
206 where pmd is the pickled metadata wrapper,
207 bufs is a list of data buffers
207 bufs is a list of data buffers
208 """
208 """
209 databuffers = []
209 databuffers = []
210 if isinstance(obj, (list, tuple)):
210 if isinstance(obj, (list, tuple)):
211 clist = canSequence(obj)
211 clist = canSequence(obj)
212 slist = map(serialize, clist)
212 slist = map(serialize, clist)
213 for s in slist:
213 for s in slist:
214 if s.typeDescriptor in ('buffer', 'ndarray') or s.getDataSize() > threshold:
214 if s.typeDescriptor in ('buffer', 'ndarray') or s.getDataSize() > threshold:
215 databuffers.append(s.getData())
215 databuffers.append(s.getData())
216 s.data = None
216 s.data = None
217 return pickle.dumps(slist,-1), databuffers
217 return pickle.dumps(slist,-1), databuffers
218 elif isinstance(obj, dict):
218 elif isinstance(obj, dict):
219 sobj = {}
219 sobj = {}
220 for k in sorted(obj.iterkeys()):
220 for k in sorted(obj.iterkeys()):
221 s = serialize(can(obj[k]))
221 s = serialize(can(obj[k]))
222 if s.typeDescriptor in ('buffer', 'ndarray') or s.getDataSize() > threshold:
222 if s.typeDescriptor in ('buffer', 'ndarray') or s.getDataSize() > threshold:
223 databuffers.append(s.getData())
223 databuffers.append(s.getData())
224 s.data = None
224 s.data = None
225 sobj[k] = s
225 sobj[k] = s
226 return pickle.dumps(sobj,-1),databuffers
226 return pickle.dumps(sobj,-1),databuffers
227 else:
227 else:
228 s = serialize(can(obj))
228 s = serialize(can(obj))
229 if s.typeDescriptor in ('buffer', 'ndarray') or s.getDataSize() > threshold:
229 if s.typeDescriptor in ('buffer', 'ndarray') or s.getDataSize() > threshold:
230 databuffers.append(s.getData())
230 databuffers.append(s.getData())
231 s.data = None
231 s.data = None
232 return pickle.dumps(s,-1),databuffers
232 return pickle.dumps(s,-1),databuffers
233
233
234
234
235 def unserialize_object(bufs):
235 def unserialize_object(bufs):
236 """reconstruct an object serialized by serialize_object from data buffers."""
236 """reconstruct an object serialized by serialize_object from data buffers."""
237 bufs = list(bufs)
237 bufs = list(bufs)
238 sobj = pickle.loads(bufs.pop(0))
238 sobj = pickle.loads(bufs.pop(0))
239 if isinstance(sobj, (list, tuple)):
239 if isinstance(sobj, (list, tuple)):
240 for s in sobj:
240 for s in sobj:
241 if s.data is None:
241 if s.data is None:
242 s.data = bufs.pop(0)
242 s.data = bufs.pop(0)
243 return uncanSequence(map(unserialize, sobj)), bufs
243 return uncanSequence(map(unserialize, sobj)), bufs
244 elif isinstance(sobj, dict):
244 elif isinstance(sobj, dict):
245 newobj = {}
245 newobj = {}
246 for k in sorted(sobj.iterkeys()):
246 for k in sorted(sobj.iterkeys()):
247 s = sobj[k]
247 s = sobj[k]
248 if s.data is None:
248 if s.data is None:
249 s.data = bufs.pop(0)
249 s.data = bufs.pop(0)
250 newobj[k] = uncan(unserialize(s))
250 newobj[k] = uncan(unserialize(s))
251 return newobj, bufs
251 return newobj, bufs
252 else:
252 else:
253 if sobj.data is None:
253 if sobj.data is None:
254 sobj.data = bufs.pop(0)
254 sobj.data = bufs.pop(0)
255 return uncan(unserialize(sobj)), bufs
255 return uncan(unserialize(sobj)), bufs
256
256
257 def pack_apply_message(f, args, kwargs, threshold=64e-6):
257 def pack_apply_message(f, args, kwargs, threshold=64e-6):
258 """pack up a function, args, and kwargs to be sent over the wire
258 """pack up a function, args, and kwargs to be sent over the wire
259 as a series of buffers. Any object whose data is larger than `threshold`
259 as a series of buffers. Any object whose data is larger than `threshold`
260 will not have their data copied (currently only numpy arrays support zero-copy)"""
260 will not have their data copied (currently only numpy arrays support zero-copy)"""
261 msg = [pickle.dumps(can(f),-1)]
261 msg = [pickle.dumps(can(f),-1)]
262 databuffers = [] # for large objects
262 databuffers = [] # for large objects
263 sargs, bufs = serialize_object(args,threshold)
263 sargs, bufs = serialize_object(args,threshold)
264 msg.append(sargs)
264 msg.append(sargs)
265 databuffers.extend(bufs)
265 databuffers.extend(bufs)
266 skwargs, bufs = serialize_object(kwargs,threshold)
266 skwargs, bufs = serialize_object(kwargs,threshold)
267 msg.append(skwargs)
267 msg.append(skwargs)
268 databuffers.extend(bufs)
268 databuffers.extend(bufs)
269 msg.extend(databuffers)
269 msg.extend(databuffers)
270 return msg
270 return msg
271
271
272 def unpack_apply_message(bufs, g=None, copy=True):
272 def unpack_apply_message(bufs, g=None, copy=True):
273 """unpack f,args,kwargs from buffers packed by pack_apply_message()
273 """unpack f,args,kwargs from buffers packed by pack_apply_message()
274 Returns: original f,args,kwargs"""
274 Returns: original f,args,kwargs"""
275 bufs = list(bufs) # allow us to pop
275 bufs = list(bufs) # allow us to pop
276 assert len(bufs) >= 3, "not enough buffers!"
276 assert len(bufs) >= 3, "not enough buffers!"
277 if not copy:
277 if not copy:
278 for i in range(3):
278 for i in range(3):
279 bufs[i] = bufs[i].bytes
279 bufs[i] = bufs[i].bytes
280 cf = pickle.loads(bufs.pop(0))
280 cf = pickle.loads(bufs.pop(0))
281 sargs = list(pickle.loads(bufs.pop(0)))
281 sargs = list(pickle.loads(bufs.pop(0)))
282 skwargs = dict(pickle.loads(bufs.pop(0)))
282 skwargs = dict(pickle.loads(bufs.pop(0)))
283 # print sargs, skwargs
283 # print sargs, skwargs
284 f = uncan(cf, g)
284 f = uncan(cf, g)
285 for sa in sargs:
285 for sa in sargs:
286 if sa.data is None:
286 if sa.data is None:
287 m = bufs.pop(0)
287 m = bufs.pop(0)
288 if sa.getTypeDescriptor() in ('buffer', 'ndarray'):
288 if sa.getTypeDescriptor() in ('buffer', 'ndarray'):
289 # always use a buffer, until memoryviews get sorted out
289 # always use a buffer, until memoryviews get sorted out
290 sa.data = buffer(m)
290 sa.data = buffer(m)
291 # disable memoryview support
291 # disable memoryview support
292 # if copy:
292 # if copy:
293 # sa.data = buffer(m)
293 # sa.data = buffer(m)
294 # else:
294 # else:
295 # sa.data = m.buffer
295 # sa.data = m.buffer
296 else:
296 else:
297 if copy:
297 if copy:
298 sa.data = m
298 sa.data = m
299 else:
299 else:
300 sa.data = m.bytes
300 sa.data = m.bytes
301
301
302 args = uncanSequence(map(unserialize, sargs), g)
302 args = uncanSequence(map(unserialize, sargs), g)
303 kwargs = {}
303 kwargs = {}
304 for k in sorted(skwargs.iterkeys()):
304 for k in sorted(skwargs.iterkeys()):
305 sa = skwargs[k]
305 sa = skwargs[k]
306 if sa.data is None:
306 if sa.data is None:
307 m = bufs.pop(0)
307 m = bufs.pop(0)
308 if sa.getTypeDescriptor() in ('buffer', 'ndarray'):
308 if sa.getTypeDescriptor() in ('buffer', 'ndarray'):
309 # always use a buffer, until memoryviews get sorted out
309 # always use a buffer, until memoryviews get sorted out
310 sa.data = buffer(m)
310 sa.data = buffer(m)
311 # disable memoryview support
311 # disable memoryview support
312 # if copy:
312 # if copy:
313 # sa.data = buffer(m)
313 # sa.data = buffer(m)
314 # else:
314 # else:
315 # sa.data = m.buffer
315 # sa.data = m.buffer
316 else:
316 else:
317 if copy:
317 if copy:
318 sa.data = m
318 sa.data = m
319 else:
319 else:
320 sa.data = m.bytes
320 sa.data = m.bytes
321
321
322 kwargs[k] = uncan(unserialize(sa), g)
322 kwargs[k] = uncan(unserialize(sa), g)
323
323
324 return f,args,kwargs
324 return f,args,kwargs
325
325
326 #--------------------------------------------------------------------------
326 #--------------------------------------------------------------------------
327 # helpers for implementing old MEC API via view.apply
327 # helpers for implementing old MEC API via view.apply
328 #--------------------------------------------------------------------------
328 #--------------------------------------------------------------------------
329
329
330 def interactive(f):
330 def interactive(f):
331 """decorator for making functions appear as interactively defined.
331 """decorator for making functions appear as interactively defined.
332 This results in the function being linked to the user_ns as globals()
332 This results in the function being linked to the user_ns as globals()
333 instead of the module globals().
333 instead of the module globals().
334 """
334 """
335 f.__module__ = '__main__'
335 f.__module__ = '__main__'
336 return f
336 return f
337
337
338 @interactive
338 @interactive
339 def _push(ns):
339 def _push(ns):
340 """helper method for implementing `client.push` via `client.apply`"""
340 """helper method for implementing `client.push` via `client.apply`"""
341 globals().update(ns)
341 globals().update(ns)
342
342
343 @interactive
343 @interactive
344 def _pull(keys):
344 def _pull(keys):
345 """helper method for implementing `client.pull` via `client.apply`"""
345 """helper method for implementing `client.pull` via `client.apply`"""
346 user_ns = globals()
346 user_ns = globals()
347 if isinstance(keys, (list,tuple, set)):
347 if isinstance(keys, (list,tuple, set)):
348 for key in keys:
348 for key in keys:
349 if not user_ns.has_key(key):
349 if not user_ns.has_key(key):
350 raise NameError("name '%s' is not defined"%key)
350 raise NameError("name '%s' is not defined"%key)
351 return map(user_ns.get, keys)
351 return map(user_ns.get, keys)
352 else:
352 else:
353 if not user_ns.has_key(keys):
353 if not user_ns.has_key(keys):
354 raise NameError("name '%s' is not defined"%keys)
354 raise NameError("name '%s' is not defined"%keys)
355 return user_ns.get(keys)
355 return user_ns.get(keys)
356
356
357 @interactive
357 @interactive
358 def _execute(code):
358 def _execute(code):
359 """helper method for implementing `client.execute` via `client.apply`"""
359 """helper method for implementing `client.execute` via `client.apply`"""
360 exec code in globals()
360 exec code in globals()
361
361
362 #--------------------------------------------------------------------------
362 #--------------------------------------------------------------------------
363 # extra process management utilities
363 # extra process management utilities
364 #--------------------------------------------------------------------------
364 #--------------------------------------------------------------------------
365
365
366 _random_ports = set()
366 _random_ports = set()
367
367
368 def select_random_ports(n):
368 def select_random_ports(n):
369 """Selects and return n random ports that are available."""
369 """Selects and return n random ports that are available."""
370 ports = []
370 ports = []
371 for i in xrange(n):
371 for i in xrange(n):
372 sock = socket.socket()
372 sock = socket.socket()
373 sock.bind(('', 0))
373 sock.bind(('', 0))
374 while sock.getsockname()[1] in _random_ports:
374 while sock.getsockname()[1] in _random_ports:
375 sock.close()
375 sock.close()
376 sock = socket.socket()
376 sock = socket.socket()
377 sock.bind(('', 0))
377 sock.bind(('', 0))
378 ports.append(sock)
378 ports.append(sock)
379 for i, sock in enumerate(ports):
379 for i, sock in enumerate(ports):
380 port = sock.getsockname()[1]
380 port = sock.getsockname()[1]
381 sock.close()
381 sock.close()
382 ports[i] = port
382 ports[i] = port
383 _random_ports.add(port)
383 _random_ports.add(port)
384 return ports
384 return ports
385
385
386 def signal_children(children):
386 def signal_children(children):
387 """Relay interupt/term signals to children, for more solid process cleanup."""
387 """Relay interupt/term signals to children, for more solid process cleanup."""
388 def terminate_children(sig, frame):
388 def terminate_children(sig, frame):
389 logging.critical("Got signal %i, terminating children..."%sig)
389 logging.critical("Got signal %i, terminating children..."%sig)
390 for child in children:
390 for child in children:
391 child.terminate()
391 child.terminate()
392
392
393 sys.exit(sig != SIGINT)
393 sys.exit(sig != SIGINT)
394 # sys.exit(sig)
394 # sys.exit(sig)
395 for sig in (SIGINT, SIGABRT, SIGTERM):
395 for sig in (SIGINT, SIGABRT, SIGTERM):
396 signal(sig, terminate_children)
396 signal(sig, terminate_children)
397
397
398 def generate_exec_key(keyfile):
398 def generate_exec_key(keyfile):
399 import uuid
399 import uuid
400 newkey = str(uuid.uuid4())
400 newkey = str(uuid.uuid4())
401 with open(keyfile, 'w') as f:
401 with open(keyfile, 'w') as f:
402 # f.write('ipython-key ')
402 # f.write('ipython-key ')
403 f.write(newkey+'\n')
403 f.write(newkey+'\n')
404 # set user-only RW permissions (0600)
404 # set user-only RW permissions (0600)
405 # this will have no effect on Windows
405 # this will have no effect on Windows
406 os.chmod(keyfile, stat.S_IRUSR|stat.S_IWUSR)
406 os.chmod(keyfile, stat.S_IRUSR|stat.S_IWUSR)
407
407
408
408
409 def integer_loglevel(loglevel):
409 def integer_loglevel(loglevel):
410 try:
410 try:
411 loglevel = int(loglevel)
411 loglevel = int(loglevel)
412 except ValueError:
412 except ValueError:
413 if isinstance(loglevel, str):
413 if isinstance(loglevel, str):
414 loglevel = getattr(logging, loglevel)
414 loglevel = getattr(logging, loglevel)
415 return loglevel
415 return loglevel
416
416
417 def connect_logger(logname, context, iface, root="ip", loglevel=logging.DEBUG):
417 def connect_logger(logname, context, iface, root="ip", loglevel=logging.DEBUG):
418 logger = logging.getLogger(logname)
418 logger = logging.getLogger(logname)
419 if any([isinstance(h, handlers.PUBHandler) for h in logger.handlers]):
419 if any([isinstance(h, handlers.PUBHandler) for h in logger.handlers]):
420 # don't add a second PUBHandler
420 # don't add a second PUBHandler
421 return
421 return
422 loglevel = integer_loglevel(loglevel)
422 loglevel = integer_loglevel(loglevel)
423 lsock = context.socket(zmq.PUB)
423 lsock = context.socket(zmq.PUB)
424 lsock.connect(iface)
424 lsock.connect(iface)
425 handler = handlers.PUBHandler(lsock)
425 handler = handlers.PUBHandler(lsock)
426 handler.setLevel(loglevel)
426 handler.setLevel(loglevel)
427 handler.root_topic = root
427 handler.root_topic = root
428 logger.addHandler(handler)
428 logger.addHandler(handler)
429 logger.setLevel(loglevel)
429 logger.setLevel(loglevel)
430
430
431 def connect_engine_logger(context, iface, engine, loglevel=logging.DEBUG):
431 def connect_engine_logger(context, iface, engine, loglevel=logging.DEBUG):
432 logger = logging.getLogger()
432 logger = logging.getLogger()
433 if any([isinstance(h, handlers.PUBHandler) for h in logger.handlers]):
433 if any([isinstance(h, handlers.PUBHandler) for h in logger.handlers]):
434 # don't add a second PUBHandler
434 # don't add a second PUBHandler
435 return
435 return
436 loglevel = integer_loglevel(loglevel)
436 loglevel = integer_loglevel(loglevel)
437 lsock = context.socket(zmq.PUB)
437 lsock = context.socket(zmq.PUB)
438 lsock.connect(iface)
438 lsock.connect(iface)
439 handler = EnginePUBHandler(engine, lsock)
439 handler = EnginePUBHandler(engine, lsock)
440 handler.setLevel(loglevel)
440 handler.setLevel(loglevel)
441 logger.addHandler(handler)
441 logger.addHandler(handler)
442 logger.setLevel(loglevel)
442 logger.setLevel(loglevel)
443 return logger
443 return logger
444
444
445 def local_logger(logname, loglevel=logging.DEBUG):
445 def local_logger(logname, loglevel=logging.DEBUG):
446 loglevel = integer_loglevel(loglevel)
446 loglevel = integer_loglevel(loglevel)
447 logger = logging.getLogger(logname)
447 logger = logging.getLogger(logname)
448 if any([isinstance(h, logging.StreamHandler) for h in logger.handlers]):
448 if any([isinstance(h, logging.StreamHandler) for h in logger.handlers]):
449 # don't add a second StreamHandler
449 # don't add a second StreamHandler
450 return
450 return
451 handler = logging.StreamHandler()
451 handler = logging.StreamHandler()
452 handler.setLevel(loglevel)
452 handler.setLevel(loglevel)
453 logger.addHandler(handler)
453 logger.addHandler(handler)
454 logger.setLevel(loglevel)
454 logger.setLevel(loglevel)
455 return logger
455 return logger
456
456
General Comments 0
You need to be logged in to leave comments. Login now