enforce ascii identities in parallel code...
MinRK
@@ -1,173 +1,173 @@
1 1 #!/usr/bin/env python
2 2 """
3 3 A multi-heart Heartbeat system using PUB and XREP sockets. Pings are sent out on the PUB,
4 4 and hearts are tracked based on their XREQ identities.
5 5
6 6 Authors:
7 7
8 8 * Min RK
9 9 """
10 10 #-----------------------------------------------------------------------------
11 11 # Copyright (C) 2010-2011 The IPython Development Team
12 12 #
13 13 # Distributed under the terms of the BSD License. The full license is in
14 14 # the file COPYING, distributed as part of this software.
15 15 #-----------------------------------------------------------------------------
16 16
17 17 from __future__ import print_function
18 18 import time
19 19 import uuid
20 20
21 21 import zmq
22 22 from zmq.devices import ThreadDevice
23 23 from zmq.eventloop import ioloop, zmqstream
24 24
25 25 from IPython.config.configurable import LoggingConfigurable
26 26 from IPython.utils.traitlets import Set, Instance, CFloat
27 27
28 28 from IPython.parallel.util import ensure_bytes
29 29
30 30 class Heart(object):
31 31 """A basic heart object for responding to a HeartMonitor.
32 32 This is a simple wrapper with defaults for the most common
33 33 Device model for responding to heartbeats.
34 34
35 35 It simply builds a threadsafe zmq.FORWARDER Device, defaulting to using
36 36 SUB/XREQ for in/out.
37 37
38 38 You can specify the XREQ's IDENTITY via the optional heart_id argument."""
39 39 device=None
40 40 id=None
41 41 def __init__(self, in_addr, out_addr, in_type=zmq.SUB, out_type=zmq.XREQ, heart_id=None):
42 42 self.device = ThreadDevice(zmq.FORWARDER, in_type, out_type)
43 43 self.device.daemon=True
44 44 self.device.connect_in(in_addr)
45 45 self.device.connect_out(out_addr)
46 46 if in_type == zmq.SUB:
47 47 self.device.setsockopt_in(zmq.SUBSCRIBE, b"")
48 48 if heart_id is None:
49 heart_id = ensure_bytes(uuid.uuid4())
49 heart_id = uuid.uuid4().bytes
50 50 self.device.setsockopt_out(zmq.IDENTITY, heart_id)
51 51 self.id = heart_id
52 52
53 53 def start(self):
54 54 return self.device.start()
55 55
56 56 class HeartMonitor(LoggingConfigurable):
57 57 """A basic HeartMonitor class
58 58 pingstream: a PUB stream
59 59 pongstream: an XREP stream
60 60 period: the period of the heartbeat in milliseconds"""
61 61
62 62 period=CFloat(1000, config=True,
63 63 help='The frequency at which the Hub pings the engines for heartbeats '
64 64 ' (in ms) [default: 1000]',
65 65 )
66 66
67 67 pingstream=Instance('zmq.eventloop.zmqstream.ZMQStream')
68 68 pongstream=Instance('zmq.eventloop.zmqstream.ZMQStream')
69 69 loop = Instance('zmq.eventloop.ioloop.IOLoop')
70 70 def _loop_default(self):
71 71 return ioloop.IOLoop.instance()
72 72
73 73 # not settable:
74 74 hearts=Set()
75 75 responses=Set()
76 76 on_probation=Set()
77 77 last_ping=CFloat(0)
78 78 _new_handlers = Set()
79 79 _failure_handlers = Set()
80 80 lifetime = CFloat(0)
81 81 tic = CFloat(0)
82 82
83 83 def __init__(self, **kwargs):
84 84 super(HeartMonitor, self).__init__(**kwargs)
85 85
86 86 self.pongstream.on_recv(self.handle_pong)
87 87
88 88 def start(self):
89 89 self.caller = ioloop.PeriodicCallback(self.beat, self.period, self.loop)
90 90 self.caller.start()
91 91
92 92 def add_new_heart_handler(self, handler):
93 93 """add a new handler for new hearts"""
94 94 self.log.debug("heartbeat::new_heart_handler: %s"%handler)
95 95 self._new_handlers.add(handler)
96 96
97 97 def add_heart_failure_handler(self, handler):
98 98 """add a new handler for heart failure"""
99 99 self.log.debug("heartbeat::new heart failure handler: %s"%handler)
100 100 self._failure_handlers.add(handler)
101 101
102 102 def beat(self):
103 103 self.pongstream.flush()
104 104 self.last_ping = self.lifetime
105 105
106 106 toc = time.time()
107 107 self.lifetime += toc-self.tic
108 108 self.tic = toc
109 109 # self.log.debug("heartbeat::%s"%self.lifetime)
110 110 goodhearts = self.hearts.intersection(self.responses)
111 111 missed_beats = self.hearts.difference(goodhearts)
112 112 heartfailures = self.on_probation.intersection(missed_beats)
113 113 newhearts = self.responses.difference(goodhearts)
114 114 map(self.handle_new_heart, newhearts)
115 115 map(self.handle_heart_failure, heartfailures)
116 116 self.on_probation = missed_beats.intersection(self.hearts)
117 117 self.responses = set()
118 118 # print self.on_probation, self.hearts
119 119 # self.log.debug("heartbeat::beat %.3f, %i beating hearts"%(self.lifetime, len(self.hearts)))
120 120 self.pingstream.send(ensure_bytes(str(self.lifetime)))
121 121
122 122 def handle_new_heart(self, heart):
123 123 if self._new_handlers:
124 124 for handler in self._new_handlers:
125 125 handler(heart)
126 126 else:
127 127 self.log.info("heartbeat::yay, got new heart %s!"%heart)
128 128 self.hearts.add(heart)
129 129
130 130 def handle_heart_failure(self, heart):
131 131 if self._failure_handlers:
132 132 for handler in self._failure_handlers:
133 133 try:
134 134 handler(heart)
135 135 except Exception as e:
136 136 self.log.error("heartbeat::Bad Handler! %s"%handler, exc_info=True)
137 137 pass
138 138 else:
139 139 self.log.info("heartbeat::Heart %s failed :("%heart)
140 140 self.hearts.remove(heart)
141 141
142 142
143 143 def handle_pong(self, msg):
144 144 "a heart just beat"
145 145 current = ensure_bytes(str(self.lifetime))
146 146 last = ensure_bytes(str(self.last_ping))
147 147 if msg[1] == current:
148 148 delta = time.time()-self.tic
149 149 # self.log.debug("heartbeat::heart %r took %.2f ms to respond"%(msg[0], 1000*delta))
150 150 self.responses.add(msg[0])
151 151 elif msg[1] == last:
152 152 delta = time.time()-self.tic + (self.lifetime-self.last_ping)
153 153 self.log.warn("heartbeat::heart %r missed a beat, and took %.2f ms to respond"%(msg[0], 1000*delta))
154 154 self.responses.add(msg[0])
155 155 else:
156 156 self.log.warn("heartbeat::got bad heartbeat (possibly old?): %s (current=%.3f)"%
157 157 (msg[1],self.lifetime))
158 158
159 159
160 160 if __name__ == '__main__':
161 161 loop = ioloop.IOLoop.instance()
162 162 context = zmq.Context()
163 163 pub = context.socket(zmq.PUB)
164 164 pub.bind('tcp://127.0.0.1:5555')
165 165 xrep = context.socket(zmq.XREP)
166 166 xrep.bind('tcp://127.0.0.1:5556')
167 167
168 168 outstream = zmqstream.ZMQStream(pub, loop)
169 169 instream = zmqstream.ZMQStream(xrep, loop)
170 170
171 171 hb = HeartMonitor(loop, outstream, instream)
172 172
173 173 loop.start()
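The only functional change in this file is on line 49: the Heart's default identity now comes directly from uuid.uuid4().bytes instead of passing the UUID object through ensure_bytes. A minimal sketch of how such an identity is applied to a 0MQ socket (illustration only, not part of the diff; it uses nothing beyond the standard pyzmq API, and the address matches the XREP socket bound in the __main__ block above):

import uuid
import zmq

heart_id = uuid.uuid4().bytes            # 16 raw bytes, usable directly as a socket identity
ctx = zmq.Context.instance()
sock = ctx.socket(zmq.DEALER)            # DEALER is the newer name for XREQ
sock.setsockopt(zmq.IDENTITY, heart_id)  # the identity must be bytes, never unicode
sock.connect('tcp://127.0.0.1:5556')     # the XREP/ROUTER end sees heart_id as the routing prefix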
@@ -1,1291 +1,1291 @@
1 1 #!/usr/bin/env python
2 2 """The IPython Controller Hub with 0MQ
3 3 This is the master object that handles connections from engines and clients,
4 4 and monitors traffic through the various queues.
5 5
6 6 Authors:
7 7
8 8 * Min RK
9 9 """
10 10 #-----------------------------------------------------------------------------
11 11 # Copyright (C) 2010 The IPython Development Team
12 12 #
13 13 # Distributed under the terms of the BSD License. The full license is in
14 14 # the file COPYING, distributed as part of this software.
15 15 #-----------------------------------------------------------------------------
16 16
17 17 #-----------------------------------------------------------------------------
18 18 # Imports
19 19 #-----------------------------------------------------------------------------
20 20 from __future__ import print_function
21 21
22 22 import sys
23 23 import time
24 24 from datetime import datetime
25 25
26 26 import zmq
27 27 from zmq.eventloop import ioloop
28 28 from zmq.eventloop.zmqstream import ZMQStream
29 29
30 30 # internal:
31 31 from IPython.utils.importstring import import_item
32 32 from IPython.utils.traitlets import (
33 33 HasTraits, Instance, Int, Unicode, Dict, Set, Tuple, CBytes, DottedObjectName
34 34 )
35 35
36 36 from IPython.parallel import error, util
37 37 from IPython.parallel.factory import RegistrationFactory
38 38
39 39 from IPython.zmq.session import SessionFactory
40 40
41 41 from .heartmonitor import HeartMonitor
42 42
43 43 #-----------------------------------------------------------------------------
44 44 # Code
45 45 #-----------------------------------------------------------------------------
46 46
47 47 def _passer(*args, **kwargs):
48 48 return
49 49
50 50 def _printer(*args, **kwargs):
51 51 print (args)
52 52 print (kwargs)
53 53
54 54 def empty_record():
55 55 """Return an empty dict with all record keys."""
56 56 return {
57 57 'msg_id' : None,
58 58 'header' : None,
59 59 'content': None,
60 60 'buffers': None,
61 61 'submitted': None,
62 62 'client_uuid' : None,
63 63 'engine_uuid' : None,
64 64 'started': None,
65 65 'completed': None,
66 66 'resubmitted': None,
67 67 'result_header' : None,
68 68 'result_content' : None,
69 69 'result_buffers' : None,
70 70 'queue' : None,
71 71 'pyin' : None,
72 72 'pyout': None,
73 73 'pyerr': None,
74 74 'stdout': '',
75 75 'stderr': '',
76 76 }
77 77
78 78 def init_record(msg):
79 79 """Initialize a TaskRecord based on a request."""
80 80 header = msg['header']
81 81 return {
82 82 'msg_id' : header['msg_id'],
83 83 'header' : header,
84 84 'content': msg['content'],
85 85 'buffers': msg['buffers'],
86 86 'submitted': header['date'],
87 87 'client_uuid' : None,
88 88 'engine_uuid' : None,
89 89 'started': None,
90 90 'completed': None,
91 91 'resubmitted': None,
92 92 'result_header' : None,
93 93 'result_content' : None,
94 94 'result_buffers' : None,
95 95 'queue' : None,
96 96 'pyin' : None,
97 97 'pyout': None,
98 98 'pyerr': None,
99 99 'stdout': '',
100 100 'stderr': '',
101 101 }
102 102
103 103
104 104 class EngineConnector(HasTraits):
105 105 """A simple object for accessing the various zmq connections of an object.
106 106 Attributes are:
107 107 id (int): engine ID
108 108 uuid (str): uuid (unused?)
109 109 queue (str): identity of queue's XREQ socket
110 110 registration (str): identity of registration XREQ socket
111 111 heartbeat (str): identity of heartbeat XREQ socket
112 112 """
113 113 id=Int(0)
114 114 queue=CBytes()
115 115 control=CBytes()
116 116 registration=CBytes()
117 117 heartbeat=CBytes()
118 118 pending=Set()
119 119
120 120 class HubFactory(RegistrationFactory):
121 121 """The Configurable for setting up a Hub."""
122 122
123 123 # port-pairs for monitoredqueues:
124 124 hb = Tuple(Int,Int,config=True,
125 125 help="""XREQ/SUB Port pair for Engine heartbeats""")
126 126 def _hb_default(self):
127 127 return tuple(util.select_random_ports(2))
128 128
129 129 mux = Tuple(Int,Int,config=True,
130 130 help="""Engine/Client Port pair for MUX queue""")
131 131
132 132 def _mux_default(self):
133 133 return tuple(util.select_random_ports(2))
134 134
135 135 task = Tuple(Int,Int,config=True,
136 136 help="""Engine/Client Port pair for Task queue""")
137 137 def _task_default(self):
138 138 return tuple(util.select_random_ports(2))
139 139
140 140 control = Tuple(Int,Int,config=True,
141 141 help="""Engine/Client Port pair for Control queue""")
142 142
143 143 def _control_default(self):
144 144 return tuple(util.select_random_ports(2))
145 145
146 146 iopub = Tuple(Int,Int,config=True,
147 147 help="""Engine/Client Port pair for IOPub relay""")
148 148
149 149 def _iopub_default(self):
150 150 return tuple(util.select_random_ports(2))
151 151
152 152 # single ports:
153 153 mon_port = Int(config=True,
154 154 help="""Monitor (SUB) port for queue traffic""")
155 155
156 156 def _mon_port_default(self):
157 157 return util.select_random_ports(1)[0]
158 158
159 159 notifier_port = Int(config=True,
160 160 help="""PUB port for sending engine status notifications""")
161 161
162 162 def _notifier_port_default(self):
163 163 return util.select_random_ports(1)[0]
164 164
165 165 engine_ip = Unicode('127.0.0.1', config=True,
166 166 help="IP on which to listen for engine connections. [default: loopback]")
167 167 engine_transport = Unicode('tcp', config=True,
168 168 help="0MQ transport for engine connections. [default: tcp]")
169 169
170 170 client_ip = Unicode('127.0.0.1', config=True,
171 171 help="IP on which to listen for client connections. [default: loopback]")
172 172 client_transport = Unicode('tcp', config=True,
173 173 help="0MQ transport for client connections. [default : tcp]")
174 174
175 175 monitor_ip = Unicode('127.0.0.1', config=True,
176 176 help="IP on which to listen for monitor messages. [default: loopback]")
177 177 monitor_transport = Unicode('tcp', config=True,
178 178 help="0MQ transport for monitor messages. [default : tcp]")
179 179
180 180 monitor_url = Unicode('')
181 181
182 182 db_class = DottedObjectName('IPython.parallel.controller.dictdb.DictDB',
183 183 config=True, help="""The class to use for the DB backend""")
184 184
185 185 # not configurable
186 186 db = Instance('IPython.parallel.controller.dictdb.BaseDB')
187 187 heartmonitor = Instance('IPython.parallel.controller.heartmonitor.HeartMonitor')
188 188
189 189 def _ip_changed(self, name, old, new):
190 190 self.engine_ip = new
191 191 self.client_ip = new
192 192 self.monitor_ip = new
193 193 self._update_monitor_url()
194 194
195 195 def _update_monitor_url(self):
196 196 self.monitor_url = "%s://%s:%i"%(self.monitor_transport, self.monitor_ip, self.mon_port)
197 197
198 198 def _transport_changed(self, name, old, new):
199 199 self.engine_transport = new
200 200 self.client_transport = new
201 201 self.monitor_transport = new
202 202 self._update_monitor_url()
203 203
204 204 def __init__(self, **kwargs):
205 205 super(HubFactory, self).__init__(**kwargs)
206 206 self._update_monitor_url()
207 207
208 208
209 209 def construct(self):
210 210 self.init_hub()
211 211
212 212 def start(self):
213 213 self.heartmonitor.start()
214 214 self.log.info("Heartmonitor started")
215 215
216 216 def init_hub(self):
217 217 """construct"""
218 218 client_iface = "%s://%s:"%(self.client_transport, self.client_ip) + "%i"
219 219 engine_iface = "%s://%s:"%(self.engine_transport, self.engine_ip) + "%i"
220 220
221 221 ctx = self.context
222 222 loop = self.loop
223 223
224 224 # Registrar socket
225 225 q = ZMQStream(ctx.socket(zmq.XREP), loop)
226 226 q.bind(client_iface % self.regport)
227 227 self.log.info("Hub listening on %s for registration."%(client_iface%self.regport))
228 228 if self.client_ip != self.engine_ip:
229 229 q.bind(engine_iface % self.regport)
230 230 self.log.info("Hub listening on %s for registration."%(engine_iface%self.regport))
231 231
232 232 ### Engine connections ###
233 233
234 234 # heartbeat
235 235 hpub = ctx.socket(zmq.PUB)
236 236 hpub.bind(engine_iface % self.hb[0])
237 237 hrep = ctx.socket(zmq.XREP)
238 238 hrep.bind(engine_iface % self.hb[1])
239 239 self.heartmonitor = HeartMonitor(loop=loop, config=self.config, log=self.log,
240 240 pingstream=ZMQStream(hpub,loop),
241 241 pongstream=ZMQStream(hrep,loop)
242 242 )
243 243
244 244 ### Client connections ###
245 245 # Notifier socket
246 246 n = ZMQStream(ctx.socket(zmq.PUB), loop)
247 247 n.bind(client_iface%self.notifier_port)
248 248
249 249 ### build and launch the queues ###
250 250
251 251 # monitor socket
252 252 sub = ctx.socket(zmq.SUB)
253 253 sub.setsockopt(zmq.SUBSCRIBE, b"")
254 254 sub.bind(self.monitor_url)
255 255 sub.bind('inproc://monitor')
256 256 sub = ZMQStream(sub, loop)
257 257
258 258 # connect the db
259 259 self.log.info('Hub using DB backend: %r'%(self.db_class.split()[-1]))
260 260 # cdir = self.config.Global.cluster_dir
261 261 self.db = import_item(str(self.db_class))(session=self.session.session,
262 262 config=self.config, log=self.log)
263 263 time.sleep(.25)
264 264 try:
265 265 scheme = self.config.TaskScheduler.scheme_name
266 266 except AttributeError:
267 267 from .scheduler import TaskScheduler
268 268 scheme = TaskScheduler.scheme_name.get_default_value()
269 269 # build connection dicts
270 270 self.engine_info = {
271 271 'control' : engine_iface%self.control[1],
272 272 'mux': engine_iface%self.mux[1],
273 273 'heartbeat': (engine_iface%self.hb[0], engine_iface%self.hb[1]),
274 274 'task' : engine_iface%self.task[1],
275 275 'iopub' : engine_iface%self.iopub[1],
276 276 # 'monitor' : engine_iface%self.mon_port,
277 277 }
278 278
279 279 self.client_info = {
280 280 'control' : client_iface%self.control[0],
281 281 'mux': client_iface%self.mux[0],
282 282 'task' : (scheme, client_iface%self.task[0]),
283 283 'iopub' : client_iface%self.iopub[0],
284 284 'notification': client_iface%self.notifier_port
285 285 }
286 286 self.log.debug("Hub engine addrs: %s"%self.engine_info)
287 287 self.log.debug("Hub client addrs: %s"%self.client_info)
288 288
289 289 # resubmit stream
290 290 r = ZMQStream(ctx.socket(zmq.XREQ), loop)
291 291 url = util.disambiguate_url(self.client_info['task'][-1])
292 292 r.setsockopt(zmq.IDENTITY, util.ensure_bytes(self.session.session))
293 293 r.connect(url)
294 294
295 295 self.hub = Hub(loop=loop, session=self.session, monitor=sub, heartmonitor=self.heartmonitor,
296 296 query=q, notifier=n, resubmit=r, db=self.db,
297 297 engine_info=self.engine_info, client_info=self.client_info,
298 298 log=self.log)
299 299
300 300
301 301 class Hub(SessionFactory):
302 302 """The IPython Controller Hub with 0MQ connections
303 303
304 304 Parameters
305 305 ==========
306 306 loop: zmq IOLoop instance
307 307 session: Session object
308 308 <removed> context: zmq context for creating new connections (?)
309 309 queue: ZMQStream for monitoring the command queue (SUB)
310 310 query: ZMQStream for engine registration and client queries requests (XREP)
311 311 heartbeat: HeartMonitor object checking the pulse of the engines
312 312 notifier: ZMQStream for broadcasting engine registration changes (PUB)
313 313 db: connection to db for out of memory logging of commands
314 314 NotImplemented
315 315 engine_info: dict of zmq connection information for engines to connect
316 316 to the queues.
317 317 client_info: dict of zmq connection information for clients to connect
318 318 to the queues.
319 319 """
320 320 # internal data structures:
321 321 ids=Set() # engine IDs
322 322 keytable=Dict()
323 323 by_ident=Dict()
324 324 engines=Dict()
325 325 clients=Dict()
326 326 hearts=Dict()
327 327 pending=Set()
328 328 queues=Dict() # pending msg_ids keyed by engine_id
329 329 tasks=Dict() # pending msg_ids submitted as tasks, keyed by client_id
330 330 completed=Dict() # completed msg_ids keyed by engine_id
331 331 all_completed=Set() # set of all completed msg_ids
332 332 dead_engines=Set() # set of uuids of dead engines
333 333 unassigned=Set() # set of task msg_ids not yet assigned a destination
334 334 incoming_registrations=Dict()
335 335 registration_timeout=Int()
336 336 _idcounter=Int(0)
337 337
338 338 # objects from constructor:
339 339 query=Instance(ZMQStream)
340 340 monitor=Instance(ZMQStream)
341 341 notifier=Instance(ZMQStream)
342 342 resubmit=Instance(ZMQStream)
343 343 heartmonitor=Instance(HeartMonitor)
344 344 db=Instance(object)
345 345 client_info=Dict()
346 346 engine_info=Dict()
347 347
348 348
349 349 def __init__(self, **kwargs):
350 350 """
351 351 # universal:
352 352 loop: IOLoop for creating future connections
353 353 session: streamsession for sending serialized data
354 354 # engine:
355 355 queue: ZMQStream for monitoring queue messages
356 356 query: ZMQStream for engine+client registration and client requests
357 357 heartbeat: HeartMonitor object for tracking engines
358 358 # extra:
359 359 db: ZMQStream for db connection (NotImplemented)
360 360 engine_info: zmq address/protocol dict for engine connections
361 361 client_info: zmq address/protocol dict for client connections
362 362 """
363 363
364 364 super(Hub, self).__init__(**kwargs)
365 365 self.registration_timeout = max(5000, 2*self.heartmonitor.period)
366 366
367 367 # validate connection dicts:
368 368 for k,v in self.client_info.iteritems():
369 369 if k == 'task':
370 370 util.validate_url_container(v[1])
371 371 else:
372 372 util.validate_url_container(v)
373 373 # util.validate_url_container(self.client_info)
374 374 util.validate_url_container(self.engine_info)
375 375
376 376 # register our callbacks
377 377 self.query.on_recv(self.dispatch_query)
378 378 self.monitor.on_recv(self.dispatch_monitor_traffic)
379 379
380 380 self.heartmonitor.add_heart_failure_handler(self.handle_heart_failure)
381 381 self.heartmonitor.add_new_heart_handler(self.handle_new_heart)
382 382
383 383 self.monitor_handlers = {b'in' : self.save_queue_request,
384 384 b'out': self.save_queue_result,
385 385 b'intask': self.save_task_request,
386 386 b'outtask': self.save_task_result,
387 387 b'tracktask': self.save_task_destination,
388 388 b'incontrol': _passer,
389 389 b'outcontrol': _passer,
390 390 b'iopub': self.save_iopub_message,
391 391 }
392 392
393 393 self.query_handlers = {'queue_request': self.queue_status,
394 394 'result_request': self.get_results,
395 395 'history_request': self.get_history,
396 396 'db_request': self.db_query,
397 397 'purge_request': self.purge_results,
398 398 'load_request': self.check_load,
399 399 'resubmit_request': self.resubmit_task,
400 400 'shutdown_request': self.shutdown_request,
401 401 'registration_request' : self.register_engine,
402 402 'unregistration_request' : self.unregister_engine,
403 403 'connection_request': self.connection_request,
404 404 }
405 405
406 406 # ignore resubmit replies
407 407 self.resubmit.on_recv(lambda msg: None, copy=False)
408 408
409 409 self.log.info("hub::created hub")
410 410
411 411 @property
412 412 def _next_id(self):
413 413 """gemerate a new ID.
414 414
415 415 No longer reuse old ids, just count from 0."""
416 416 newid = self._idcounter
417 417 self._idcounter += 1
418 418 return newid
419 419 # newid = 0
420 420 # incoming = [id[0] for id in self.incoming_registrations.itervalues()]
421 421 # # print newid, self.ids, self.incoming_registrations
422 422 # while newid in self.ids or newid in incoming:
423 423 # newid += 1
424 424 # return newid
425 425
426 426 #-----------------------------------------------------------------------------
427 427 # message validation
428 428 #-----------------------------------------------------------------------------
429 429
430 430 def _validate_targets(self, targets):
431 431 """turn any valid targets argument into a list of integer ids"""
432 432 if targets is None:
433 433 # default to all
434 434 targets = self.ids
435 435
436 436 if isinstance(targets, (int,str,unicode)):
437 437 # only one target specified
438 438 targets = [targets]
439 439 _targets = []
440 440 for t in targets:
441 441 # map raw identities to ids
442 442 if isinstance(t, (str,unicode)):
443 443 t = self.by_ident.get(t, t)
444 444 _targets.append(t)
445 445 targets = _targets
446 446 bad_targets = [ t for t in targets if t not in self.ids ]
447 447 if bad_targets:
448 448 raise IndexError("No Such Engine: %r"%bad_targets)
449 449 if not targets:
450 450 raise IndexError("No Engines Registered")
451 451 return targets
452 452
453 453 #-----------------------------------------------------------------------------
454 454 # dispatch methods (1 per stream)
455 455 #-----------------------------------------------------------------------------
456 456
457 457
458 458 def dispatch_monitor_traffic(self, msg):
459 459 """all ME and Task queue messages come through here, as well as
460 460 IOPub traffic."""
461 461 self.log.debug("monitor traffic: %r"%msg[:2])
462 462 switch = msg[0]
463 463 try:
464 464 idents, msg = self.session.feed_identities(msg[1:])
465 465 except ValueError:
466 466 idents=[]
467 467 if not idents:
468 468 self.log.error("Bad Monitor Message: %r"%msg)
469 469 return
470 470 handler = self.monitor_handlers.get(switch, None)
471 471 if handler is not None:
472 472 handler(idents, msg)
473 473 else:
474 474 self.log.error("Invalid monitor topic: %r"%switch)
475 475
476 476
477 477 def dispatch_query(self, msg):
478 478 """Route registration requests and queries from clients."""
479 479 try:
480 480 idents, msg = self.session.feed_identities(msg)
481 481 except ValueError:
482 482 idents = []
483 483 if not idents:
484 484 self.log.error("Bad Query Message: %r"%msg)
485 485 return
486 486 client_id = idents[0]
487 487 try:
488 488 msg = self.session.unpack_message(msg, content=True)
489 489 except Exception:
490 490 content = error.wrap_exception()
491 491 self.log.error("Bad Query Message: %r"%msg, exc_info=True)
492 492 self.session.send(self.query, "hub_error", ident=client_id,
493 493 content=content)
494 494 return
495 495 # print client_id, header, parent, content
496 496 #switch on message type:
497 497 msg_type = msg['msg_type']
498 498 self.log.info("client::client %r requested %r"%(client_id, msg_type))
499 499 handler = self.query_handlers.get(msg_type, None)
500 500 try:
501 501 assert handler is not None, "Bad Message Type: %r"%msg_type
502 502 except:
503 503 content = error.wrap_exception()
504 504 self.log.error("Bad Message Type: %r"%msg_type, exc_info=True)
505 505 self.session.send(self.query, "hub_error", ident=client_id,
506 506 content=content)
507 507 return
508 508
509 509 else:
510 510 handler(idents, msg)
511 511
512 512 def dispatch_db(self, msg):
513 513 """"""
514 514 raise NotImplementedError
515 515
516 516 #---------------------------------------------------------------------------
517 517 # handler methods (1 per event)
518 518 #---------------------------------------------------------------------------
519 519
520 520 #----------------------- Heartbeat --------------------------------------
521 521
522 522 def handle_new_heart(self, heart):
523 523 """handler to attach to heartbeater.
524 524 Called when a new heart starts to beat.
525 525 Triggers completion of registration."""
526 526 self.log.debug("heartbeat::handle_new_heart(%r)"%heart)
527 527 if heart not in self.incoming_registrations:
528 528 self.log.info("heartbeat::ignoring new heart: %r"%heart)
529 529 else:
530 530 self.finish_registration(heart)
531 531
532 532
533 533 def handle_heart_failure(self, heart):
534 534 """handler to attach to heartbeater.
535 535 called when a previously registered heart fails to respond to beat request.
536 536 triggers unregistration"""
537 537 self.log.debug("heartbeat::handle_heart_failure(%r)"%heart)
538 538 eid = self.hearts.get(heart, None)
539 539 if eid is None:
540 540 self.log.info("heartbeat::ignoring heart failure %r"%heart)
541 541 else:
542 542 queue = self.engines[eid].queue
543 543 self.unregister_engine(heart, dict(content=dict(id=eid, queue=queue)))
544 544
545 545 #----------------------- MUX Queue Traffic ------------------------------
546 546
547 547 def save_queue_request(self, idents, msg):
548 548 if len(idents) < 2:
549 549 self.log.error("invalid identity prefix: %r"%idents)
550 550 return
551 551 queue_id, client_id = idents[:2]
552 552 try:
553 553 msg = self.session.unpack_message(msg)
554 554 except Exception:
555 555 self.log.error("queue::client %r sent invalid message to %r: %r"%(client_id, queue_id, msg), exc_info=True)
556 556 return
557 557
558 558 eid = self.by_ident.get(queue_id, None)
559 559 if eid is None:
560 560 self.log.error("queue::target %r not registered"%queue_id)
561 561 self.log.debug("queue:: valid are: %r"%(self.by_ident.keys()))
562 562 return
563 563 record = init_record(msg)
564 564 msg_id = record['msg_id']
565 565 # Unicode in records
566 record['engine_uuid'] = queue_id.decode('utf8', 'replace')
567 record['client_uuid'] = client_id.decode('utf8', 'replace')
566 record['engine_uuid'] = queue_id.decode('ascii')
567 record['client_uuid'] = client_id.decode('ascii')
568 568 record['queue'] = 'mux'
569 569
570 570 try:
571 571 # it's possible iopub arrived first:
572 572 existing = self.db.get_record(msg_id)
573 573 for key,evalue in existing.iteritems():
574 574 rvalue = record.get(key, None)
575 575 if evalue and rvalue and evalue != rvalue:
576 576 self.log.warn("conflicting initial state for record: %r:%r <%r> %r"%(msg_id, rvalue, key, evalue))
577 577 elif evalue and not rvalue:
578 578 record[key] = evalue
579 579 try:
580 580 self.db.update_record(msg_id, record)
581 581 except Exception:
582 582 self.log.error("DB Error updating record %r"%msg_id, exc_info=True)
583 583 except KeyError:
584 584 try:
585 585 self.db.add_record(msg_id, record)
586 586 except Exception:
587 587 self.log.error("DB Error adding record %r"%msg_id, exc_info=True)
588 588
589 589
590 590 self.pending.add(msg_id)
591 591 self.queues[eid].append(msg_id)
592 592
593 593 def save_queue_result(self, idents, msg):
594 594 if len(idents) < 2:
595 595 self.log.error("invalid identity prefix: %r"%idents)
596 596 return
597 597
598 598 client_id, queue_id = idents[:2]
599 599 try:
600 600 msg = self.session.unpack_message(msg)
601 601 except Exception:
602 602 self.log.error("queue::engine %r sent invalid message to %r: %r"%(
603 603 queue_id,client_id, msg), exc_info=True)
604 604 return
605 605
606 606 eid = self.by_ident.get(queue_id, None)
607 607 if eid is None:
608 608 self.log.error("queue::unknown engine %r is sending a reply: "%queue_id)
609 609 return
610 610
611 611 parent = msg['parent_header']
612 612 if not parent:
613 613 return
614 614 msg_id = parent['msg_id']
615 615 if msg_id in self.pending:
616 616 self.pending.remove(msg_id)
617 617 self.all_completed.add(msg_id)
618 618 self.queues[eid].remove(msg_id)
619 619 self.completed[eid].append(msg_id)
620 620 elif msg_id not in self.all_completed:
621 621 # it could be a result from a dead engine that died before delivering the
622 622 # result
623 623 self.log.warn("queue:: unknown msg finished %r"%msg_id)
624 624 return
625 625 # update record anyway, because the unregistration could have been premature
626 626 rheader = msg['header']
627 627 completed = rheader['date']
628 628 started = rheader.get('started', None)
629 629 result = {
630 630 'result_header' : rheader,
631 631 'result_content': msg['content'],
632 632 'started' : started,
633 633 'completed' : completed
634 634 }
635 635
636 636 result['result_buffers'] = msg['buffers']
637 637 try:
638 638 self.db.update_record(msg_id, result)
639 639 except Exception:
640 640 self.log.error("DB Error updating record %r"%msg_id, exc_info=True)
641 641
642 642
643 643 #--------------------- Task Queue Traffic ------------------------------
644 644
645 645 def save_task_request(self, idents, msg):
646 646 """Save the submission of a task."""
647 647 client_id = idents[0]
648 648
649 649 try:
650 650 msg = self.session.unpack_message(msg)
651 651 except Exception:
652 652 self.log.error("task::client %r sent invalid task message: %r"%(
653 653 client_id, msg), exc_info=True)
654 654 return
655 655 record = init_record(msg)
656 656
657 657 record['client_uuid'] = client_id
658 658 record['queue'] = 'task'
659 659 header = msg['header']
660 660 msg_id = header['msg_id']
661 661 self.pending.add(msg_id)
662 662 self.unassigned.add(msg_id)
663 663 try:
664 664 # it's possible iopub arrived first:
665 665 existing = self.db.get_record(msg_id)
666 666 if existing['resubmitted']:
667 667 for key in ('submitted', 'client_uuid', 'buffers'):
668 668 # don't clobber these keys on resubmit
669 669 # submitted and client_uuid should be different
670 670 # and buffers might be big, and shouldn't have changed
671 671 record.pop(key)
672 672 # still check content,header which should not change
673 673 # but are not as expensive to compare as buffers
674 674
675 675 for key,evalue in existing.iteritems():
676 676 if key.endswith('buffers'):
677 677 # don't compare buffers
678 678 continue
679 679 rvalue = record.get(key, None)
680 680 if evalue and rvalue and evalue != rvalue:
681 681 self.log.warn("conflicting initial state for record: %r:%r <%r> %r"%(msg_id, rvalue, key, evalue))
682 682 elif evalue and not rvalue:
683 683 record[key] = evalue
684 684 try:
685 685 self.db.update_record(msg_id, record)
686 686 except Exception:
687 687 self.log.error("DB Error updating record %r"%msg_id, exc_info=True)
688 688 except KeyError:
689 689 try:
690 690 self.db.add_record(msg_id, record)
691 691 except Exception:
692 692 self.log.error("DB Error adding record %r"%msg_id, exc_info=True)
693 693 except Exception:
694 694 self.log.error("DB Error saving task request %r"%msg_id, exc_info=True)
695 695
696 696 def save_task_result(self, idents, msg):
697 697 """save the result of a completed task."""
698 698 client_id = idents[0]
699 699 try:
700 700 msg = self.session.unpack_message(msg)
701 701 except Exception:
702 702 self.log.error("task::invalid task result message send to %r: %r"%(
703 703 client_id, msg), exc_info=True)
704 704 return
705 705
706 706 parent = msg['parent_header']
707 707 if not parent:
708 708 # print msg
709 709 self.log.warn("Task %r had no parent!"%msg)
710 710 return
711 711 msg_id = parent['msg_id']
712 712 if msg_id in self.unassigned:
713 713 self.unassigned.remove(msg_id)
714 714
715 715 header = msg['header']
716 716 engine_uuid = header.get('engine', None)
717 717 eid = self.by_ident.get(engine_uuid, None)
718 718
719 719 if msg_id in self.pending:
720 720 self.pending.remove(msg_id)
721 721 self.all_completed.add(msg_id)
722 722 if eid is not None:
723 723 self.completed[eid].append(msg_id)
724 724 if msg_id in self.tasks[eid]:
725 725 self.tasks[eid].remove(msg_id)
726 726 completed = header['date']
727 727 started = header.get('started', None)
728 728 result = {
729 729 'result_header' : header,
730 730 'result_content': msg['content'],
731 731 'started' : started,
732 732 'completed' : completed,
733 733 'engine_uuid': engine_uuid
734 734 }
735 735
736 736 result['result_buffers'] = msg['buffers']
737 737 try:
738 738 self.db.update_record(msg_id, result)
739 739 except Exception:
740 740 self.log.error("DB Error saving task request %r"%msg_id, exc_info=True)
741 741
742 742 else:
743 743 self.log.debug("task::unknown task %r finished"%msg_id)
744 744
745 745 def save_task_destination(self, idents, msg):
746 746 try:
747 747 msg = self.session.unpack_message(msg, content=True)
748 748 except Exception:
749 749 self.log.error("task::invalid task tracking message", exc_info=True)
750 750 return
751 751 content = msg['content']
752 752 # print (content)
753 753 msg_id = content['msg_id']
754 754 engine_uuid = content['engine_id']
755 755 eid = self.by_ident[util.ensure_bytes(engine_uuid)]
756 756
757 757 self.log.info("task::task %r arrived on %r"%(msg_id, eid))
758 758 if msg_id in self.unassigned:
759 759 self.unassigned.remove(msg_id)
760 760 # else:
761 761 # self.log.debug("task::task %r not listed as MIA?!"%(msg_id))
762 762
763 763 self.tasks[eid].append(msg_id)
764 764 # self.pending[msg_id][1].update(received=datetime.now(),engine=(eid,engine_uuid))
765 765 try:
766 766 self.db.update_record(msg_id, dict(engine_uuid=engine_uuid))
767 767 except Exception:
768 768 self.log.error("DB Error saving task destination %r"%msg_id, exc_info=True)
769 769
770 770
771 771 def mia_task_request(self, idents, msg):
772 772 raise NotImplementedError
773 773 client_id = idents[0]
774 774 # content = dict(mia=self.mia,status='ok')
775 775 # self.session.send('mia_reply', content=content, idents=client_id)
776 776
777 777
778 778 #--------------------- IOPub Traffic ------------------------------
779 779
780 780 def save_iopub_message(self, topics, msg):
781 781 """save an iopub message into the db"""
782 782 # print (topics)
783 783 try:
784 784 msg = self.session.unpack_message(msg, content=True)
785 785 except Exception:
786 786 self.log.error("iopub::invalid IOPub message", exc_info=True)
787 787 return
788 788
789 789 parent = msg['parent_header']
790 790 if not parent:
791 791 self.log.error("iopub::invalid IOPub message: %r"%msg)
792 792 return
793 793 msg_id = parent['msg_id']
794 794 msg_type = msg['msg_type']
795 795 content = msg['content']
796 796
797 797 # ensure msg_id is in db
798 798 try:
799 799 rec = self.db.get_record(msg_id)
800 800 except KeyError:
801 801 rec = empty_record()
802 802 rec['msg_id'] = msg_id
803 803 self.db.add_record(msg_id, rec)
804 804 # stream
805 805 d = {}
806 806 if msg_type == 'stream':
807 807 name = content['name']
808 808 s = rec[name] or ''
809 809 d[name] = s + content['data']
810 810
811 811 elif msg_type == 'pyerr':
812 812 d['pyerr'] = content
813 813 elif msg_type == 'pyin':
814 814 d['pyin'] = content['code']
815 815 else:
816 816 d[msg_type] = content.get('data', '')
817 817
818 818 try:
819 819 self.db.update_record(msg_id, d)
820 820 except Exception:
821 821 self.log.error("DB Error saving iopub message %r"%msg_id, exc_info=True)
822 822
823 823
824 824
825 825 #-------------------------------------------------------------------------
826 826 # Registration requests
827 827 #-------------------------------------------------------------------------
828 828
829 829 def connection_request(self, client_id, msg):
830 830 """Reply with connection addresses for clients."""
831 831 self.log.info("client::client %r connected"%client_id)
832 832 content = dict(status='ok')
833 833 content.update(self.client_info)
834 834 jsonable = {}
835 835 for k,v in self.keytable.iteritems():
836 836 if v not in self.dead_engines:
837 jsonable[str(k)] = v.decode()
837 jsonable[str(k)] = v.decode('ascii')
838 838 content['engines'] = jsonable
839 839 self.session.send(self.query, 'connection_reply', content, parent=msg, ident=client_id)
840 840
841 841 def register_engine(self, reg, msg):
842 842 """Register a new engine."""
843 843 content = msg['content']
844 844 try:
845 845 queue = util.ensure_bytes(content['queue'])
846 846 except KeyError:
847 847 self.log.error("registration::queue not specified", exc_info=True)
848 848 return
849 849 heart = content.get('heartbeat', None)
850 850 if heart:
851 851 heart = util.ensure_bytes(heart)
852 852 """register a new engine, and create the socket(s) necessary"""
853 853 eid = self._next_id
854 854 # print (eid, queue, reg, heart)
855 855
856 856 self.log.debug("registration::register_engine(%i, %r, %r, %r)"%(eid, queue, reg, heart))
857 857
858 858 content = dict(id=eid,status='ok')
859 859 content.update(self.engine_info)
860 860 # check if requesting available IDs:
861 861 if queue in self.by_ident:
862 862 try:
863 863 raise KeyError("queue_id %r in use"%queue)
864 864 except:
865 865 content = error.wrap_exception()
866 866 self.log.error("queue_id %r in use"%queue, exc_info=True)
867 867 elif heart in self.hearts: # need to check unique hearts?
868 868 try:
869 869 raise KeyError("heart_id %r in use"%heart)
870 870 except:
871 871 self.log.error("heart_id %r in use"%heart, exc_info=True)
872 872 content = error.wrap_exception()
873 873 else:
874 874 for h, pack in self.incoming_registrations.iteritems():
875 875 if heart == h:
876 876 try:
877 877 raise KeyError("heart_id %r in use"%heart)
878 878 except:
879 879 self.log.error("heart_id %r in use"%heart, exc_info=True)
880 880 content = error.wrap_exception()
881 881 break
882 882 elif queue == pack[1]:
883 883 try:
884 884 raise KeyError("queue_id %r in use"%queue)
885 885 except:
886 886 self.log.error("queue_id %r in use"%queue, exc_info=True)
887 887 content = error.wrap_exception()
888 888 break
889 889
890 890 msg = self.session.send(self.query, "registration_reply",
891 891 content=content,
892 892 ident=reg)
893 893
894 894 if content['status'] == 'ok':
895 895 if heart in self.heartmonitor.hearts:
896 896 # already beating
897 897 self.incoming_registrations[heart] = (eid,queue,reg[0],None)
898 898 self.finish_registration(heart)
899 899 else:
900 900 purge = lambda : self._purge_stalled_registration(heart)
901 901 dc = ioloop.DelayedCallback(purge, self.registration_timeout, self.loop)
902 902 dc.start()
903 903 self.incoming_registrations[heart] = (eid,queue,reg[0],dc)
904 904 else:
905 905 self.log.error("registration::registration %i failed: %r"%(eid, content['evalue']))
906 906 return eid
907 907
908 908 def unregister_engine(self, ident, msg):
909 909 """Unregister an engine that explicitly requested to leave."""
910 910 try:
911 911 eid = msg['content']['id']
912 912 except:
913 913 self.log.error("registration::bad engine id for unregistration: %r"%ident, exc_info=True)
914 914 return
915 915 self.log.info("registration::unregister_engine(%r)"%eid)
916 916 # print (eid)
917 917 uuid = self.keytable[eid]
918 918 content=dict(id=eid, queue=uuid.decode())
919 919 self.dead_engines.add(uuid)
920 920 # self.ids.remove(eid)
921 921 # uuid = self.keytable.pop(eid)
922 922 #
923 923 # ec = self.engines.pop(eid)
924 924 # self.hearts.pop(ec.heartbeat)
925 925 # self.by_ident.pop(ec.queue)
926 926 # self.completed.pop(eid)
927 927 handleit = lambda : self._handle_stranded_msgs(eid, uuid)
928 928 dc = ioloop.DelayedCallback(handleit, self.registration_timeout, self.loop)
929 929 dc.start()
930 930 ############## TODO: HANDLE IT ################
931 931
932 932 if self.notifier:
933 933 self.session.send(self.notifier, "unregistration_notification", content=content)
934 934
935 935 def _handle_stranded_msgs(self, eid, uuid):
936 936 """Handle messages known to be on an engine when the engine unregisters.
937 937
938 938 It is possible that this will fire prematurely - that is, an engine will
939 939 go down after completing a result, and the client will be notified
940 940 that the result failed and later receive the actual result.
941 941 """
942 942
943 943 outstanding = self.queues[eid]
944 944
945 945 for msg_id in outstanding:
946 946 self.pending.remove(msg_id)
947 947 self.all_completed.add(msg_id)
948 948 try:
949 949 raise error.EngineError("Engine %r died while running task %r"%(eid, msg_id))
950 950 except:
951 951 content = error.wrap_exception()
952 952 # build a fake header:
953 953 header = {}
954 954 header['engine'] = uuid
955 955 header['date'] = datetime.now()
956 956 rec = dict(result_content=content, result_header=header, result_buffers=[])
957 957 rec['completed'] = header['date']
958 958 rec['engine_uuid'] = uuid
959 959 try:
960 960 self.db.update_record(msg_id, rec)
961 961 except Exception:
962 962 self.log.error("DB Error handling stranded msg %r"%msg_id, exc_info=True)
963 963
964 964
965 965 def finish_registration(self, heart):
966 966 """Second half of engine registration, called after our HeartMonitor
967 967 has received a beat from the Engine's Heart."""
968 968 try:
969 969 (eid,queue,reg,purge) = self.incoming_registrations.pop(heart)
970 970 except KeyError:
971 971 self.log.error("registration::tried to finish nonexistant registration", exc_info=True)
972 972 return
973 973 self.log.info("registration::finished registering engine %i:%r"%(eid,queue))
974 974 if purge is not None:
975 975 purge.stop()
976 976 control = queue
977 977 self.ids.add(eid)
978 978 self.keytable[eid] = queue
979 979 self.engines[eid] = EngineConnector(id=eid, queue=queue, registration=reg,
980 980 control=control, heartbeat=heart)
981 981 self.by_ident[queue] = eid
982 982 self.queues[eid] = list()
983 983 self.tasks[eid] = list()
984 984 self.completed[eid] = list()
985 985 self.hearts[heart] = eid
986 986 content = dict(id=eid, queue=self.engines[eid].queue.decode())
987 987 if self.notifier:
988 988 self.session.send(self.notifier, "registration_notification", content=content)
989 989 self.log.info("engine::Engine Connected: %i"%eid)
990 990
991 991 def _purge_stalled_registration(self, heart):
992 992 if heart in self.incoming_registrations:
993 993 eid = self.incoming_registrations.pop(heart)[0]
994 994 self.log.info("registration::purging stalled registration: %i"%eid)
995 995 else:
996 996 pass
997 997
998 998 #-------------------------------------------------------------------------
999 999 # Client Requests
1000 1000 #-------------------------------------------------------------------------
1001 1001
1002 1002 def shutdown_request(self, client_id, msg):
1003 1003 """handle shutdown request."""
1004 1004 self.session.send(self.query, 'shutdown_reply', content={'status': 'ok'}, ident=client_id)
1005 1005 # also notify other clients of shutdown
1006 1006 self.session.send(self.notifier, 'shutdown_notice', content={'status': 'ok'})
1007 1007 dc = ioloop.DelayedCallback(lambda : self._shutdown(), 1000, self.loop)
1008 1008 dc.start()
1009 1009
1010 1010 def _shutdown(self):
1011 1011 self.log.info("hub::hub shutting down.")
1012 1012 time.sleep(0.1)
1013 1013 sys.exit(0)
1014 1014
1015 1015
1016 1016 def check_load(self, client_id, msg):
1017 1017 content = msg['content']
1018 1018 try:
1019 1019 targets = content['targets']
1020 1020 targets = self._validate_targets(targets)
1021 1021 except:
1022 1022 content = error.wrap_exception()
1023 1023 self.session.send(self.query, "hub_error",
1024 1024 content=content, ident=client_id)
1025 1025 return
1026 1026
1027 1027 content = dict(status='ok')
1028 1028 # loads = {}
1029 1029 for t in targets:
1030 1030 content[bytes(t)] = len(self.queues[t])+len(self.tasks[t])
1031 1031 self.session.send(self.query, "load_reply", content=content, ident=client_id)
1032 1032
1033 1033
1034 1034 def queue_status(self, client_id, msg):
1035 1035 """Return the Queue status of one or more targets.
1036 1036 if verbose: return the msg_ids
1037 1037 else: return len of each type.
1038 1038 keys: queue (pending MUX jobs)
1039 1039 tasks (pending Task jobs)
1040 1040 completed (finished jobs from both queues)"""
1041 1041 content = msg['content']
1042 1042 targets = content['targets']
1043 1043 try:
1044 1044 targets = self._validate_targets(targets)
1045 1045 except:
1046 1046 content = error.wrap_exception()
1047 1047 self.session.send(self.query, "hub_error",
1048 1048 content=content, ident=client_id)
1049 1049 return
1050 1050 verbose = content.get('verbose', False)
1051 1051 content = dict(status='ok')
1052 1052 for t in targets:
1053 1053 queue = self.queues[t]
1054 1054 completed = self.completed[t]
1055 1055 tasks = self.tasks[t]
1056 1056 if not verbose:
1057 1057 queue = len(queue)
1058 1058 completed = len(completed)
1059 1059 tasks = len(tasks)
1060 1060 content[str(t)] = {'queue': queue, 'completed': completed , 'tasks': tasks}
1061 1061 content['unassigned'] = list(self.unassigned) if verbose else len(self.unassigned)
1062 1062 # print (content)
1063 1063 self.session.send(self.query, "queue_reply", content=content, ident=client_id)
1064 1064
1065 1065 def purge_results(self, client_id, msg):
1066 1066 """Purge results from memory. This method is more valuable before we move
1067 1067 to a DB based message storage mechanism."""
1068 1068 content = msg['content']
1069 1069 self.log.info("Dropping records with %s", content)
1070 1070 msg_ids = content.get('msg_ids', [])
1071 1071 reply = dict(status='ok')
1072 1072 if msg_ids == 'all':
1073 1073 try:
1074 1074 self.db.drop_matching_records(dict(completed={'$ne':None}))
1075 1075 except Exception:
1076 1076 reply = error.wrap_exception()
1077 1077 else:
1078 1078 pending = filter(lambda m: m in self.pending, msg_ids)
1079 1079 if pending:
1080 1080 try:
1081 1081 raise IndexError("msg pending: %r"%pending[0])
1082 1082 except:
1083 1083 reply = error.wrap_exception()
1084 1084 else:
1085 1085 try:
1086 1086 self.db.drop_matching_records(dict(msg_id={'$in':msg_ids}))
1087 1087 except Exception:
1088 1088 reply = error.wrap_exception()
1089 1089
1090 1090 if reply['status'] == 'ok':
1091 1091 eids = content.get('engine_ids', [])
1092 1092 for eid in eids:
1093 1093 if eid not in self.engines:
1094 1094 try:
1095 1095 raise IndexError("No such engine: %i"%eid)
1096 1096 except:
1097 1097 reply = error.wrap_exception()
1098 1098 break
1099 1099 uid = self.engines[eid].queue
1100 1100 try:
1101 1101 self.db.drop_matching_records(dict(engine_uuid=uid, completed={'$ne':None}))
1102 1102 except Exception:
1103 1103 reply = error.wrap_exception()
1104 1104 break
1105 1105
1106 1106 self.session.send(self.query, 'purge_reply', content=reply, ident=client_id)
1107 1107
1108 1108 def resubmit_task(self, client_id, msg):
1109 1109 """Resubmit one or more tasks."""
1110 1110 def finish(reply):
1111 1111 self.session.send(self.query, 'resubmit_reply', content=reply, ident=client_id)
1112 1112
1113 1113 content = msg['content']
1114 1114 msg_ids = content['msg_ids']
1115 1115 reply = dict(status='ok')
1116 1116 try:
1117 1117 records = self.db.find_records({'msg_id' : {'$in' : msg_ids}}, keys=[
1118 1118 'header', 'content', 'buffers'])
1119 1119 except Exception:
1120 1120 self.log.error('db::db error finding tasks to resubmit', exc_info=True)
1121 1121 return finish(error.wrap_exception())
1122 1122
1123 1123 # validate msg_ids
1124 1124 found_ids = [ rec['msg_id'] for rec in records ]
1125 1125 invalid_ids = filter(lambda m: m in self.pending, found_ids)
1126 1126 if len(records) > len(msg_ids):
1127 1127 try:
1128 1128 raise RuntimeError("DB appears to be in an inconsistent state. "
1129 1129 "More matching records were found than should exist")
1130 1130 except Exception:
1131 1131 return finish(error.wrap_exception())
1132 1132 elif len(records) < len(msg_ids):
1133 1133 missing = [ m for m in msg_ids if m not in found_ids ]
1134 1134 try:
1135 1135 raise KeyError("No such msg(s): %r"%missing)
1136 1136 except KeyError:
1137 1137 return finish(error.wrap_exception())
1138 1138 elif invalid_ids:
1139 1139 msg_id = invalid_ids[0]
1140 1140 try:
1141 1141 raise ValueError("Task %r appears to be inflight"%(msg_id))
1142 1142 except Exception:
1143 1143 return finish(error.wrap_exception())
1144 1144
1145 1145 # clear the existing records
1146 1146 now = datetime.now()
1147 1147 rec = empty_record()
1148 1148 map(rec.pop, ['msg_id', 'header', 'content', 'buffers', 'submitted'])
1149 1149 rec['resubmitted'] = now
1150 1150 rec['queue'] = 'task'
1151 1151 rec['client_uuid'] = client_id[0]
1152 1152 try:
1153 1153 for msg_id in msg_ids:
1154 1154 self.all_completed.discard(msg_id)
1155 1155 self.db.update_record(msg_id, rec)
1156 1156 except Exception:
1157 1157 self.log.error('db::db error updating record', exc_info=True)
1158 1158 reply = error.wrap_exception()
1159 1159 else:
1160 1160 # send the messages
1161 1161 for rec in records:
1162 1162 header = rec['header']
1163 1163 # include resubmitted in header to prevent digest collision
1164 1164 header['resubmitted'] = now
1165 1165 msg = self.session.msg(header['msg_type'])
1166 1166 msg['content'] = rec['content']
1167 1167 msg['header'] = header
1168 1168 msg['msg_id'] = rec['msg_id']
1169 1169 self.session.send(self.resubmit, msg, buffers=rec['buffers'])
1170 1170
1171 1171 finish(dict(status='ok'))
1172 1172
1173 1173
1174 1174 def _extract_record(self, rec):
1175 1175 """decompose a TaskRecord dict into subsection of reply for get_result"""
1176 1176 io_dict = {}
1177 1177 for key in 'pyin pyout pyerr stdout stderr'.split():
1178 1178 io_dict[key] = rec[key]
1179 1179 content = { 'result_content': rec['result_content'],
1180 1180 'header': rec['header'],
1181 1181 'result_header' : rec['result_header'],
1182 1182 'io' : io_dict,
1183 1183 }
1184 1184 if rec['result_buffers']:
1185 1185 buffers = map(bytes, rec['result_buffers'])
1186 1186 else:
1187 1187 buffers = []
1188 1188
1189 1189 return content, buffers
1190 1190
1191 1191 def get_results(self, client_id, msg):
1192 1192 """Get the result of 1 or more messages."""
1193 1193 content = msg['content']
1194 1194 msg_ids = sorted(set(content['msg_ids']))
1195 1195 statusonly = content.get('status_only', False)
1196 1196 pending = []
1197 1197 completed = []
1198 1198 content = dict(status='ok')
1199 1199 content['pending'] = pending
1200 1200 content['completed'] = completed
1201 1201 buffers = []
1202 1202 if not statusonly:
1203 1203 try:
1204 1204 matches = self.db.find_records(dict(msg_id={'$in':msg_ids}))
1205 1205 # turn match list into dict, for faster lookup
1206 1206 records = {}
1207 1207 for rec in matches:
1208 1208 records[rec['msg_id']] = rec
1209 1209 except Exception:
1210 1210 content = error.wrap_exception()
1211 1211 self.session.send(self.query, "result_reply", content=content,
1212 1212 parent=msg, ident=client_id)
1213 1213 return
1214 1214 else:
1215 1215 records = {}
1216 1216 for msg_id in msg_ids:
1217 1217 if msg_id in self.pending:
1218 1218 pending.append(msg_id)
1219 1219 elif msg_id in self.all_completed:
1220 1220 completed.append(msg_id)
1221 1221 if not statusonly:
1222 1222 c,bufs = self._extract_record(records[msg_id])
1223 1223 content[msg_id] = c
1224 1224 buffers.extend(bufs)
1225 1225 elif msg_id in records:
1226 1226 if records[msg_id]['completed']:
1227 1227 completed.append(msg_id)
1228 1228 c,bufs = self._extract_record(records[msg_id])
1229 1229 content[msg_id] = c
1230 1230 buffers.extend(bufs)
1231 1231 else:
1232 1232 pending.append(msg_id)
1233 1233 else:
1234 1234 try:
1235 1235 raise KeyError('No such message: '+msg_id)
1236 1236 except:
1237 1237 content = error.wrap_exception()
1238 1238 break
1239 1239 self.session.send(self.query, "result_reply", content=content,
1240 1240 parent=msg, ident=client_id,
1241 1241 buffers=buffers)
1242 1242
1243 1243 def get_history(self, client_id, msg):
1244 1244 """Get a list of all msg_ids in our DB records"""
1245 1245 try:
1246 1246 msg_ids = self.db.get_history()
1247 1247 except Exception as e:
1248 1248 content = error.wrap_exception()
1249 1249 else:
1250 1250 content = dict(status='ok', history=msg_ids)
1251 1251
1252 1252 self.session.send(self.query, "history_reply", content=content,
1253 1253 parent=msg, ident=client_id)
1254 1254
1255 1255 def db_query(self, client_id, msg):
1256 1256 """Perform a raw query on the task record database."""
1257 1257 content = msg['content']
1258 1258 query = content.get('query', {})
1259 1259 keys = content.get('keys', None)
1260 1260 buffers = []
1261 1261 empty = list()
1262 1262 try:
1263 1263 records = self.db.find_records(query, keys)
1264 1264 except Exception as e:
1265 1265 content = error.wrap_exception()
1266 1266 else:
1267 1267 # extract buffers from reply content:
1268 1268 if keys is not None:
1269 1269 buffer_lens = [] if 'buffers' in keys else None
1270 1270 result_buffer_lens = [] if 'result_buffers' in keys else None
1271 1271 else:
1272 1272 buffer_lens = []
1273 1273 result_buffer_lens = []
1274 1274
1275 1275 for rec in records:
1276 1276 # buffers may be None, so double check
1277 1277 if buffer_lens is not None:
1278 1278 b = rec.pop('buffers', empty) or empty
1279 1279 buffer_lens.append(len(b))
1280 1280 buffers.extend(b)
1281 1281 if result_buffer_lens is not None:
1282 1282 rb = rec.pop('result_buffers', empty) or empty
1283 1283 result_buffer_lens.append(len(rb))
1284 1284 buffers.extend(rb)
1285 1285 content = dict(status='ok', records=records, buffer_lens=buffer_lens,
1286 1286 result_buffer_lens=result_buffer_lens)
1287 1287 # self.log.debug (content)
1288 1288 self.session.send(self.query, "db_reply", content=content,
1289 1289 parent=msg, ident=client_id,
1290 1290 buffers=buffers)
1291 1291
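The substantive changes in hub.py are on lines 566-567 and 837, where socket identities are now decoded with the strict 'ascii' codec instead of 'utf8' with 'replace' (or a bare decode()). A small illustration of the difference (not part of the diff; the identity value is hypothetical):

ident = b'engine-\xc3\xa9'               # hypothetical identity containing non-ascii bytes
lossy = ident.decode('utf8', 'replace')  # old behaviour: silently accepts a non-ascii identity
try:
    strict = ident.decode('ascii')       # new behaviour: fails loudly on anything non-ascii
except UnicodeDecodeError:
    strict = None                        # the Hub would log the error rather than guess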
@@ -1,714 +1,714 @@
1 1 """The Python scheduler for rich scheduling.
2 2
3 3 The Pure ZMQ scheduler does not allow routing schemes other than LRU,
4 4 nor does it check msg_id DAG dependencies. For those, a slightly slower
5 5 Python Scheduler exists.
6 6
7 7 Authors:
8 8
9 9 * Min RK
10 10 """
11 11 #-----------------------------------------------------------------------------
12 12 # Copyright (C) 2010-2011 The IPython Development Team
13 13 #
14 14 # Distributed under the terms of the BSD License. The full license is in
15 15 # the file COPYING, distributed as part of this software.
16 16 #-----------------------------------------------------------------------------
17 17
18 18 #----------------------------------------------------------------------
19 19 # Imports
20 20 #----------------------------------------------------------------------
21 21
22 22 from __future__ import print_function
23 23
24 24 import logging
25 25 import sys
26 26
27 27 from datetime import datetime, timedelta
28 28 from random import randint, random
29 29 from types import FunctionType
30 30
31 31 try:
32 32 import numpy
33 33 except ImportError:
34 34 numpy = None
35 35
36 36 import zmq
37 37 from zmq.eventloop import ioloop, zmqstream
38 38
39 39 # local imports
40 40 from IPython.external.decorator import decorator
41 41 from IPython.config.application import Application
42 42 from IPython.config.loader import Config
43 43 from IPython.utils.traitlets import Instance, Dict, List, Set, Int, Enum, CBytes
44 44
45 45 from IPython.parallel import error
46 46 from IPython.parallel.factory import SessionFactory
47 47 from IPython.parallel.util import connect_logger, local_logger, ensure_bytes
48 48
49 49 from .dependency import Dependency
50 50
51 51 @decorator
52 52 def logged(f,self,*args,**kwargs):
53 53 # print ("#--------------------")
54 54 self.log.debug("scheduler::%s(*%s,**%s)", f.func_name, args, kwargs)
55 55 # print ("#--")
56 56 return f(self,*args, **kwargs)
57 57
58 58 #----------------------------------------------------------------------
59 59 # Chooser functions
60 60 #----------------------------------------------------------------------
61 61
62 62 def plainrandom(loads):
63 63 """Plain random pick."""
64 64 n = len(loads)
65 65 return randint(0,n-1)
66 66
67 67 def lru(loads):
68 68 """Always pick the front of the line.
69 69
70 70 The content of `loads` is ignored.
71 71
72 72 Assumes LRU ordering of loads, with oldest first.
73 73 """
74 74 return 0
75 75
76 76 def twobin(loads):
77 77 """Pick two at random, use the LRU of the two.
78 78
79 79 The content of loads is ignored.
80 80
81 81 Assumes LRU ordering of loads, with oldest first.
82 82 """
83 83 n = len(loads)
84 84 a = randint(0,n-1)
85 85 b = randint(0,n-1)
86 86 return min(a,b)
87 87
88 88 def weighted(loads):
89 89 """Pick two at random using inverse load as weight.
90 90
91 91 Return the less loaded of the two.
92 92 """
93 93 # weight 0 a million times more than 1:
94 94 weights = 1./(1e-6+numpy.array(loads))
95 95 sums = weights.cumsum()
96 96 t = sums[-1]
97 97 x = random()*t
98 98 y = random()*t
99 99 idx = 0
100 100 idy = 0
101 101 while sums[idx] < x:
102 102 idx += 1
103 103 while sums[idy] < y:
104 104 idy += 1
105 105 if weights[idy] > weights[idx]:
106 106 return idy
107 107 else:
108 108 return idx
109 109
110 110 def leastload(loads):
111 111 """Always choose the lowest load.
112 112
113 113 If the lowest load occurs more than once, the first
114 114 occurrence will be used. If loads has LRU ordering, this means
115 115 the LRU of those with the lowest load is chosen.
116 116 """
117 117 return loads.index(min(loads))
118 118
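# A hedged illustration (not part of this module) of how the chooser functions
# above are interchangeable: each takes the list of engine loads and returns
# the index of the engine that should receive the next task.
#
#     loads = [3, 0, 2, 1]
#     leastload(loads)    # -> 1, the index of the minimum load
#     lru(loads)          # -> 0, always the front of the LRU-ordered list
#     twobin(loads)       # -> the lower (older) of two random indices
#     plainrandom(loads)  # -> any index in 0..3
#     weighted(loads)     # -> an index biased toward lower loads (requires numpy)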
119 119 #---------------------------------------------------------------------
120 120 # Classes
121 121 #---------------------------------------------------------------------
122 122 # store empty default dependency:
123 123 MET = Dependency([])
124 124
125 125 class TaskScheduler(SessionFactory):
126 126 """Python TaskScheduler object.
127 127
128 128 This is the simplest object that supports msg_id based
129 129 DAG dependencies. *Only* task msg_ids are checked, not
130 130 msg_ids of jobs submitted via the MUX queue.
131 131
132 132 """
133 133
134 134 hwm = Int(0, config=True, shortname='hwm',
135 135 help="""specify the High Water Mark (HWM) for the downstream
136 136 socket in the Task scheduler. This is the maximum number
137 137 of allowed outstanding tasks on each engine."""
138 138 )
139 139 scheme_name = Enum(('leastload', 'pure', 'lru', 'plainrandom', 'weighted', 'twobin'),
140 140 'leastload', config=True, shortname='scheme', allow_none=False,
141 141 help="""select the task scheduler scheme [default: 'leastload']
142 142 Options are: 'pure', 'lru', 'plainrandom', 'weighted', 'twobin','leastload'"""
143 143 )
144 144 def _scheme_name_changed(self, old, new):
145 145 self.log.debug("Using scheme %r"%new)
146 146 self.scheme = globals()[new]
147 147
148 148 # input arguments:
149 149 scheme = Instance(FunctionType) # function for determining the destination
150 150 def _scheme_default(self):
151 151 return leastload
152 152 client_stream = Instance(zmqstream.ZMQStream) # client-facing stream
153 153 engine_stream = Instance(zmqstream.ZMQStream) # engine-facing stream
154 154 notifier_stream = Instance(zmqstream.ZMQStream) # hub-facing sub stream
155 155 mon_stream = Instance(zmqstream.ZMQStream) # hub-facing pub stream
156 156
157 157 # internals:
158 158 graph = Dict() # dict by msg_id of [ msg_ids that depend on key ]
159 159 retries = Dict() # dict by msg_id of retries remaining (non-neg ints)
160 160 # waiting = List() # list of msg_ids ready to run, but haven't due to HWM
161 161 depending = Dict() # dict by msg_id of (msg_id, raw_msg, after, follow)
162 162 pending = Dict() # dict by engine_uuid of submitted tasks
163 163 completed = Dict() # dict by engine_uuid of completed tasks
164 164 failed = Dict() # dict by engine_uuid of failed tasks
165 165 destinations = Dict() # dict by msg_id of engine_uuids where jobs ran (reverse of completed+failed)
166 166 clients = Dict() # dict by msg_id for who submitted the task
167 167 targets = List() # list of target IDENTs
168 168 loads = List() # list of engine loads
169 169 # full = Set() # set of IDENTs that have HWM outstanding tasks
170 170 all_completed = Set() # set of all completed tasks
171 171 all_failed = Set() # set of all failed tasks
172 172 all_done = Set() # set of all finished tasks=union(completed,failed)
173 173 all_ids = Set() # set of all submitted task IDs
174 174 blacklist = Dict() # dict by msg_id of locations where a job has encountered UnmetDependency
175 175 auditor = Instance('zmq.eventloop.ioloop.PeriodicCallback')
176 176
177 177 ident = CBytes() # ZMQ identity. This should just be self.session.session
178 178 # but ensure Bytes
179 179 def _ident_default(self):
180 180 return ensure_bytes(self.session.session)
181 181
182 182 def start(self):
183 183 self.engine_stream.on_recv(self.dispatch_result, copy=False)
184 184 self._notification_handlers = dict(
185 185 registration_notification = self._register_engine,
186 186 unregistration_notification = self._unregister_engine
187 187 )
188 188 self.notifier_stream.on_recv(self.dispatch_notification)
189 189 self.auditor = ioloop.PeriodicCallback(self.audit_timeouts, 2e3, self.loop) # every 2 seconds
190 190 self.auditor.start()
191 191 self.log.info("Scheduler started [%s]"%self.scheme_name)
192 192
193 193 def resume_receiving(self):
194 194 """Resume accepting jobs."""
195 195 self.client_stream.on_recv(self.dispatch_submission, copy=False)
196 196
197 197 def stop_receiving(self):
198 198 """Stop accepting jobs while there are no engines.
199 199 Leave them in the ZMQ queue."""
200 200 self.client_stream.on_recv(None)
201 201
202 202 #-----------------------------------------------------------------------
203 203 # [Un]Registration Handling
204 204 #-----------------------------------------------------------------------
205 205
206 206 def dispatch_notification(self, msg):
207 207 """dispatch register/unregister events."""
208 208 try:
209 209 idents,msg = self.session.feed_identities(msg)
210 210 except ValueError:
211 211 self.log.warn("task::Invalid Message: %r",msg)
212 212 return
213 213 try:
214 214 msg = self.session.unpack_message(msg)
215 215 except ValueError:
216 216 self.log.warn("task::Unauthorized message from: %r"%idents)
217 217 return
218 218
219 219 msg_type = msg['msg_type']
220 220
221 221 handler = self._notification_handlers.get(msg_type, None)
222 222 if handler is None:
223 223 self.log.error("Unhandled message type: %r"%msg_type)
224 224 else:
225 225 try:
226 226 handler(ensure_bytes(msg['content']['queue']))
227 227 except Exception:
228 228 self.log.error("task::Invalid notification msg: %r",msg)
229 229
230 230 def _register_engine(self, uid):
231 231 """New engine with ident `uid` became available."""
232 232 # head of the line:
233 233 self.targets.insert(0,uid)
234 234 self.loads.insert(0,0)
235 235
236 236 # initialize sets
237 237 self.completed[uid] = set()
238 238 self.failed[uid] = set()
239 239 self.pending[uid] = {}
240 240 if len(self.targets) == 1:
241 241 self.resume_receiving()
242 242 # rescan the graph:
243 243 self.update_graph(None)
244 244
245 245 def _unregister_engine(self, uid):
246 246 """Existing engine with ident `uid` became unavailable."""
247 247 if len(self.targets) == 1:
248 248 # this was our only engine
249 249 self.stop_receiving()
250 250
251 251 # handle any potentially finished tasks:
252 252 self.engine_stream.flush()
253 253
254 254 # don't pop destinations, because they might be used later
255 255 # map(self.destinations.pop, self.completed.pop(uid))
256 256 # map(self.destinations.pop, self.failed.pop(uid))
257 257
258 258 # prevent this engine from receiving work
259 259 idx = self.targets.index(uid)
260 260 self.targets.pop(idx)
261 261 self.loads.pop(idx)
262 262
263 263 # wait 5 seconds before cleaning up pending jobs, since the results might
264 264 # still be incoming
265 265 if self.pending[uid]:
266 266 dc = ioloop.DelayedCallback(lambda : self.handle_stranded_tasks(uid), 5000, self.loop)
267 267 dc.start()
268 268 else:
269 269 self.completed.pop(uid)
270 270 self.failed.pop(uid)
271 271
272 272
273 273 def handle_stranded_tasks(self, engine):
274 274 """Deal with jobs resident in an engine that died."""
275 275 lost = self.pending[engine]
276 276 for msg_id in lost.keys():
277 277 if msg_id not in self.pending[engine]:
278 278 # prevent double-handling of messages
279 279 continue
280 280
281 281 raw_msg = lost[msg_id][0]
282 282 idents,msg = self.session.feed_identities(raw_msg, copy=False)
283 283 parent = self.session.unpack(msg[1].bytes)
284 284 idents = [engine, idents[0]]
285 285
286 286 # build fake error reply
287 287 try:
288 288 raise error.EngineError("Engine %r died while running task %r"%(engine, msg_id))
289 289 except:
290 290 content = error.wrap_exception()
291 291 msg = self.session.msg('apply_reply', content, parent=parent, subheader={'status':'error'})
292 292 raw_reply = map(zmq.Message, self.session.serialize(msg, ident=idents))
293 293 # and dispatch it
294 294 self.dispatch_result(raw_reply)
295 295
296 296 # finally scrub completed/failed lists
297 297 self.completed.pop(engine)
298 298 self.failed.pop(engine)
299 299
300 300
301 301 #-----------------------------------------------------------------------
302 302 # Job Submission
303 303 #-----------------------------------------------------------------------
304 304 def dispatch_submission(self, raw_msg):
305 305 """Dispatch job submission to appropriate handlers."""
306 306 # ensure targets up to date:
307 307 self.notifier_stream.flush()
308 308 try:
309 309 idents, msg = self.session.feed_identities(raw_msg, copy=False)
310 310 msg = self.session.unpack_message(msg, content=False, copy=False)
311 311 except Exception:
312 312 self.log.error("task::Invalid task msg: %r"%raw_msg, exc_info=True)
313 313 return
314 314
315 315
316 316 # send to monitor
317 317 self.mon_stream.send_multipart([b'intask']+raw_msg, copy=False)
318 318
319 319 header = msg['header']
320 320 msg_id = header['msg_id']
321 321 self.all_ids.add(msg_id)
322 322
323 323 # get targets as a set of bytes objects
324 324 # from a list of unicode objects
325 325 targets = header.get('targets', [])
326 326 targets = map(ensure_bytes, targets)
327 327 targets = set(targets)
328 328
329 329 retries = header.get('retries', 0)
330 330 self.retries[msg_id] = retries
331 331
332 332 # time dependencies
333 333 after = header.get('after', None)
334 334 if after:
335 335 after = Dependency(after)
336 336 if after.all:
337 337 if after.success:
338 338 after = Dependency(after.difference(self.all_completed),
339 339 success=after.success,
340 340 failure=after.failure,
341 341 all=after.all,
342 342 )
343 343 if after.failure:
344 344 after = Dependency(after.difference(self.all_failed),
345 345 success=after.success,
346 346 failure=after.failure,
347 347 all=after.all,
348 348 )
349 349 if after.check(self.all_completed, self.all_failed):
350 350 # recast as empty set, if `after` already met,
351 351 # to prevent unnecessary set comparisons
352 352 after = MET
353 353 else:
354 354 after = MET
355 355
356 356 # location dependencies
357 357 follow = Dependency(header.get('follow', []))
358 358
359 359 # turn timeouts into datetime objects:
360 360 timeout = header.get('timeout', None)
361 361 if timeout:
362 362 timeout = datetime.now() + timedelta(0,timeout,0)
363 363
364 364 args = [raw_msg, targets, after, follow, timeout]
365 365
366 366 # validate and reduce dependencies:
367 367 for dep in after,follow:
368 368 if not dep: # empty dependency
369 369 continue
370 370 # check valid:
371 371 if msg_id in dep or dep.difference(self.all_ids):
372 372 self.depending[msg_id] = args
373 373 return self.fail_unreachable(msg_id, error.InvalidDependency)
374 374 # check if unreachable:
375 375 if dep.unreachable(self.all_completed, self.all_failed):
376 376 self.depending[msg_id] = args
377 377 return self.fail_unreachable(msg_id)
378 378
379 379 if after.check(self.all_completed, self.all_failed):
380 380 # time deps already met, try to run
381 381 if not self.maybe_run(msg_id, *args):
382 382 # can't run yet
383 383 if msg_id not in self.all_failed:
384 384 # could have failed as unreachable
385 385 self.save_unmet(msg_id, *args)
386 386 else:
387 387 self.save_unmet(msg_id, *args)
388 388
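    # Hedged sketch of the submission header fields dispatch_submission reads
    # above (names as used in the code; values are illustrative only):
    #
    #     header = {
    #         'msg_id'  : 'abc123',         # placeholder task id
    #         'targets' : [u'engine-0'],    # unicode idents, cast to bytes above
    #         'retries' : 2,                # resubmits allowed on UnmetDependency
    #         'after'   : ['other-msg-id'], # time dependency: run after these finish
    #         'follow'  : [],               # location dependency: run where these ran
    #         'timeout' : 30,               # seconds, converted to a datetime above
    #     }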
389 389 def audit_timeouts(self):
390 390 """Audit all waiting tasks for expired timeouts."""
391 391 now = datetime.now()
392 392 for msg_id in self.depending.keys():
393 393 # must recheck, in case one failure cascaded to another:
394 394 if msg_id in self.depending:
395 395 raw,targets,after,follow,timeout = self.depending[msg_id]
396 396 if timeout and timeout < now:
397 397 self.fail_unreachable(msg_id, error.TaskTimeout)
398 398
399 399 def fail_unreachable(self, msg_id, why=error.ImpossibleDependency):
400 400 """a task has become unreachable, send a reply with an ImpossibleDependency
401 401 error."""
402 402 if msg_id not in self.depending:
403 403 self.log.error("msg %r already failed!", msg_id)
404 404 return
405 405 raw_msg,targets,after,follow,timeout = self.depending.pop(msg_id)
406 406 for mid in follow.union(after):
407 407 if mid in self.graph:
408 408 self.graph[mid].remove(msg_id)
409 409
410 410 # FIXME: unpacking a message I've already unpacked, but didn't save:
411 411 idents,msg = self.session.feed_identities(raw_msg, copy=False)
412 412 header = self.session.unpack(msg[1].bytes)
413 413
414 414 try:
415 415 raise why()
416 416 except:
417 417 content = error.wrap_exception()
418 418
419 419 self.all_done.add(msg_id)
420 420 self.all_failed.add(msg_id)
421 421
422 422 msg = self.session.send(self.client_stream, 'apply_reply', content,
423 423 parent=header, ident=idents)
424 424 self.session.send(self.mon_stream, msg, ident=[b'outtask']+idents)
425 425
426 426 self.update_graph(msg_id, success=False)
427 427
428 428 def maybe_run(self, msg_id, raw_msg, targets, after, follow, timeout):
429 429 """check location dependencies, and run if they are met."""
430 430 blacklist = self.blacklist.setdefault(msg_id, set())
431 431 if follow or targets or blacklist or self.hwm:
432 432 # we need a can_run filter
433 433 def can_run(idx):
434 434 # check hwm
435 435 if self.hwm and self.loads[idx] == self.hwm:
436 436 return False
437 437 target = self.targets[idx]
438 438 # check blacklist
439 439 if target in blacklist:
440 440 return False
441 441 # check targets
442 442 if targets and target not in targets:
443 443 return False
444 444 # check follow
445 445 return follow.check(self.completed[target], self.failed[target])
446 446
447 447 indices = filter(can_run, range(len(self.targets)))
448 448
449 449 if not indices:
450 450 # couldn't run
451 451 if follow.all:
452 452 # check follow for impossibility
453 453 dests = set()
454 454 relevant = set()
455 455 if follow.success:
456 456 relevant = self.all_completed
457 457 if follow.failure:
458 458 relevant = relevant.union(self.all_failed)
459 459 for m in follow.intersection(relevant):
460 460 dests.add(self.destinations[m])
461 461 if len(dests) > 1:
462 462 self.depending[msg_id] = (raw_msg, targets, after, follow, timeout)
463 463 self.fail_unreachable(msg_id)
464 464 return False
465 465 if targets:
466 466 # check blacklist+targets for impossibility
467 467 targets.difference_update(blacklist)
468 468 if not targets or not targets.intersection(self.targets):
469 469 self.depending[msg_id] = (raw_msg, targets, after, follow, timeout)
470 470 self.fail_unreachable(msg_id)
471 471 return False
472 472 return False
473 473 else:
474 474 indices = None
475 475
476 476 self.submit_task(msg_id, raw_msg, targets, follow, timeout, indices)
477 477 return True
478 478
479 479 def save_unmet(self, msg_id, raw_msg, targets, after, follow, timeout):
480 480 """Save a message for later submission when its dependencies are met."""
481 481 self.depending[msg_id] = [raw_msg,targets,after,follow,timeout]
482 482 # track the ids in follow or after, but not those already finished
483 483 for dep_id in after.union(follow).difference(self.all_done):
484 484 if dep_id not in self.graph:
485 485 self.graph[dep_id] = set()
486 486 self.graph[dep_id].add(msg_id)
487 487
488 488 def submit_task(self, msg_id, raw_msg, targets, follow, timeout, indices=None):
489 489 """Submit a task to any of a subset of our targets."""
490 490 if indices:
491 491 loads = [self.loads[i] for i in indices]
492 492 else:
493 493 loads = self.loads
494 494 idx = self.scheme(loads)
495 495 if indices:
496 496 idx = indices[idx]
497 497 target = self.targets[idx]
498 498 # print (target, map(str, msg[:3]))
499 499 # send job to the engine
500 500 self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False)
501 501 self.engine_stream.send_multipart(raw_msg, copy=False)
502 502 # update load
503 503 self.add_job(idx)
504 504 self.pending[target][msg_id] = (raw_msg, targets, MET, follow, timeout)
505 505 # notify Hub
506 content = dict(msg_id=msg_id, engine_id=target.decode())
506 content = dict(msg_id=msg_id, engine_id=target.decode('ascii'))
507 507 self.session.send(self.mon_stream, 'task_destination', content=content,
508 508 ident=[b'tracktask',self.ident])
509 509
510 510
511 511 #-----------------------------------------------------------------------
512 512 # Result Handling
513 513 #-----------------------------------------------------------------------
514 514 def dispatch_result(self, raw_msg):
515 515 """dispatch method for result replies"""
516 516 try:
517 517 idents,msg = self.session.feed_identities(raw_msg, copy=False)
518 518 msg = self.session.unpack_message(msg, content=False, copy=False)
519 519 engine = idents[0]
520 520 try:
521 521 idx = self.targets.index(engine)
522 522 except ValueError:
523 523 pass # skip load-update for dead engines
524 524 else:
525 525 self.finish_job(idx)
526 526 except Exception:
527 527 self.log.error("task::Invalid result: %r", raw_msg, exc_info=True)
528 528 return
529 529
530 530 header = msg['header']
531 531 parent = msg['parent_header']
532 532 if header.get('dependencies_met', True):
533 533 success = (header['status'] == 'ok')
534 534 msg_id = parent['msg_id']
535 535 retries = self.retries[msg_id]
536 536 if not success and retries > 0:
537 537 # failed
538 538 self.retries[msg_id] = retries - 1
539 539 self.handle_unmet_dependency(idents, parent)
540 540 else:
541 541 del self.retries[msg_id]
542 542 # relay to client and update graph
543 543 self.handle_result(idents, parent, raw_msg, success)
544 544 # send to Hub monitor
545 545 self.mon_stream.send_multipart([b'outtask']+raw_msg, copy=False)
546 546 else:
547 547 self.handle_unmet_dependency(idents, parent)
548 548
549 549 def handle_result(self, idents, parent, raw_msg, success=True):
550 550 """handle a real task result, either success or failure"""
551 551 # first, relay result to client
552 552 engine = idents[0]
553 553 client = idents[1]
554 554 # swap_ids for XREP-XREP mirror
555 555 raw_msg[:2] = [client,engine]
556 556 # print (map(str, raw_msg[:4]))
557 557 self.client_stream.send_multipart(raw_msg, copy=False)
558 558 # now, update our data structures
559 559 msg_id = parent['msg_id']
560 560 self.blacklist.pop(msg_id, None)
561 561 self.pending[engine].pop(msg_id)
562 562 if success:
563 563 self.completed[engine].add(msg_id)
564 564 self.all_completed.add(msg_id)
565 565 else:
566 566 self.failed[engine].add(msg_id)
567 567 self.all_failed.add(msg_id)
568 568 self.all_done.add(msg_id)
569 569 self.destinations[msg_id] = engine
570 570
571 571 self.update_graph(msg_id, success)
572 572
573 573 def handle_unmet_dependency(self, idents, parent):
574 574 """handle an unmet dependency"""
575 575 engine = idents[0]
576 576 msg_id = parent['msg_id']
577 577
578 578 if msg_id not in self.blacklist:
579 579 self.blacklist[msg_id] = set()
580 580 self.blacklist[msg_id].add(engine)
581 581
582 582 args = self.pending[engine].pop(msg_id)
583 583 raw,targets,after,follow,timeout = args
584 584
585 585 if self.blacklist[msg_id] == targets:
586 586 self.depending[msg_id] = args
587 587 self.fail_unreachable(msg_id)
588 588 elif not self.maybe_run(msg_id, *args):
589 589 # resubmit failed
590 590 if msg_id not in self.all_failed:
591 591 # put it back in our dependency tree
592 592 self.save_unmet(msg_id, *args)
593 593
594 594 if self.hwm:
595 595 try:
596 596 idx = self.targets.index(engine)
597 597 except ValueError:
598 598 pass # skip load-update for dead engines
599 599 else:
600 600 if self.loads[idx] == self.hwm-1:
601 601 self.update_graph(None)
602 602
603 603
604 604
605 605 def update_graph(self, dep_id=None, success=True):
606 606 """dep_id just finished. Update our dependency
607 607 graph and submit any jobs that just became runnable.
608 608
609 609 Called with dep_id=None to update entire graph for hwm, but without finishing
610 610 a task.
611 611 """
612 612 # print ("\n\n***********")
613 613 # pprint (dep_id)
614 614 # pprint (self.graph)
615 615 # pprint (self.depending)
616 616 # pprint (self.all_completed)
617 617 # pprint (self.all_failed)
618 618 # print ("\n\n***********\n\n")
619 619 # update any jobs that depended on the dependency
620 620 jobs = self.graph.pop(dep_id, [])
621 621
622 622 # recheck *all* jobs if
623 623 # a) we have HWM and an engine just became no longer full
624 624 # or b) dep_id was given as None
625 625 if dep_id is None or self.hwm and any( [ load==self.hwm-1 for load in self.loads ]):
626 626 jobs = self.depending.keys()
627 627
628 628 for msg_id in jobs:
629 629 raw_msg, targets, after, follow, timeout = self.depending[msg_id]
630 630
631 631 if after.unreachable(self.all_completed, self.all_failed)\
632 632 or follow.unreachable(self.all_completed, self.all_failed):
633 633 self.fail_unreachable(msg_id)
634 634
635 635 elif after.check(self.all_completed, self.all_failed): # time deps met, maybe run
636 636 if self.maybe_run(msg_id, raw_msg, targets, MET, follow, timeout):
637 637
638 638 self.depending.pop(msg_id)
639 639 for mid in follow.union(after):
640 640 if mid in self.graph:
641 641 self.graph[mid].remove(msg_id)
642 642
643 643 #----------------------------------------------------------------------
644 644 # methods to be overridden by subclasses
645 645 #----------------------------------------------------------------------
646 646
647 647 def add_job(self, idx):
648 648 """Called after self.targets[idx] just got the job with header.
649 649 Override in subclasses. The default ordering is simple LRU.
650 650 The default loads are the number of outstanding jobs."""
651 651 self.loads[idx] += 1
652 652 for lis in (self.targets, self.loads):
653 653 lis.append(lis.pop(idx))
654 654
655 655
656 656 def finish_job(self, idx):
657 657 """Called after self.targets[idx] just finished a job.
658 658 Override in subclasses."""
659 659 self.loads[idx] -= 1
660 660
661 661
662 662
663 663 def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, config=None,
664 664 logname='root', log_url=None, loglevel=logging.DEBUG,
665 665 identity=b'task', in_thread=False):
666 666
667 667 ZMQStream = zmqstream.ZMQStream
668 668
669 669 if config:
670 670 # unwrap dict back into Config
671 671 config = Config(config)
672 672
673 673 if in_thread:
674 674 # use instance() to get the same Context/Loop as our parent
675 675 ctx = zmq.Context.instance()
676 676 loop = ioloop.IOLoop.instance()
677 677 else:
678 678 # in a process, don't use instance()
679 679 # for safety with multiprocessing
680 680 ctx = zmq.Context()
681 681 loop = ioloop.IOLoop()
682 682 ins = ZMQStream(ctx.socket(zmq.XREP),loop)
683 683 ins.setsockopt(zmq.IDENTITY, identity)
684 684 ins.bind(in_addr)
685 685
686 686 outs = ZMQStream(ctx.socket(zmq.XREP),loop)
687 687 outs.setsockopt(zmq.IDENTITY, identity)
688 688 outs.bind(out_addr)
689 689 mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB),loop)
690 690 mons.connect(mon_addr)
691 691 nots = zmqstream.ZMQStream(ctx.socket(zmq.SUB),loop)
692 692 nots.setsockopt(zmq.SUBSCRIBE, b'')
693 693 nots.connect(not_addr)
694 694
695 695 # setup logging.
696 696 if in_thread:
697 697 log = Application.instance().log
698 698 else:
699 699 if log_url:
700 700 log = connect_logger(logname, ctx, log_url, root="scheduler", loglevel=loglevel)
701 701 else:
702 702 log = local_logger(logname, loglevel)
703 703
704 704 scheduler = TaskScheduler(client_stream=ins, engine_stream=outs,
705 705 mon_stream=mons, notifier_stream=nots,
706 706 loop=loop, log=log,
707 707 config=config)
708 708 scheduler.start()
709 709 if not in_thread:
710 710 try:
711 711 loop.start()
712 712 except KeyboardInterrupt:
713 713 print ("interrupted, exiting...", file=sys.__stderr__)
714 714
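For orientation, a minimal sketch of standing this scheduler up in its own process, assuming the module is importable as IPython.parallel.controller.scheduler and that the four ZMQ endpoints already exist (the addresses below are placeholders, not values defined by this changeset):

    from IPython.parallel.controller.scheduler import launch_scheduler

    launch_scheduler(
        in_addr='tcp://127.0.0.1:10101',   # client-facing XREP
        out_addr='tcp://127.0.0.1:10102',  # engine-facing XREP
        mon_addr='tcp://127.0.0.1:10103',  # hub monitor PUB
        not_addr='tcp://127.0.0.1:10104',  # hub notification SUB
        identity=b'task',
        in_thread=False,                   # blocks in loop.start() until interrupted
    )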
@@ -1,170 +1,174
1 1 #!/usr/bin/env python
2 2 """A simple engine that talks to a controller over 0MQ.
3 3 It handles registration, etc., and launches a kernel
4 4 connected to the Controller's Schedulers.
5 5
6 6 Authors:
7 7
8 8 * Min RK
9 9 """
10 10 #-----------------------------------------------------------------------------
11 11 # Copyright (C) 2010-2011 The IPython Development Team
12 12 #
13 13 # Distributed under the terms of the BSD License. The full license is in
14 14 # the file COPYING, distributed as part of this software.
15 15 #-----------------------------------------------------------------------------
16 16
17 17 from __future__ import print_function
18 18
19 19 import sys
20 20 import time
21 21
22 22 import zmq
23 23 from zmq.eventloop import ioloop, zmqstream
24 24
25 25 # internal
26 from IPython.utils.traitlets import Instance, Dict, Int, Type, CFloat, Unicode
26 from IPython.utils.traitlets import Instance, Dict, Int, Type, CFloat, Unicode, CBytes
27 27 # from IPython.utils.localinterfaces import LOCALHOST
28 28
29 29 from IPython.parallel.controller.heartmonitor import Heart
30 30 from IPython.parallel.factory import RegistrationFactory
31 31 from IPython.parallel.util import disambiguate_url, ensure_bytes
32 32
33 33 from IPython.zmq.session import Message
34 34
35 35 from .streamkernel import Kernel
36 36
37 37 class EngineFactory(RegistrationFactory):
38 38 """IPython engine"""
39 39
40 40 # configurables:
41 41 out_stream_factory=Type('IPython.zmq.iostream.OutStream', config=True,
42 42 help="""The OutStream for handling stdout/err.
43 43 Typically 'IPython.zmq.iostream.OutStream'""")
44 44 display_hook_factory=Type('IPython.zmq.displayhook.ZMQDisplayHook', config=True,
45 45 help="""The class for handling displayhook.
46 46 Typically 'IPython.zmq.displayhook.ZMQDisplayHook'""")
47 47 location=Unicode(config=True,
48 48 help="""The location (an IP address) of the controller. This is
49 49 used for disambiguating URLs, to determine whether
50 50 loopback should be used to connect or the public address.""")
51 51 timeout=CFloat(2,config=True,
52 52 help="""The time (in seconds) to wait for the Controller to respond
53 53 to registration requests before giving up.""")
54 54
55 55 # not configurable:
56 56 user_ns=Dict()
57 57 id=Int(allow_none=True)
58 58 registrar=Instance('zmq.eventloop.zmqstream.ZMQStream')
59 59 kernel=Instance(Kernel)
60 60
61 bident = CBytes()
62 ident = Unicode()
63 def _ident_changed(self, name, old, new):
64 self.bident = ensure_bytes(new)
65
61 66
62 67 def __init__(self, **kwargs):
63 68 super(EngineFactory, self).__init__(**kwargs)
64 69 self.ident = self.session.session
65 70 ctx = self.context
66 71
67 72 reg = ctx.socket(zmq.XREQ)
68 reg.setsockopt(zmq.IDENTITY, ensure_bytes(self.ident))
73 reg.setsockopt(zmq.IDENTITY, self.bident)
69 74 reg.connect(self.url)
70 75 self.registrar = zmqstream.ZMQStream(reg, self.loop)
71 76
72 77 def register(self):
73 78 """send the registration_request"""
74 79
75 80 self.log.info("Registering with controller at %s"%self.url)
76 81 content = dict(queue=self.ident, heartbeat=self.ident, control=self.ident)
77 82 self.registrar.on_recv(self.complete_registration)
78 83 # print (self.session.key)
79 84 self.session.send(self.registrar, "registration_request",content=content)
80 85
81 86 def complete_registration(self, msg):
82 87 # print msg
83 88 self._abort_dc.stop()
84 89 ctx = self.context
85 90 loop = self.loop
86 identity = ensure_bytes(self.ident)
87
91 identity = self.bident
88 92 idents,msg = self.session.feed_identities(msg)
89 93 msg = Message(self.session.unpack_message(msg))
90 94
91 95 if msg.content.status == 'ok':
92 96 self.id = int(msg.content.id)
93 97
94 98 # create Shell Streams (MUX, Task, etc.):
95 99 queue_addr = msg.content.mux
96 100 shell_addrs = [ str(queue_addr) ]
97 101 task_addr = msg.content.task
98 102 if task_addr:
99 103 shell_addrs.append(str(task_addr))
100 104
101 105 # Uncomment this to go back to two-socket model
102 106 # shell_streams = []
103 107 # for addr in shell_addrs:
104 108 # stream = zmqstream.ZMQStream(ctx.socket(zmq.XREP), loop)
105 109 # stream.setsockopt(zmq.IDENTITY, identity)
106 110 # stream.connect(disambiguate_url(addr, self.location))
107 111 # shell_streams.append(stream)
108 112
109 113 # Now use only one shell stream for mux and tasks
110 114 stream = zmqstream.ZMQStream(ctx.socket(zmq.XREP), loop)
111 115 stream.setsockopt(zmq.IDENTITY, identity)
112 116 shell_streams = [stream]
113 117 for addr in shell_addrs:
114 118 stream.connect(disambiguate_url(addr, self.location))
115 119 # end single stream-socket
116 120
117 121 # control stream:
118 122 control_addr = str(msg.content.control)
119 123 control_stream = zmqstream.ZMQStream(ctx.socket(zmq.XREP), loop)
120 124 control_stream.setsockopt(zmq.IDENTITY, identity)
121 125 control_stream.connect(disambiguate_url(control_addr, self.location))
122 126
123 127 # create iopub stream:
124 128 iopub_addr = msg.content.iopub
125 129 iopub_stream = zmqstream.ZMQStream(ctx.socket(zmq.PUB), loop)
126 130 iopub_stream.setsockopt(zmq.IDENTITY, identity)
127 131 iopub_stream.connect(disambiguate_url(iopub_addr, self.location))
128 132
129 133 # launch heartbeat
130 134 hb_addrs = msg.content.heartbeat
131 135 # print (hb_addrs)
132 136
133 137 # # Redirect input streams and set a display hook.
134 138 if self.out_stream_factory:
135 139 sys.stdout = self.out_stream_factory(self.session, iopub_stream, u'stdout')
136 140 sys.stdout.topic = 'engine.%i.stdout'%self.id
137 141 sys.stderr = self.out_stream_factory(self.session, iopub_stream, u'stderr')
138 142 sys.stderr.topic = 'engine.%i.stderr'%self.id
139 143 if self.display_hook_factory:
140 144 sys.displayhook = self.display_hook_factory(self.session, iopub_stream)
141 145 sys.displayhook.topic = 'engine.%i.pyout'%self.id
142 146
143 147 self.kernel = Kernel(config=self.config, int_id=self.id, ident=self.ident, session=self.session,
144 148 control_stream=control_stream, shell_streams=shell_streams, iopub_stream=iopub_stream,
145 149 loop=loop, user_ns = self.user_ns, log=self.log)
146 150 self.kernel.start()
147 151 hb_addrs = [ disambiguate_url(addr, self.location) for addr in hb_addrs ]
148 152 heart = Heart(*map(str, hb_addrs), heart_id=identity)
149 153 heart.start()
150 154
151 155
152 156 else:
153 157 self.log.fatal("Registration Failed: %s"%msg)
154 158 raise Exception("Registration Failed: %s"%msg)
155 159
156 160 self.log.info("Completed registration with id %i"%self.id)
157 161
158 162
159 163 def abort(self):
160 164 self.log.fatal("Registration timed out after %.1f seconds"%self.timeout)
161 165 self.session.send(self.registrar, "unregistration_request", content=dict(id=self.id))
162 166 time.sleep(1)
163 167 sys.exit(255)
164 168
165 169 def start(self):
166 170 dc = ioloop.DelayedCallback(self.register, 0, self.loop)
167 171 dc.start()
168 172 self._abort_dc = ioloop.DelayedCallback(self.abort, self.timeout*1000, self.loop)
169 173 self._abort_dc.start()
170 174
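The ident/bident pairing added above is the pattern this changeset relies on: identities are unicode at the configuration level, but ZMQ socket identities must be bytes. A minimal standalone sketch of the same pattern (the class and values are hypothetical, not part of the changeset):

    from IPython.utils.traitlets import HasTraits, Unicode, CBytes

    class Identified(HasTraits):
        """Keep an ascii-bytes mirror of a unicode identity."""
        ident = Unicode()
        bident = CBytes()

        def _ident_changed(self, name, old, new):
            # mirror the unicode identity as ascii bytes for zmq.IDENTITY
            self.bident = new.encode('ascii')

    obj = Identified()
    obj.ident = u'engine-0001'
    # obj.bident is now b'engine-0001', suitable for setsockopt(zmq.IDENTITY, ...)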
@@ -1,456 +1,456
1 1 """some generic utilities for dealing with classes, urls, and serialization
2 2
3 3 Authors:
4 4
5 5 * Min RK
6 6 """
7 7 #-----------------------------------------------------------------------------
8 8 # Copyright (C) 2010-2011 The IPython Development Team
9 9 #
10 10 # Distributed under the terms of the BSD License. The full license is in
11 11 # the file COPYING, distributed as part of this software.
12 12 #-----------------------------------------------------------------------------
13 13
14 14 #-----------------------------------------------------------------------------
15 15 # Imports
16 16 #-----------------------------------------------------------------------------
17 17
18 18 # Standard library imports.
19 19 import logging
20 20 import os
21 21 import re
22 22 import stat
23 23 import socket
24 24 import sys
25 25 from signal import signal, SIGINT, SIGABRT, SIGTERM
26 26 try:
27 27 from signal import SIGKILL
28 28 except ImportError:
29 29 SIGKILL=None
30 30
31 31 try:
32 32 import cPickle
33 33 pickle = cPickle
34 34 except:
35 35 cPickle = None
36 36 import pickle
37 37
38 38 # System library imports
39 39 import zmq
40 40 from zmq.log import handlers
41 41
42 42 # IPython imports
43 43 from IPython.utils.pickleutil import can, uncan, canSequence, uncanSequence
44 44 from IPython.utils.newserialized import serialize, unserialize
45 45 from IPython.zmq.log import EnginePUBHandler
46 46
47 47 #-----------------------------------------------------------------------------
48 48 # Classes
49 49 #-----------------------------------------------------------------------------
50 50
51 51 class Namespace(dict):
52 52 """Subclass of dict for attribute access to keys."""
53 53
54 54 def __getattr__(self, key):
55 55 """getattr aliased to getitem"""
56 56 if key in self.iterkeys():
57 57 return self[key]
58 58 else:
59 59 raise NameError(key)
60 60
61 61 def __setattr__(self, key, value):
62 62 """setattr aliased to setitem, with strict"""
63 63 if hasattr(dict, key):
64 64 raise KeyError("Cannot override dict keys %r"%key)
65 65 self[key] = value
66 66
67 67
68 68 class ReverseDict(dict):
69 69 """simple double-keyed subset of dict methods."""
70 70
71 71 def __init__(self, *args, **kwargs):
72 72 dict.__init__(self, *args, **kwargs)
73 73 self._reverse = dict()
74 74 for key, value in self.iteritems():
75 75 self._reverse[value] = key
76 76
77 77 def __getitem__(self, key):
78 78 try:
79 79 return dict.__getitem__(self, key)
80 80 except KeyError:
81 81 return self._reverse[key]
82 82
83 83 def __setitem__(self, key, value):
84 84 if key in self._reverse:
85 85 raise KeyError("Can't have key %r on both sides!"%key)
86 86 dict.__setitem__(self, key, value)
87 87 self._reverse[value] = key
88 88
89 89 def pop(self, key):
90 90 value = dict.pop(self, key)
91 91 self._reverse.pop(value)
92 92 return value
93 93
94 94 def get(self, key, default=None):
95 95 try:
96 96 return self[key]
97 97 except KeyError:
98 98 return default
99 99
100 100 #-----------------------------------------------------------------------------
101 101 # Functions
102 102 #-----------------------------------------------------------------------------
103 103
104 104 def ensure_bytes(s):
105 """ensure that an object is bytes"""
105 """ensure that an object is ascii bytes"""
106 106 if isinstance(s, unicode):
107 s = s.encode(sys.getdefaultencoding(), 'replace')
107 s = s.encode('ascii')
108 108 return s
109 109
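# Hedged illustration of the behaviour above (not part of this file):
#
#     ensure_bytes(b'engine-1')  # -> b'engine-1' (bytes pass through untouched)
#     ensure_bytes(u'engine-1')  # -> b'engine-1'
#     ensure_bytes(u'caf\xe9')   # raises UnicodeEncodeError rather than silently
#                                # replacing non-ascii characters as before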
110 110 def validate_url(url):
111 111 """validate a url for zeromq"""
112 112 if not isinstance(url, basestring):
113 113 raise TypeError("url must be a string, not %r"%type(url))
114 114 url = url.lower()
115 115
116 116 proto_addr = url.split('://')
117 117 assert len(proto_addr) == 2, 'Invalid url: %r'%url
118 118 proto, addr = proto_addr
119 119 assert proto in ['tcp','pgm','epgm','ipc','inproc'], "Invalid protocol: %r"%proto
120 120
121 121 # domain pattern adapted from http://www.regexlib.com/REDetails.aspx?regexp_id=391
122 122 # author: Remi Sabourin
123 123 pat = re.compile(r'^([\w\d]([\w\d\-]{0,61}[\w\d])?\.)*[\w\d]([\w\d\-]{0,61}[\w\d])?$')
124 124
125 125 if proto == 'tcp':
126 126 lis = addr.split(':')
127 127 assert len(lis) == 2, 'Invalid url: %r'%url
128 128 addr,s_port = lis
129 129 try:
130 130 port = int(s_port)
131 131 except ValueError:
132 132 raise AssertionError("Invalid port %r in url: %r"%(s_port, url))
133 133
134 134 assert addr == '*' or pat.match(addr) is not None, 'Invalid url: %r'%url
135 135
136 136 else:
137 137 # only validate tcp urls currently
138 138 pass
139 139
140 140 return True
141 141
142 142
143 143 def validate_url_container(container):
144 144 """validate a potentially nested collection of urls."""
145 145 if isinstance(container, basestring):
146 146 url = container
147 147 return validate_url(url)
148 148 elif isinstance(container, dict):
149 149 container = container.itervalues()
150 150
151 151 for element in container:
152 152 validate_url_container(element)
153 153
154 154
155 155 def split_url(url):
156 156 """split a zmq url (tcp://ip:port) into ('tcp','ip','port')."""
157 157 proto_addr = url.split('://')
158 158 assert len(proto_addr) == 2, 'Invalid url: %r'%url
159 159 proto, addr = proto_addr
160 160 lis = addr.split(':')
161 161 assert len(lis) == 2, 'Invalid url: %r'%url
162 162 addr,s_port = lis
163 163 return proto,addr,s_port
164 164
165 165 def disambiguate_ip_address(ip, location=None):
166 166 """turn multi-ip interfaces '0.0.0.0' and '*' into connectable
167 167 ones, based on the location (default interpretation of location is localhost)."""
168 168 if ip in ('0.0.0.0', '*'):
169 169 external_ips = socket.gethostbyname_ex(socket.gethostname())[2]
170 170 if location is None or location in external_ips:
171 171 ip='127.0.0.1'
172 172 elif location:
173 173 return location
174 174 return ip
175 175
176 176 def disambiguate_url(url, location=None):
177 177 """turn multi-ip interfaces '0.0.0.0' and '*' into connectable
178 178 ones, based on the location (default interpretation is localhost).
179 179
180 180 This is for zeromq urls, such as tcp://*:10101."""
181 181 try:
182 182 proto,ip,port = split_url(url)
183 183 except AssertionError:
184 184 # probably not tcp url; could be ipc, etc.
185 185 return url
186 186
187 187 ip = disambiguate_ip_address(ip,location)
188 188
189 189 return "%s://%s:%s"%(proto,ip,port)
190 190
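# A quick illustration of the two helpers above (addresses are hypothetical):
#
#     split_url('tcp://10.0.0.5:10101')   # -> ('tcp', '10.0.0.5', '10101')
#     disambiguate_url('tcp://*:10101')   # -> 'tcp://127.0.0.1:10101' (no location given, assume loopback)
#     disambiguate_url('tcp://*:10101', '10.0.0.5')
#         # -> 'tcp://10.0.0.5:10101' when run on a machine that does not own that IP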
191 191 def serialize_object(obj, threshold=64e-6):
192 192 """Serialize an object into a list of sendable buffers.
193 193
194 194 Parameters
195 195 ----------
196 196
197 197 obj : object
198 198 The object to be serialized
199 199 threshold : float
200 200 The threshold for not double-pickling the content.
201 201
202 202
203 203 Returns
204 204 -------
205 205 ('pmd', [bufs]) :
206 206 where pmd is the pickled metadata wrapper,
207 207 bufs is a list of data buffers
208 208 """
209 209 databuffers = []
210 210 if isinstance(obj, (list, tuple)):
211 211 clist = canSequence(obj)
212 212 slist = map(serialize, clist)
213 213 for s in slist:
214 214 if s.typeDescriptor in ('buffer', 'ndarray') or s.getDataSize() > threshold:
215 215 databuffers.append(s.getData())
216 216 s.data = None
217 217 return pickle.dumps(slist,-1), databuffers
218 218 elif isinstance(obj, dict):
219 219 sobj = {}
220 220 for k in sorted(obj.iterkeys()):
221 221 s = serialize(can(obj[k]))
222 222 if s.typeDescriptor in ('buffer', 'ndarray') or s.getDataSize() > threshold:
223 223 databuffers.append(s.getData())
224 224 s.data = None
225 225 sobj[k] = s
226 226 return pickle.dumps(sobj,-1),databuffers
227 227 else:
228 228 s = serialize(can(obj))
229 229 if s.typeDescriptor in ('buffer', 'ndarray') or s.getDataSize() > threshold:
230 230 databuffers.append(s.getData())
231 231 s.data = None
232 232 return pickle.dumps(s,-1),databuffers
233 233
234 234
235 235 def unserialize_object(bufs):
236 236 """reconstruct an object serialized by serialize_object from data buffers."""
237 237 bufs = list(bufs)
238 238 sobj = pickle.loads(bufs.pop(0))
239 239 if isinstance(sobj, (list, tuple)):
240 240 for s in sobj:
241 241 if s.data is None:
242 242 s.data = bufs.pop(0)
243 243 return uncanSequence(map(unserialize, sobj)), bufs
244 244 elif isinstance(sobj, dict):
245 245 newobj = {}
246 246 for k in sorted(sobj.iterkeys()):
247 247 s = sobj[k]
248 248 if s.data is None:
249 249 s.data = bufs.pop(0)
250 250 newobj[k] = uncan(unserialize(s))
251 251 return newobj, bufs
252 252 else:
253 253 if sobj.data is None:
254 254 sobj.data = bufs.pop(0)
255 255 return uncan(unserialize(sobj)), bufs
256 256
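# Hedged round-trip sketch for serialize_object/unserialize_object above
# (in-process; over the wire the buffers travel as separate message parts):
#
#     pmd, bufs = serialize_object({'x': range(10)})
#     obj, remaining = unserialize_object([pmd] + bufs)
#     # obj == {'x': range(10)}; remaining holds any unused trailing buffers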
257 257 def pack_apply_message(f, args, kwargs, threshold=64e-6):
258 258 """pack up a function, args, and kwargs to be sent over the wire
259 259 as a series of buffers. Any object whose data is larger than `threshold`
260 260 will not have its data copied (currently only numpy arrays support zero-copy)"""
261 261 msg = [pickle.dumps(can(f),-1)]
262 262 databuffers = [] # for large objects
263 263 sargs, bufs = serialize_object(args,threshold)
264 264 msg.append(sargs)
265 265 databuffers.extend(bufs)
266 266 skwargs, bufs = serialize_object(kwargs,threshold)
267 267 msg.append(skwargs)
268 268 databuffers.extend(bufs)
269 269 msg.extend(databuffers)
270 270 return msg
271 271
272 272 def unpack_apply_message(bufs, g=None, copy=True):
273 273 """unpack f,args,kwargs from buffers packed by pack_apply_message()
274 274 Returns: original f,args,kwargs"""
275 275 bufs = list(bufs) # allow us to pop
276 276 assert len(bufs) >= 3, "not enough buffers!"
277 277 if not copy:
278 278 for i in range(3):
279 279 bufs[i] = bufs[i].bytes
280 280 cf = pickle.loads(bufs.pop(0))
281 281 sargs = list(pickle.loads(bufs.pop(0)))
282 282 skwargs = dict(pickle.loads(bufs.pop(0)))
283 283 # print sargs, skwargs
284 284 f = uncan(cf, g)
285 285 for sa in sargs:
286 286 if sa.data is None:
287 287 m = bufs.pop(0)
288 288 if sa.getTypeDescriptor() in ('buffer', 'ndarray'):
289 289 # always use a buffer, until memoryviews get sorted out
290 290 sa.data = buffer(m)
291 291 # disable memoryview support
292 292 # if copy:
293 293 # sa.data = buffer(m)
294 294 # else:
295 295 # sa.data = m.buffer
296 296 else:
297 297 if copy:
298 298 sa.data = m
299 299 else:
300 300 sa.data = m.bytes
301 301
302 302 args = uncanSequence(map(unserialize, sargs), g)
303 303 kwargs = {}
304 304 for k in sorted(skwargs.iterkeys()):
305 305 sa = skwargs[k]
306 306 if sa.data is None:
307 307 m = bufs.pop(0)
308 308 if sa.getTypeDescriptor() in ('buffer', 'ndarray'):
309 309 # always use a buffer, until memoryviews get sorted out
310 310 sa.data = buffer(m)
311 311 # disable memoryview support
312 312 # if copy:
313 313 # sa.data = buffer(m)
314 314 # else:
315 315 # sa.data = m.buffer
316 316 else:
317 317 if copy:
318 318 sa.data = m
319 319 else:
320 320 sa.data = m.bytes
321 321
322 322 kwargs[k] = uncan(unserialize(sa), g)
323 323
324 324 return f,args,kwargs
325 325
326 326 #--------------------------------------------------------------------------
327 327 # helpers for implementing old MEC API via view.apply
328 328 #--------------------------------------------------------------------------
329 329
330 330 def interactive(f):
331 331 """decorator for making functions appear as interactively defined.
332 332 This results in the function being linked to the user_ns as globals()
333 333 instead of the module globals().
334 334 """
335 335 f.__module__ = '__main__'
336 336 return f
337 337
338 338 @interactive
339 339 def _push(ns):
340 340 """helper method for implementing `client.push` via `client.apply`"""
341 341 globals().update(ns)
342 342
343 343 @interactive
344 344 def _pull(keys):
345 345 """helper method for implementing `client.pull` via `client.apply`"""
346 346 user_ns = globals()
347 347 if isinstance(keys, (list,tuple, set)):
348 348 for key in keys:
349 349 if not user_ns.has_key(key):
350 350 raise NameError("name '%s' is not defined"%key)
351 351 return map(user_ns.get, keys)
352 352 else:
353 353 if not user_ns.has_key(keys):
354 354 raise NameError("name '%s' is not defined"%keys)
355 355 return user_ns.get(keys)
356 356
357 357 @interactive
358 358 def _execute(code):
359 359 """helper method for implementing `client.execute` via `client.apply`"""
360 360 exec code in globals()
361 361
362 362 #--------------------------------------------------------------------------
363 363 # extra process management utilities
364 364 #--------------------------------------------------------------------------
365 365
366 366 _random_ports = set()
367 367
368 368 def select_random_ports(n):
369 369 """Selects and return n random ports that are available."""
370 370 ports = []
371 371 for i in xrange(n):
372 372 sock = socket.socket()
373 373 sock.bind(('', 0))
374 374 while sock.getsockname()[1] in _random_ports:
375 375 sock.close()
376 376 sock = socket.socket()
377 377 sock.bind(('', 0))
378 378 ports.append(sock)
379 379 for i, sock in enumerate(ports):
380 380 port = sock.getsockname()[1]
381 381 sock.close()
382 382 ports[i] = port
383 383 _random_ports.add(port)
384 384 return ports
385 385
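# e.g. reserve three distinct free ports (the actual numbers depend on the OS):
#
#     ports = select_random_ports(3)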
386 386 def signal_children(children):
387 387 """Relay interupt/term signals to children, for more solid process cleanup."""
388 388 def terminate_children(sig, frame):
389 389 logging.critical("Got signal %i, terminating children..."%sig)
390 390 for child in children:
391 391 child.terminate()
392 392
393 393 sys.exit(sig != SIGINT)
394 394 # sys.exit(sig)
395 395 for sig in (SIGINT, SIGABRT, SIGTERM):
396 396 signal(sig, terminate_children)
397 397
398 398 def generate_exec_key(keyfile):
399 399 import uuid
400 400 newkey = str(uuid.uuid4())
401 401 with open(keyfile, 'w') as f:
402 402 # f.write('ipython-key ')
403 403 f.write(newkey+'\n')
404 404 # set user-only RW permissions (0600)
405 405 # this will have no effect on Windows
406 406 os.chmod(keyfile, stat.S_IRUSR|stat.S_IWUSR)
407 407
408 408
409 409 def integer_loglevel(loglevel):
410 410 try:
411 411 loglevel = int(loglevel)
412 412 except ValueError:
413 413 if isinstance(loglevel, str):
414 414 loglevel = getattr(logging, loglevel)
415 415 return loglevel
416 416
417 417 def connect_logger(logname, context, iface, root="ip", loglevel=logging.DEBUG):
418 418 logger = logging.getLogger(logname)
419 419 if any([isinstance(h, handlers.PUBHandler) for h in logger.handlers]):
420 420 # don't add a second PUBHandler
421 421 return
422 422 loglevel = integer_loglevel(loglevel)
423 423 lsock = context.socket(zmq.PUB)
424 424 lsock.connect(iface)
425 425 handler = handlers.PUBHandler(lsock)
426 426 handler.setLevel(loglevel)
427 427 handler.root_topic = root
428 428 logger.addHandler(handler)
429 429 logger.setLevel(loglevel)
430 430
431 431 def connect_engine_logger(context, iface, engine, loglevel=logging.DEBUG):
432 432 logger = logging.getLogger()
433 433 if any([isinstance(h, handlers.PUBHandler) for h in logger.handlers]):
434 434 # don't add a second PUBHandler
435 435 return
436 436 loglevel = integer_loglevel(loglevel)
437 437 lsock = context.socket(zmq.PUB)
438 438 lsock.connect(iface)
439 439 handler = EnginePUBHandler(engine, lsock)
440 440 handler.setLevel(loglevel)
441 441 logger.addHandler(handler)
442 442 logger.setLevel(loglevel)
443 443 return logger
444 444
445 445 def local_logger(logname, loglevel=logging.DEBUG):
446 446 loglevel = integer_loglevel(loglevel)
447 447 logger = logging.getLogger(logname)
448 448 if any([isinstance(h, logging.StreamHandler) for h in logger.handlers]):
449 449 # don't add a second StreamHandler
450 450 return
451 451 handler = logging.StreamHandler()
452 452 handler.setLevel(loglevel)
453 453 logger.addHandler(handler)
454 454 logger.setLevel(loglevel)
455 455 return logger
456 456