enables resume of ipcontroller...
MinRK
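This change lets a crashed ipcontroller be resumed: --reuse already left the JSON connection files in place, and the new --restore flag (which also enables reuse_files) additionally reloads saved engine state from the profile's log directory (engines<cluster_id>.json, per this diff). A usage sketch in the style of the module's own _examples string; only the flag names below come from this diff, the rest is illustrative:

    ipcontroller --reuse     # leave connection files in place across runs
    # ... the controller crashes ...
    ipcontroller --restore   # implies --reuse; attempt to restore engines from engines<cluster_id>.json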
--- a/IPython/parallel/apps/ipcontrollerapp.py
+++ b/IPython/parallel/apps/ipcontrollerapp.py
@@ -1,509 +1,528 @@
 #!/usr/bin/env python
 # encoding: utf-8
 """
 The IPython controller application.

 Authors:

 * Brian Granger
 * MinRK

 """

 #-----------------------------------------------------------------------------
 # Copyright (C) 2008-2011 The IPython Development Team
 #
 # Distributed under the terms of the BSD License. The full license is in
 # the file COPYING, distributed as part of this software.
 #-----------------------------------------------------------------------------

 #-----------------------------------------------------------------------------
 # Imports
 #-----------------------------------------------------------------------------

 from __future__ import with_statement

 import json
 import os
 import socket
 import stat
 import sys

 from multiprocessing import Process
 from signal import signal, SIGINT, SIGABRT, SIGTERM

 import zmq
 from zmq.devices import ProcessMonitoredQueue
 from zmq.log.handlers import PUBHandler

 from IPython.core.profiledir import ProfileDir

 from IPython.parallel.apps.baseapp import (
     BaseParallelApplication,
     base_aliases,
     base_flags,
     catch_config_error,
 )
 from IPython.utils.importstring import import_item
 from IPython.utils.traitlets import Instance, Unicode, Bool, List, Dict, TraitError

 from IPython.zmq.session import (
     Session, session_aliases, session_flags, default_secure
 )

 from IPython.parallel.controller.heartmonitor import HeartMonitor
 from IPython.parallel.controller.hub import HubFactory
 from IPython.parallel.controller.scheduler import TaskScheduler,launch_scheduler
 from IPython.parallel.controller.sqlitedb import SQLiteDB

 from IPython.parallel.util import split_url, disambiguate_url

 # conditional import of MongoDB backend class

 try:
     from IPython.parallel.controller.mongodb import MongoDB
 except ImportError:
     maybe_mongo = []
 else:
     maybe_mongo = [MongoDB]


 #-----------------------------------------------------------------------------
 # Module level variables
 #-----------------------------------------------------------------------------


 #: The default config file name for this application
 default_config_file_name = u'ipcontroller_config.py'


 _description = """Start the IPython controller for parallel computing.

 The IPython controller provides a gateway between the IPython engines and
 clients. The controller needs to be started before the engines and can be
 configured using command line options or using a cluster directory. Cluster
 directories contain config, log and security files and are usually located in
 your ipython directory and named as "profile_name". See the `profile`
 and `profile-dir` options for details.
 """

 _examples = """
 ipcontroller --ip=192.168.0.1 --port=1000  # listen on ip, port for engines
 ipcontroller --scheme=pure  # use the pure zeromq scheduler
 """


 #-----------------------------------------------------------------------------
 # The main application
 #-----------------------------------------------------------------------------
 flags = {}
 flags.update(base_flags)
 flags.update({
     'usethreads' : ( {'IPControllerApp' : {'use_threads' : True}},
                     'Use threads instead of processes for the schedulers'),
     'sqlitedb' : ({'HubFactory' : {'db_class' : 'IPython.parallel.controller.sqlitedb.SQLiteDB'}},
                     'use the SQLiteDB backend'),
     'mongodb' : ({'HubFactory' : {'db_class' : 'IPython.parallel.controller.mongodb.MongoDB'}},
                     'use the MongoDB backend'),
     'dictdb' : ({'HubFactory' : {'db_class' : 'IPython.parallel.controller.dictdb.DictDB'}},
                     'use the in-memory DictDB backend'),
     'nodb' : ({'HubFactory' : {'db_class' : 'IPython.parallel.controller.dictdb.NoDB'}},
                     """use dummy DB backend, which doesn't store any information.

                     This is the default as of IPython 0.13.

                     To enable delayed or repeated retrieval of results from the Hub,
                     select one of the true db backends.
                     """),
     'reuse' : ({'IPControllerApp' : {'reuse_files' : True}},
-                    'reuse existing json connection files')
+                    'reuse existing json connection files'),
+    'restore' : ({'IPControllerApp' : {'restore_engines' : True, 'reuse_files' : True}},
+                    'Attempt to restore engines from a JSON file. '
+                    'For use when resuming a crashed controller'),
 })
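Each entry above pairs a config fragment with its help string; when the flag appears on the command line, the fragment is merged into the application's configuration. A simplified plain-dict sketch of that merge (the real work is done by IPython's config loader; apply_flag here is a hypothetical helper, not an IPython API, and flags['restore'] refers to the entry added in this diff):

    def apply_flag(config, flag_entry):
        # flag_entry is (config_fragment, help_string), as in the table above
        fragment, help_text = flag_entry
        for cls, traits in fragment.items():
            config.setdefault(cls, {}).update(traits)
        return config

    config = apply_flag({}, flags['restore'])
    # config == {'IPControllerApp': {'restore_engines': True, 'reuse_files': True}}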

 flags.update(session_flags)

 aliases = dict(
     ssh = 'IPControllerApp.ssh_server',
     enginessh = 'IPControllerApp.engine_ssh_server',
     location = 'IPControllerApp.location',

     url = 'HubFactory.url',
     ip = 'HubFactory.ip',
     transport = 'HubFactory.transport',
     port = 'HubFactory.regport',

     ping = 'HeartMonitor.period',

     scheme = 'TaskScheduler.scheme_name',
     hwm = 'TaskScheduler.hwm',
 )
 aliases.update(base_aliases)
 aliases.update(session_aliases)

 class IPControllerApp(BaseParallelApplication):

     name = u'ipcontroller'
     description = _description
     examples = _examples
     config_file_name = Unicode(default_config_file_name)
     classes = [ProfileDir, Session, HubFactory, TaskScheduler, HeartMonitor, SQLiteDB] + maybe_mongo

     # change default to True
     auto_create = Bool(True, config=True,
         help="""Whether to create profile dir if it doesn't exist.""")

     reuse_files = Bool(False, config=True,
         help="""Whether to reuse existing json connection files.
         If False, connection files will be removed on a clean exit.
         """
     )
+    restore_engines = Bool(False, config=True,
+        help="""Reload engine state from JSON file
+        """
+    )
     ssh_server = Unicode(u'', config=True,
         help="""ssh url for clients to use when connecting to the Controller
         processes. It should be of the form: [user@]server[:port]. The
         Controller's listening addresses must be accessible from the ssh server""",
     )
     engine_ssh_server = Unicode(u'', config=True,
         help="""ssh url for engines to use when connecting to the Controller
         processes. It should be of the form: [user@]server[:port]. The
         Controller's listening addresses must be accessible from the ssh server""",
     )
     location = Unicode(u'', config=True,
         help="""The external IP or domain name of the Controller, used for disambiguating
         engine and client connections.""",
     )
     import_statements = List([], config=True,
         help="import statements to be run at startup. Necessary in some environments"
     )

     use_threads = Bool(False, config=True,
         help='Use threads instead of processes for the schedulers',
     )

     engine_json_file = Unicode('ipcontroller-engine.json', config=True,
         help="JSON filename where engine connection info will be stored.")
     client_json_file = Unicode('ipcontroller-client.json', config=True,
         help="JSON filename where client connection info will be stored.")

     def _cluster_id_changed(self, name, old, new):
         super(IPControllerApp, self)._cluster_id_changed(name, old, new)
         self.engine_json_file = "%s-engine.json" % self.name
         self.client_json_file = "%s-client.json" % self.name


     # internal
     children = List()
     mq_class = Unicode('zmq.devices.ProcessMonitoredQueue')

     def _use_threads_changed(self, name, old, new):
         self.mq_class = 'zmq.devices.%sMonitoredQueue'%('Thread' if new else 'Process')

     write_connection_files = Bool(True,
         help="""Whether to write connection files to disk.
         True in all cases other than runs with `reuse_files=True` *after the first*
         """
     )

     aliases = Dict(aliases)
     flags = Dict(flags)


     def save_connection_dict(self, fname, cdict):
         """save a connection dict to json file."""
         c = self.config
         url = cdict['registration']
         location = cdict['location']

         if not location:
             try:
                 location = socket.gethostbyname_ex(socket.gethostname())[2][-1]
             except (socket.gaierror, IndexError):
                 self.log.warn("Could not identify this machine's IP, assuming 127.0.0.1."
                 " You may need to specify '--location=<external_ip_address>' to help"
                 " IPython decide when to connect via loopback.")
                 location = '127.0.0.1'
             cdict['location'] = location
         fname = os.path.join(self.profile_dir.security_dir, fname)
         self.log.info("writing connection info to %s", fname)
         with open(fname, 'w') as f:
             f.write(json.dumps(cdict, indent=2))
         os.chmod(fname, stat.S_IRUSR|stat.S_IWUSR)
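For reference, the connection dict saved here ends up as JSON holding roughly the fields that load_config_from_json below reads back. A sketch of ipcontroller-client.json written as a Python dict (all values invented; the exact key set comes from HubFactory.client_info plus the base dict built in init_hub):

    example_client_cfg = {
        'ssh' : '',                         # from self.ssh_server
        'interface' : 'tcp://127.0.0.1',    # transport://ip
        'registration' : 55672,             # port numbers assigned by the Hub
        'control' : 55673,
        'mux' : 55674,
        'task' : 55675,
        'iopub' : 55676,
        'notification' : 55677,
        'exec_key' : 'c4a1f6f0-...',        # session key (truncated here)
        'location' : '10.0.0.2',            # external IP of the controller
        'pack' : 'json',                    # serialization scheme
        'unpack' : 'json',
        'task_scheme' : 'leastload',        # read back by the Client
    }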

     def load_config_from_json(self):
         """load config from existing json connector files."""
         c = self.config
         self.log.debug("loading config from JSON")

         # load engine config

         fname = os.path.join(self.profile_dir.security_dir, self.engine_json_file)
         self.log.info("loading connection info from %s", fname)
         with open(fname) as f:
             ecfg = json.loads(f.read())

         # json gives unicode, Session.key wants bytes
         c.Session.key = ecfg['exec_key'].encode('ascii')

         xport,ip = ecfg['interface'].split('://')

         c.HubFactory.engine_ip = ip
         c.HubFactory.engine_transport = xport

         self.location = ecfg['location']
         if not self.engine_ssh_server:
             self.engine_ssh_server = ecfg['ssh']

         # load client config

         fname = os.path.join(self.profile_dir.security_dir, self.client_json_file)
         self.log.info("loading connection info from %s", fname)
         with open(fname) as f:
             ccfg = json.loads(f.read())

         for key in ('exec_key', 'registration', 'pack', 'unpack'):
             assert ccfg[key] == ecfg[key], "mismatch between engine and client info: %r" % key

         xport,addr = ccfg['interface'].split('://')

         c.HubFactory.client_transport = xport
         c.HubFactory.client_ip = addr
         if not self.ssh_server:
             self.ssh_server = ccfg['ssh']

         # load port config:
         c.HubFactory.regport = ecfg['registration']
         c.HubFactory.hb = (ecfg['hb_ping'], ecfg['hb_pong'])
         c.HubFactory.control = (ccfg['control'], ecfg['control'])
         c.HubFactory.mux = (ccfg['mux'], ecfg['mux'])
         c.HubFactory.task = (ccfg['task'], ecfg['task'])
         c.HubFactory.iopub = (ccfg['iopub'], ecfg['iopub'])
         c.HubFactory.notifier_port = ccfg['notification']

     def cleanup_connection_files(self):
         if self.reuse_files:
             self.log.debug("leaving JSON connection files for reuse")
             return
         self.log.debug("cleaning up JSON connection files")
         for f in (self.client_json_file, self.engine_json_file):
             f = os.path.join(self.profile_dir.security_dir, f)
             try:
                 os.remove(f)
             except Exception as e:
                 self.log.error("Failed to cleanup connection file: %s", e)
             else:
                 self.log.debug(u"removed %s", f)

     def load_secondary_config(self):
         """secondary config, loading from JSON and setting defaults"""
         if self.reuse_files:
             try:
                 self.load_config_from_json()
             except (AssertionError,IOError) as e:
                 self.log.error("Could not load config from JSON: %s" % e)
             else:
                 # successfully loaded config from JSON, and reuse=True
                 # no need to write back the same file
                 self.write_connection_files = False

         # switch Session.key default to secure
         default_secure(self.config)
         self.log.debug("Config changed")
         self.log.debug(repr(self.config))

     def init_hub(self):
         c = self.config

         self.do_import_statements()

         try:
             self.factory = HubFactory(config=c, log=self.log)
             # self.start_logging()
             self.factory.init_hub()
         except TraitError:
             raise
         except Exception:
             self.log.error("Couldn't construct the Controller", exc_info=True)
             self.exit(1)

         if self.write_connection_files:
             # save to new json config files
             f = self.factory
             base = {
                 'exec_key' : f.session.key.decode('ascii'),
                 'location' : self.location,
                 'pack' : f.session.packer,
                 'unpack' : f.session.unpacker,
             }

             cdict = {'ssh' : self.ssh_server}
             cdict.update(f.client_info)
             cdict.update(base)
             self.save_connection_dict(self.client_json_file, cdict)

             edict = {'ssh' : self.engine_ssh_server}
             edict.update(f.engine_info)
             edict.update(base)
             self.save_connection_dict(self.engine_json_file, edict)

+        fname = "engines%s.json" % self.cluster_id
+        self.factory.hub.engine_state_file = os.path.join(self.profile_dir.log_dir, fname)
+        if self.restore_engines:
+            self.factory.hub._load_engine_state()
+
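The engine-state file introduced here lives in the profile's log directory as engines<cluster_id>.json; note that the Hub is pointed at it unconditionally, but only reloads it when --restore is given. The file format belongs to the Hub code, which this diff does not show, so the following is a hypothetical sketch of the persistence pattern only:

    import json

    # hypothetical: the real reader is Hub._load_engine_state, not shown here
    def load_engine_state(path):
        with open(path) as f:
            state = json.load(f)
        # plausibly something like {'engines': {'0': 'engine-uuid', ...}}
        return state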
     def init_schedulers(self):
         children = self.children
         mq = import_item(str(self.mq_class))

         f = self.factory
+        ident = f.session.bsession
         # disambiguate url, in case of *
         monitor_url = disambiguate_url(f.monitor_url)
         # maybe_inproc = 'inproc://monitor' if self.use_threads else monitor_url
         # IOPub relay (in a Process)
         q = mq(zmq.PUB, zmq.SUB, zmq.PUB, b'N/A',b'iopub')
         q.bind_in(f.client_url('iopub'))
+        q.setsockopt_in(zmq.IDENTITY, ident+"_iopub")
         q.bind_out(f.engine_url('iopub'))
         q.setsockopt_out(zmq.SUBSCRIBE, b'')
         q.connect_mon(monitor_url)
         q.daemon=True
         children.append(q)

         # Multiplexer Queue (in a Process)
         q = mq(zmq.ROUTER, zmq.ROUTER, zmq.PUB, b'in', b'out')
         q.bind_in(f.client_url('mux'))
-        q.setsockopt_in(zmq.IDENTITY, b'mux')
+        q.setsockopt_in(zmq.IDENTITY, b'mux_in')
         q.bind_out(f.engine_url('mux'))
+        q.setsockopt_out(zmq.IDENTITY, b'mux_out')
         q.connect_mon(monitor_url)
         q.daemon=True
         children.append(q)

         # Control Queue (in a Process)
         q = mq(zmq.ROUTER, zmq.ROUTER, zmq.PUB, b'incontrol', b'outcontrol')
         q.bind_in(f.client_url('control'))
-        q.setsockopt_in(zmq.IDENTITY, b'control')
+        q.setsockopt_in(zmq.IDENTITY, b'control_in')
         q.bind_out(f.engine_url('control'))
+        q.setsockopt_out(zmq.IDENTITY, b'control_out')
         q.connect_mon(monitor_url)
         q.daemon=True
         children.append(q)
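A plausible reading of the identity changes above, given the commit message: the queue devices' ROUTER sockets now bind with fixed, distinct identities (mux_in/mux_out, control_in/control_out, and the session-derived iopub identity), so a restarted controller presents the same socket identities to engines and clients that reconnect. A minimal standalone pyzmq illustration of a ROUTER with a stable identity (not controller code; the endpoint is made up):

    import zmq

    ctx = zmq.Context.instance()
    router = ctx.socket(zmq.ROUTER)
    # binding with the same IDENTITY after a restart makes this socket look
    # like the same peer to anyone who reconnects and routes to b'mux_in'
    router.setsockopt(zmq.IDENTITY, b'mux_in')
    router.bind('tcp://127.0.0.1:5555')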
         try:
             scheme = self.config.TaskScheduler.scheme_name
         except AttributeError:
             scheme = TaskScheduler.scheme_name.get_default_value()
         # Task Queue (in a Process)
         if scheme == 'pure':
             self.log.warn("task::using pure DEALER Task scheduler")
             q = mq(zmq.ROUTER, zmq.DEALER, zmq.PUB, b'intask', b'outtask')
             # q.setsockopt_out(zmq.HWM, hub.hwm)
             q.bind_in(f.client_url('task'))
-            q.setsockopt_in(zmq.IDENTITY, b'task')
+            q.setsockopt_in(zmq.IDENTITY, b'task_in')
             q.bind_out(f.engine_url('task'))
+            q.setsockopt_out(zmq.IDENTITY, b'task_out')
             q.connect_mon(monitor_url)
             q.daemon=True
             children.append(q)
         elif scheme == 'none':
             self.log.warn("task::using no Task scheduler")

         else:
             self.log.info("task::using Python %s Task scheduler"%scheme)
             sargs = (f.client_url('task'), f.engine_url('task'),
-                        monitor_url, disambiguate_url(f.client_url('notification')))
+                        monitor_url, disambiguate_url(f.client_url('notification')),
+                        disambiguate_url(f.client_url('registration')),
+            )
             kwargs = dict(logname='scheduler', loglevel=self.log_level,
                             log_url = self.log_url, config=dict(self.config))
             if 'Process' in self.mq_class:
                 # run the Python scheduler in a Process
                 q = Process(target=launch_scheduler, args=sargs, kwargs=kwargs)
                 q.daemon=True
                 children.append(q)
             else:
                 # single-threaded Controller
                 kwargs['in_thread'] = True
                 launch_scheduler(*sargs, **kwargs)

     def terminate_children(self):
         child_procs = []
         for child in self.children:
             if isinstance(child, ProcessMonitoredQueue):
                 child_procs.append(child.launcher)
             elif isinstance(child, Process):
                 child_procs.append(child)
         if child_procs:
             self.log.critical("terminating children...")
             for child in child_procs:
                 try:
                     child.terminate()
                 except OSError:
                     # already dead
                     pass

     def handle_signal(self, sig, frame):
         self.log.critical("Received signal %i, shutting down", sig)
         self.terminate_children()
         self.loop.stop()

     def init_signal(self):
         for sig in (SIGINT, SIGABRT, SIGTERM):
             signal(sig, self.handle_signal)

     def do_import_statements(self):
         statements = self.import_statements
         for s in statements:
             try:
                 self.log.info("Executing statement: '%s'" % s)
                 exec s in globals(), locals()
             except:
                 self.log.error("Error running statement: %s" % s)

     def forward_logging(self):
         if self.log_url:
             self.log.info("Forwarding logging to %s"%self.log_url)
             context = zmq.Context.instance()
             lsock = context.socket(zmq.PUB)
             lsock.connect(self.log_url)
             handler = PUBHandler(lsock)
             handler.root_topic = 'controller'
             handler.setLevel(self.log_level)
             self.log.addHandler(handler)

     @catch_config_error
     def initialize(self, argv=None):
         super(IPControllerApp, self).initialize(argv)
         self.forward_logging()
         self.load_secondary_config()
         self.init_hub()
         self.init_schedulers()

     def start(self):
         # Start the subprocesses:
         self.factory.start()
         # children must be started before signals are setup,
         # otherwise signal-handling will fire multiple times
         for child in self.children:
             child.start()
         self.init_signal()

         self.write_pid_file(overwrite=True)

         try:
             self.factory.loop.start()
         except KeyboardInterrupt:
             self.log.critical("Interrupted, Exiting...\n")
         finally:
             self.cleanup_connection_files()



 def launch_new_instance():
     """Create and run the IPython controller"""
     if sys.platform == 'win32':
         # make sure we don't get called from a multiprocessing subprocess
         # this can result in infinite Controllers being started on Windows
         # which doesn't have a proper fork, so multiprocessing is wonky

         # this only comes up when IPython has been installed using vanilla
         # setuptools, and *not* distribute.
         import multiprocessing
         p = multiprocessing.current_process()
         # the main process has name 'MainProcess'
         # subprocesses will have names like 'Process-1'
         if p.name != 'MainProcess':
             # we are a subprocess, don't start another Controller!
             return
     app = IPControllerApp.instance()
     app.initialize()
     app.start()


 if __name__ == '__main__':
     launch_new_instance()
--- a/IPython/parallel/client/client.py
+++ b/IPython/parallel/client/client.py
@@ -1,1699 +1,1700 @@
1 """A semi-synchronous Client for the ZMQ cluster
1 """A semi-synchronous Client for the ZMQ cluster
2
2
3 Authors:
3 Authors:
4
4
5 * MinRK
5 * MinRK
6 """
6 """
7 #-----------------------------------------------------------------------------
7 #-----------------------------------------------------------------------------
8 # Copyright (C) 2010-2011 The IPython Development Team
8 # Copyright (C) 2010-2011 The IPython Development Team
9 #
9 #
10 # Distributed under the terms of the BSD License. The full license is in
10 # Distributed under the terms of the BSD License. The full license is in
11 # the file COPYING, distributed as part of this software.
11 # the file COPYING, distributed as part of this software.
12 #-----------------------------------------------------------------------------
12 #-----------------------------------------------------------------------------
13
13
14 #-----------------------------------------------------------------------------
14 #-----------------------------------------------------------------------------
15 # Imports
15 # Imports
16 #-----------------------------------------------------------------------------
16 #-----------------------------------------------------------------------------
17
17
18 import os
18 import os
19 import json
19 import json
20 import sys
20 import sys
21 from threading import Thread, Event
21 from threading import Thread, Event
22 import time
22 import time
23 import warnings
23 import warnings
24 from datetime import datetime
24 from datetime import datetime
25 from getpass import getpass
25 from getpass import getpass
26 from pprint import pprint
26 from pprint import pprint
27
27
28 pjoin = os.path.join
28 pjoin = os.path.join
29
29
30 import zmq
30 import zmq
31 # from zmq.eventloop import ioloop, zmqstream
31 # from zmq.eventloop import ioloop, zmqstream
32
32
33 from IPython.config.configurable import MultipleInstanceError
33 from IPython.config.configurable import MultipleInstanceError
34 from IPython.core.application import BaseIPythonApplication
34 from IPython.core.application import BaseIPythonApplication
35 from IPython.core.profiledir import ProfileDir, ProfileDirError
35 from IPython.core.profiledir import ProfileDir, ProfileDirError
36
36
37 from IPython.utils.coloransi import TermColors
37 from IPython.utils.coloransi import TermColors
38 from IPython.utils.jsonutil import rekey
38 from IPython.utils.jsonutil import rekey
39 from IPython.utils.localinterfaces import LOCAL_IPS
39 from IPython.utils.localinterfaces import LOCAL_IPS
40 from IPython.utils.path import get_ipython_dir
40 from IPython.utils.path import get_ipython_dir
41 from IPython.utils.py3compat import cast_bytes
41 from IPython.utils.py3compat import cast_bytes
42 from IPython.utils.traitlets import (HasTraits, Integer, Instance, Unicode,
42 from IPython.utils.traitlets import (HasTraits, Integer, Instance, Unicode,
43 Dict, List, Bool, Set, Any)
43 Dict, List, Bool, Set, Any)
44 from IPython.external.decorator import decorator
44 from IPython.external.decorator import decorator
45 from IPython.external.ssh import tunnel
45 from IPython.external.ssh import tunnel
46
46
47 from IPython.parallel import Reference
47 from IPython.parallel import Reference
48 from IPython.parallel import error
48 from IPython.parallel import error
49 from IPython.parallel import util
49 from IPython.parallel import util
50
50
51 from IPython.zmq.session import Session, Message
51 from IPython.zmq.session import Session, Message
52
52
53 from .asyncresult import AsyncResult, AsyncHubResult
53 from .asyncresult import AsyncResult, AsyncHubResult
54 from .view import DirectView, LoadBalancedView
54 from .view import DirectView, LoadBalancedView
55
55
56 if sys.version_info[0] >= 3:
56 if sys.version_info[0] >= 3:
57 # xrange is used in a couple 'isinstance' tests in py2
57 # xrange is used in a couple 'isinstance' tests in py2
58 # should be just 'range' in 3k
58 # should be just 'range' in 3k
59 xrange = range
59 xrange = range
60
60
61 #--------------------------------------------------------------------------
61 #--------------------------------------------------------------------------
62 # Decorators for Client methods
62 # Decorators for Client methods
63 #--------------------------------------------------------------------------
63 #--------------------------------------------------------------------------
64
64
65 @decorator
65 @decorator
66 def spin_first(f, self, *args, **kwargs):
66 def spin_first(f, self, *args, **kwargs):
67 """Call spin() to sync state prior to calling the method."""
67 """Call spin() to sync state prior to calling the method."""
68 self.spin()
68 self.spin()
69 return f(self, *args, **kwargs)
69 return f(self, *args, **kwargs)
70
70
71
71
72 #--------------------------------------------------------------------------
72 #--------------------------------------------------------------------------
73 # Classes
73 # Classes
74 #--------------------------------------------------------------------------
74 #--------------------------------------------------------------------------
75
75
76
76
77 class ExecuteReply(object):
77 class ExecuteReply(object):
78 """wrapper for finished Execute results"""
78 """wrapper for finished Execute results"""
79 def __init__(self, msg_id, content, metadata):
79 def __init__(self, msg_id, content, metadata):
80 self.msg_id = msg_id
80 self.msg_id = msg_id
81 self._content = content
81 self._content = content
82 self.execution_count = content['execution_count']
82 self.execution_count = content['execution_count']
83 self.metadata = metadata
83 self.metadata = metadata
84
84
85 def __getitem__(self, key):
85 def __getitem__(self, key):
86 return self.metadata[key]
86 return self.metadata[key]
87
87
88 def __getattr__(self, key):
88 def __getattr__(self, key):
89 if key not in self.metadata:
89 if key not in self.metadata:
90 raise AttributeError(key)
90 raise AttributeError(key)
91 return self.metadata[key]
91 return self.metadata[key]
92
92
93 def __repr__(self):
93 def __repr__(self):
94 pyout = self.metadata['pyout'] or {'data':{}}
94 pyout = self.metadata['pyout'] or {'data':{}}
95 text_out = pyout['data'].get('text/plain', '')
95 text_out = pyout['data'].get('text/plain', '')
96 if len(text_out) > 32:
96 if len(text_out) > 32:
97 text_out = text_out[:29] + '...'
97 text_out = text_out[:29] + '...'
98
98
99 return "<ExecuteReply[%i]: %s>" % (self.execution_count, text_out)
99 return "<ExecuteReply[%i]: %s>" % (self.execution_count, text_out)
100
100
101 def _repr_pretty_(self, p, cycle):
101 def _repr_pretty_(self, p, cycle):
102 pyout = self.metadata['pyout'] or {'data':{}}
102 pyout = self.metadata['pyout'] or {'data':{}}
103 text_out = pyout['data'].get('text/plain', '')
103 text_out = pyout['data'].get('text/plain', '')
104
104
105 if not text_out:
105 if not text_out:
106 return
106 return
107
107
108 try:
108 try:
109 ip = get_ipython()
109 ip = get_ipython()
110 except NameError:
110 except NameError:
111 colors = "NoColor"
111 colors = "NoColor"
112 else:
112 else:
113 colors = ip.colors
113 colors = ip.colors
114
114
115 if colors == "NoColor":
115 if colors == "NoColor":
116 out = normal = ""
116 out = normal = ""
117 else:
117 else:
118 out = TermColors.Red
118 out = TermColors.Red
119 normal = TermColors.Normal
119 normal = TermColors.Normal
120
120
121 if '\n' in text_out and not text_out.startswith('\n'):
121 if '\n' in text_out and not text_out.startswith('\n'):
122 # add newline for multiline reprs
122 # add newline for multiline reprs
123 text_out = '\n' + text_out
123 text_out = '\n' + text_out
124
124
125 p.text(
125 p.text(
126 out + u'Out[%i:%i]: ' % (
126 out + u'Out[%i:%i]: ' % (
127 self.metadata['engine_id'], self.execution_count
127 self.metadata['engine_id'], self.execution_count
128 ) + normal + text_out
128 ) + normal + text_out
129 )
129 )
130
130
131 def _repr_html_(self):
131 def _repr_html_(self):
132 pyout = self.metadata['pyout'] or {'data':{}}
132 pyout = self.metadata['pyout'] or {'data':{}}
133 return pyout['data'].get("text/html")
133 return pyout['data'].get("text/html")
134
134
135 def _repr_latex_(self):
135 def _repr_latex_(self):
136 pyout = self.metadata['pyout'] or {'data':{}}
136 pyout = self.metadata['pyout'] or {'data':{}}
137 return pyout['data'].get("text/latex")
137 return pyout['data'].get("text/latex")
138
138
139 def _repr_json_(self):
139 def _repr_json_(self):
140 pyout = self.metadata['pyout'] or {'data':{}}
140 pyout = self.metadata['pyout'] or {'data':{}}
141 return pyout['data'].get("application/json")
141 return pyout['data'].get("application/json")
142
142
143 def _repr_javascript_(self):
143 def _repr_javascript_(self):
144 pyout = self.metadata['pyout'] or {'data':{}}
144 pyout = self.metadata['pyout'] or {'data':{}}
145 return pyout['data'].get("application/javascript")
145 return pyout['data'].get("application/javascript")
146
146
147 def _repr_png_(self):
147 def _repr_png_(self):
148 pyout = self.metadata['pyout'] or {'data':{}}
148 pyout = self.metadata['pyout'] or {'data':{}}
149 return pyout['data'].get("image/png")
149 return pyout['data'].get("image/png")
150
150
151 def _repr_jpeg_(self):
151 def _repr_jpeg_(self):
152 pyout = self.metadata['pyout'] or {'data':{}}
152 pyout = self.metadata['pyout'] or {'data':{}}
153 return pyout['data'].get("image/jpeg")
153 return pyout['data'].get("image/jpeg")
154
154
155 def _repr_svg_(self):
155 def _repr_svg_(self):
156 pyout = self.metadata['pyout'] or {'data':{}}
156 pyout = self.metadata['pyout'] or {'data':{}}
157 return pyout['data'].get("image/svg+xml")
157 return pyout['data'].get("image/svg+xml")
158
158
159
159
160 class Metadata(dict):
160 class Metadata(dict):
161 """Subclass of dict for initializing metadata values.
161 """Subclass of dict for initializing metadata values.
162
162
163 Attribute access works on keys.
163 Attribute access works on keys.
164
164
165 These objects have a strict set of keys - errors will raise if you try
165 These objects have a strict set of keys - errors will raise if you try
166 to add new keys.
166 to add new keys.
167 """
167 """
168 def __init__(self, *args, **kwargs):
168 def __init__(self, *args, **kwargs):
169 dict.__init__(self)
169 dict.__init__(self)
170 md = {'msg_id' : None,
170 md = {'msg_id' : None,
171 'submitted' : None,
171 'submitted' : None,
172 'started' : None,
172 'started' : None,
173 'completed' : None,
173 'completed' : None,
174 'received' : None,
174 'received' : None,
175 'engine_uuid' : None,
175 'engine_uuid' : None,
176 'engine_id' : None,
176 'engine_id' : None,
177 'follow' : None,
177 'follow' : None,
178 'after' : None,
178 'after' : None,
179 'status' : None,
179 'status' : None,
180
180
181 'pyin' : None,
181 'pyin' : None,
182 'pyout' : None,
182 'pyout' : None,
183 'pyerr' : None,
183 'pyerr' : None,
184 'stdout' : '',
184 'stdout' : '',
185 'stderr' : '',
185 'stderr' : '',
186 'outputs' : [],
186 'outputs' : [],
187 'outputs_ready' : False,
187 'outputs_ready' : False,
188 }
188 }
189 self.update(md)
189 self.update(md)
190 self.update(dict(*args, **kwargs))
190 self.update(dict(*args, **kwargs))
191
191
192 def __getattr__(self, key):
192 def __getattr__(self, key):
193 """getattr aliased to getitem"""
193 """getattr aliased to getitem"""
194 if key in self.iterkeys():
194 if key in self.iterkeys():
195 return self[key]
195 return self[key]
196 else:
196 else:
197 raise AttributeError(key)
197 raise AttributeError(key)
198
198
199 def __setattr__(self, key, value):
199 def __setattr__(self, key, value):
200 """setattr aliased to setitem, with strict"""
200 """setattr aliased to setitem, with strict"""
201 if key in self.iterkeys():
201 if key in self.iterkeys():
202 self[key] = value
202 self[key] = value
203 else:
203 else:
204 raise AttributeError(key)
204 raise AttributeError(key)
205
205
206 def __setitem__(self, key, value):
206 def __setitem__(self, key, value):
207 """strict static key enforcement"""
207 """strict static key enforcement"""
208 if key in self.iterkeys():
208 if key in self.iterkeys():
209 dict.__setitem__(self, key, value)
209 dict.__setitem__(self, key, value)
210 else:
210 else:
211 raise KeyError(key)
211 raise KeyError(key)
212
212
213
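A quick usage sketch of Metadata's strict-key behavior (Python 2, matching the iterkeys() calls above):

    md = Metadata(status='ok')
    assert md.status == 'ok'        # attribute access is aliased to item access
    assert md['stdout'] == ''       # defaults are pre-populated
    md.engine_id = 0                # setting a known key works
    try:
        md['no_such_key'] = 1       # unknown keys are rejected
    except KeyError:
        pass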
213
214 class Client(HasTraits):
214 class Client(HasTraits):
215 """A semi-synchronous client to the IPython ZMQ cluster
215 """A semi-synchronous client to the IPython ZMQ cluster
216
216
217 Parameters
217 Parameters
218 ----------
218 ----------
219
219
220 url_file : str/unicode; path to ipcontroller-client.json
220 url_file : str/unicode; path to ipcontroller-client.json
221 This JSON file should contain all the information needed to connect to a cluster,
221 This JSON file should contain all the information needed to connect to a cluster,
222 and is likely the only argument needed.
222 and is likely the only argument needed.
223 Connection information for the Hub's registration. If a json connector
223 Connection information for the Hub's registration. If a json connector
224 file is given, then likely no further configuration is necessary.
224 file is given, then likely no further configuration is necessary.
225 [Default: use profile]
225 [Default: use profile]
226 profile : bytes
226 profile : bytes
227 The name of the Cluster profile to be used to find connector information.
227 The name of the Cluster profile to be used to find connector information.
228 If run from an IPython application, the default profile will be the same
228 If run from an IPython application, the default profile will be the same
229 as the running application, otherwise it will be 'default'.
229 as the running application, otherwise it will be 'default'.
230 context : zmq.Context
230 context : zmq.Context
231 Pass an existing zmq.Context instance, otherwise the client will create its own.
231 Pass an existing zmq.Context instance, otherwise the client will create its own.
232 debug : bool
232 debug : bool
233 flag for lots of message printing for debug purposes
233 flag for lots of message printing for debug purposes
234 timeout : int/float
234 timeout : int/float
235 time (in seconds) to wait for connection replies from the Hub
235 time (in seconds) to wait for connection replies from the Hub
236 [Default: 10]
236 [Default: 10]
237
237
238 #-------------- session related args ----------------
238 #-------------- session related args ----------------
239
239
240 config : Config object
240 config : Config object
241 If specified, this will be relayed to the Session for configuration
241 If specified, this will be relayed to the Session for configuration
242 username : str
242 username : str
243 set username for the session object
243 set username for the session object
244
244
245 #-------------- ssh related args ----------------
245 #-------------- ssh related args ----------------
246 # These are args for configuring the ssh tunnel to be used
246 # These are args for configuring the ssh tunnel to be used
247 # credentials are used to forward connections over ssh to the Controller
247 # credentials are used to forward connections over ssh to the Controller
248 # Note that the ip given in `addr` needs to be relative to sshserver
248 # Note that the ip given in `addr` needs to be relative to sshserver
249 # The most basic case is to leave addr as pointing to localhost (127.0.0.1),
249 # The most basic case is to leave addr as pointing to localhost (127.0.0.1),
250 # and set sshserver as the same machine the Controller is on. However,
250 # and set sshserver as the same machine the Controller is on. However,
251 # the only requirement is that sshserver is able to see the Controller
251 # the only requirement is that sshserver is able to see the Controller
252 # (i.e. is within the same trusted network).
252 # (i.e. is within the same trusted network).
253
253
254 sshserver : str
254 sshserver : str
255 A string of the form passed to ssh, i.e. 'server.tld' or 'user@server.tld:port'
255 A string of the form passed to ssh, i.e. 'server.tld' or 'user@server.tld:port'
256 If keyfile or password is specified, and this is not, it will default to
256 If keyfile or password is specified, and this is not, it will default to
257 the ip given in addr.
257 the ip given in addr.
258 sshkey : str; path to ssh private key file
258 sshkey : str; path to ssh private key file
259 This specifies a key to be used in ssh login, default None.
259 This specifies a key to be used in ssh login, default None.
260 Regular default ssh keys will be used without specifying this argument.
260 Regular default ssh keys will be used without specifying this argument.
261 password : str
261 password : str
262 Your ssh password to sshserver. Note that if this is left None,
262 Your ssh password to sshserver. Note that if this is left None,
263 you will be prompted for it if passwordless key based login is unavailable.
263 you will be prompted for it if passwordless key based login is unavailable.
264 paramiko : bool
264 paramiko : bool
265 flag for whether to use paramiko instead of shell ssh for tunneling.
265 flag for whether to use paramiko instead of shell ssh for tunneling.
266 [default: True on win32, False else]
266 [default: True on win32, False else]
267
267
268
268
269 Attributes
269 Attributes
270 ----------
270 ----------
271
271
272 ids : list of int engine IDs
272 ids : list of int engine IDs
273 requesting the ids attribute always synchronizes
273 requesting the ids attribute always synchronizes
274 the registration state. To request ids without synchronization,
274 the registration state. To request ids without synchronization,
275 use semi-private _ids attributes.
275 use semi-private _ids attributes.
276
276
277 history : list of msg_ids
277 history : list of msg_ids
278 a list of msg_ids, keeping track of all the execution
278 a list of msg_ids, keeping track of all the execution
279 messages you have submitted in order.
279 messages you have submitted in order.
280
280
281 outstanding : set of msg_ids
281 outstanding : set of msg_ids
282 a set of msg_ids that have been submitted, but whose
282 a set of msg_ids that have been submitted, but whose
283 results have not yet been received.
283 results have not yet been received.
284
284
285 results : dict
285 results : dict
286 a dict of all our results, keyed by msg_id
286 a dict of all our results, keyed by msg_id
287
287
288 block : bool
288 block : bool
289 determines default behavior when block not specified
289 determines default behavior when block not specified
290 in execution methods
290 in execution methods
291
291
292 Methods
292 Methods
293 -------
293 -------
294
294
295 spin
295 spin
296 flushes incoming results and registration state changes
296 flushes incoming results and registration state changes
297 control methods spin, and requesting `ids` also ensures up to date
297 control methods spin, and requesting `ids` also ensures up to date
298
298
299 wait
299 wait
300 wait on one or more msg_ids
300 wait on one or more msg_ids
301
301
302 execution methods
302 execution methods
303 apply
303 apply
304 legacy: execute, run
304 legacy: execute, run
305
305
306 data movement
306 data movement
307 push, pull, scatter, gather
307 push, pull, scatter, gather
308
308
309 query methods
309 query methods
310 queue_status, get_result, purge, result_status
310 queue_status, get_result, purge, result_status
311
311
312 control methods
312 control methods
313 abort, shutdown
313 abort, shutdown
314
314
315 """
315 """


    block = Bool(False)
    outstanding = Set()
    results = Instance('collections.defaultdict', (dict,))
    metadata = Instance('collections.defaultdict', (Metadata,))
    history = List()
    debug = Bool(False)
    _spin_thread = Any()
    _stop_spinning = Any()

    profile = Unicode()
    def _profile_default(self):
        if BaseIPythonApplication.initialized():
            # an IPython app *might* be running, try to get its profile
            try:
                return BaseIPythonApplication.instance().profile
            except (AttributeError, MultipleInstanceError):
                # could be a *different* subclass of config.Application,
                # which would raise one of these two errors.
                return u'default'
        else:
            return u'default'


    _outstanding_dict = Instance('collections.defaultdict', (set,))
    _ids = List()
    _connected = Bool(False)
    _ssh = Bool(False)
    _context = Instance('zmq.Context')
    _config = Dict()
    _engines = Instance(util.ReverseDict, (), {})
    # _hub_socket = Instance('zmq.Socket')
    _query_socket = Instance('zmq.Socket')
    _control_socket = Instance('zmq.Socket')
    _iopub_socket = Instance('zmq.Socket')
    _notification_socket = Instance('zmq.Socket')
    _mux_socket = Instance('zmq.Socket')
    _task_socket = Instance('zmq.Socket')
    _task_scheme = Unicode()
    _closed = False
    _ignored_control_replies = Integer(0)
    _ignored_hub_replies = Integer(0)

    def __new__(self, *args, **kw):
        # don't raise on positional args
        return HasTraits.__new__(self, **kw)

    def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=None,
            context=None, debug=False,
            sshserver=None, sshkey=None, password=None, paramiko=None,
            timeout=10, **extra_args
            ):
        if profile:
            super(Client, self).__init__(debug=debug, profile=profile)
        else:
            super(Client, self).__init__(debug=debug)
        if context is None:
            context = zmq.Context.instance()
        self._context = context
        self._stop_spinning = Event()

        if 'url_or_file' in extra_args:
            url_file = extra_args['url_or_file']
            warnings.warn("url_or_file arg no longer supported, use url_file", DeprecationWarning)

        if url_file and util.is_url(url_file):
            raise ValueError("single urls cannot be specified, url-files must be used.")

        self._setup_profile_dir(self.profile, profile_dir, ipython_dir)

        if self._cd is not None:
            if url_file is None:
                url_file = pjoin(self._cd.security_dir, 'ipcontroller-client.json')
        if url_file is None:
            raise ValueError(
                "I can't find enough information to connect to a hub!"
                " Please specify at least one of url_file or profile."
            )

        with open(url_file) as f:
            cfg = json.load(f)

        self._task_scheme = cfg['task_scheme']

        # sync defaults from args, json:
        if sshserver:
            cfg['ssh'] = sshserver

        location = cfg.setdefault('location', None)

        proto,addr = cfg['interface'].split('://')
        addr = util.disambiguate_ip_address(addr)
        cfg['interface'] = "%s://%s" % (proto, addr)

        # turn interface,port into full urls:
        for key in ('control', 'task', 'mux', 'iopub', 'notification', 'registration'):
            cfg[key] = cfg['interface'] + ':%i' % cfg[key]

        url = cfg['registration']

        if location is not None and addr == '127.0.0.1':
            # location specified, and connection is expected to be local
            if location not in LOCAL_IPS and not sshserver:
                # load ssh from JSON *only* if the controller is not on
                # this machine
                sshserver = cfg['ssh']
            if location not in LOCAL_IPS and not sshserver:
                # warn if no ssh specified, but SSH is probably needed
                # This is only a warning, because the most likely cause
                # is a local Controller on a laptop whose IP is dynamic
                warnings.warn("""
            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@%s')
            or instruct your controller to listen on an external IP."""%location,
                RuntimeWarning)
        elif not sshserver:
            # otherwise sync with cfg
            sshserver = cfg['ssh']

        self._config = cfg

        self._ssh = bool(sshserver or sshkey or password)
        if self._ssh and sshserver is None:
            # default to ssh via localhost
            sshserver = addr
        if self._ssh and password is None:
            if tunnel.try_passwordless_ssh(sshserver, sshkey, paramiko):
                password = False
            else:
                password = getpass("SSH Password for %s: "%sshserver)
        ssh_kwargs = dict(keyfile=sshkey, password=password, paramiko=paramiko)

        # configure and construct the session
        extra_args['packer'] = cfg['pack']
        extra_args['unpacker'] = cfg['unpack']
        extra_args['key'] = cfg['exec_key']

        self.session = Session(**extra_args)

        self._query_socket = self._context.socket(zmq.DEALER)

        if self._ssh:
            tunnel.tunnel_connection(self._query_socket, cfg['registration'], sshserver, **ssh_kwargs)
        else:
            self._query_socket.connect(cfg['registration'])

        self.session.debug = self.debug

        self._notification_handlers = {'registration_notification' : self._register_engine,
                                       'unregistration_notification' : self._unregister_engine,
                                       'shutdown_notification' : lambda msg: self.close(),
                                       }
        self._queue_handlers = {'execute_reply' : self._handle_execute_reply,
                                'apply_reply' : self._handle_apply_reply}
        self._connect(sshserver, ssh_kwargs, timeout)

        # last step: setup magics, if we are in IPython:

        try:
            ip = get_ipython()
        except NameError:
            return
        else:
            if 'px' not in ip.magics_manager.magics:
                # in IPython but we are the first Client.
                # activate a default view for parallel magics.
                self.activate()

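    # Sketch of the two connection entry points handled above (host and
    # path are illustrative):
    #
    #     from IPython.parallel import Client
    #
    #     # explicit connection file, e.g. copied from the controller:
    #     rc = Client('/path/to/ipcontroller-client.json')
    #     # or resolve it from a profile, tunneling via an SSH gateway:
    #     rc = Client(profile='default', sshserver='user@gateway.example.com')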
    def __del__(self):
        """cleanup sockets, but _not_ context."""
        self.close()

    def _setup_profile_dir(self, profile, profile_dir, ipython_dir):
        if ipython_dir is None:
            ipython_dir = get_ipython_dir()
        if profile_dir is not None:
            try:
                self._cd = ProfileDir.find_profile_dir(profile_dir)
                return
            except ProfileDirError:
                pass
        elif profile is not None:
            try:
                self._cd = ProfileDir.find_profile_dir_by_name(
                    ipython_dir, profile)
                return
            except ProfileDirError:
                pass
        self._cd = None

    def _update_engines(self, engines):
        """Update our engines dict and _ids from a dict of the form: {id:uuid}."""
        for k,v in engines.iteritems():
            eid = int(k)
            if eid not in self._engines:
                self._ids.append(eid)
            self._engines[eid] = v
        self._ids = sorted(self._ids)
        if sorted(self._engines.keys()) != range(len(self._engines)) and \
                        self._task_scheme == 'pure' and self._task_socket:
            self._stop_scheduling_tasks()

    def _stop_scheduling_tasks(self):
        """Stop scheduling tasks because an engine has been unregistered
        from a pure ZMQ scheduler.
        """
        self._task_socket.close()
        self._task_socket = None
        msg = "An engine has been unregistered, and we are using pure " +\
              "ZMQ task scheduling. Task farming will be disabled."
        if self.outstanding:
            msg += " If you were running tasks when this happened, " +\
                   "some `outstanding` msg_ids may never resolve."
        warnings.warn(msg, RuntimeWarning)

    def _build_targets(self, targets):
        """Turn valid target IDs or 'all' into two lists:
        (uuids, int_ids).
        """
        if not self._ids:
            # flush notification socket if no engines yet, just in case
            if not self.ids:
                raise error.NoEnginesRegistered("Can't build targets without any engines")

        if targets is None:
            targets = self._ids
        elif isinstance(targets, basestring):
            if targets.lower() == 'all':
                targets = self._ids
            else:
                raise TypeError("%r not valid str target, must be 'all'"%(targets))
        elif isinstance(targets, int):
            if targets < 0:
                targets = self.ids[targets]
            if targets not in self._ids:
                raise IndexError("No such engine: %i"%targets)
            targets = [targets]

        if isinstance(targets, slice):
            indices = range(len(self._ids))[targets]
            ids = self.ids
            targets = [ ids[i] for i in indices ]

        if not isinstance(targets, (tuple, list, xrange)):
            raise TypeError("targets by int/slice/collection of ints only, not %s"%(type(targets)))

        return [cast_bytes(self._engines[t]) for t in targets], list(targets)

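    # The normalization above is what lets the public API accept several
    # spellings of the same engine set; equivalent forms (sketch, four
    # engines assumed, with `rc` a connected Client):
    #
    #     rc.direct_view('all')   # 'all' resolves to every registered engine
    #     rc[-1]                  # negative ints index into rc.ids
    #     rc[[0, 1]]              # explicit list of engine ids
    #     rc[:2]                  # slices of the registered id list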
    def _connect(self, sshserver, ssh_kwargs, timeout):
        """setup all our socket connections to the cluster. This is called from
        __init__."""

        # Maybe allow reconnecting?
        if self._connected:
            return
        self._connected = True

        def connect_socket(s, url):
            # url = util.disambiguate_url(url, self._config['location'])
            if self._ssh:
                return tunnel.tunnel_connection(s, url, sshserver, **ssh_kwargs)
            else:
                return s.connect(url)

        self.session.send(self._query_socket, 'connection_request')
        # use Poller because zmq.select has wrong units in pyzmq 2.1.7
        poller = zmq.Poller()
        poller.register(self._query_socket, zmq.POLLIN)
        # poll expects milliseconds, timeout is seconds
        evts = poller.poll(timeout*1000)
        if not evts:
            raise error.TimeoutError("Hub connection request timed out")
        idents,msg = self.session.recv(self._query_socket, mode=0)
        if self.debug:
            pprint(msg)
        content = msg['content']
        # self._config['registration'] = dict(content)
        cfg = self._config
        if content['status'] == 'ok':
            self._mux_socket = self._context.socket(zmq.DEALER)
            connect_socket(self._mux_socket, cfg['mux'])

            self._task_socket = self._context.socket(zmq.DEALER)
            connect_socket(self._task_socket, cfg['task'])

            self._notification_socket = self._context.socket(zmq.SUB)
            self._notification_socket.setsockopt(zmq.SUBSCRIBE, b'')
            connect_socket(self._notification_socket, cfg['notification'])

            self._control_socket = self._context.socket(zmq.DEALER)
            connect_socket(self._control_socket, cfg['control'])

            self._iopub_socket = self._context.socket(zmq.SUB)
            self._iopub_socket.setsockopt(zmq.SUBSCRIBE, b'')
            connect_socket(self._iopub_socket, cfg['iopub'])

            self._update_engines(dict(content['engines']))
        else:
            self._connected = False
            raise Exception("Failed to connect!")

    #--------------------------------------------------------------------------
    # handlers and callbacks for incoming messages
    #--------------------------------------------------------------------------

    def _unwrap_exception(self, content):
        """unwrap exception, and remap engine_id to int."""
        e = error.unwrap_exception(content)
        # print e.traceback
        if e.engine_info:
            e_uuid = e.engine_info['engine_uuid']
            eid = self._engines[e_uuid]
            e.engine_info['engine_id'] = eid
        return e

    def _extract_metadata(self, header, parent, content):
        md = {'msg_id' : parent['msg_id'],
              'received' : datetime.now(),
              'engine_uuid' : header.get('engine', None),
              'follow' : parent.get('follow', []),
              'after' : parent.get('after', []),
              'status' : content['status'],
              }

        if md['engine_uuid'] is not None:
            md['engine_id'] = self._engines.get(md['engine_uuid'], None)

        if 'date' in parent:
            md['submitted'] = parent['date']
        if 'started' in header:
            md['started'] = header['started']
        if 'date' in header:
            md['completed'] = header['date']
        return md

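    # The dict built here backs `self.metadata`, which AsyncResult timing
    # attributes read from; inspecting it directly (sketch, with `rc` a
    # connected Client):
    #
    #     ar = rc[:].apply_async(sum, range(10))
    #     ar.get()
    #     md = rc.metadata[ar.msg_ids[0]]
    #     print md['engine_id'], md['completed'] - md['submitted']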
    def _register_engine(self, msg):
        """Register a new engine, and update our connection info."""
        content = msg['content']
        eid = content['id']
        d = {eid : content['uuid']}
        self._update_engines(d)

    def _unregister_engine(self, msg):
        """Unregister an engine that has died."""
        content = msg['content']
        eid = int(content['id'])
        if eid in self._ids:
            self._ids.remove(eid)
            uuid = self._engines.pop(eid)

            self._handle_stranded_msgs(eid, uuid)

        if self._task_socket and self._task_scheme == 'pure':
            self._stop_scheduling_tasks()

    def _handle_stranded_msgs(self, eid, uuid):
        """Handle messages known to be on an engine when the engine unregisters.

        It is possible that this will fire prematurely - that is, an engine will
        go down after completing a result, and the client will be notified
        of the unregistration and later receive the successful result.
        """

        outstanding = self._outstanding_dict[uuid]

        for msg_id in list(outstanding):
            if msg_id in self.results:
                # we already have the result, nothing to do
                continue
            try:
                raise error.EngineError("Engine %r died while running task %r"%(eid, msg_id))
            except:
                content = error.wrap_exception()
            # build a fake message:
            parent = {}
            header = {}
            parent['msg_id'] = msg_id
            header['engine'] = uuid
            header['date'] = datetime.now()
            msg = dict(parent_header=parent, header=header, content=content)
            self._handle_apply_reply(msg)

    def _handle_execute_reply(self, msg):
        """Save the reply to an execute_request into our results."""

        parent = msg['parent_header']
        msg_id = parent['msg_id']
        if msg_id not in self.outstanding:
            if msg_id in self.history:
                print ("got stale result: %s"%msg_id)
            else:
                print ("got unknown result: %s"%msg_id)
        else:
            self.outstanding.remove(msg_id)

        content = msg['content']
        header = msg['header']

        # construct metadata:
        md = self.metadata[msg_id]
        md.update(self._extract_metadata(header, parent, content))
        # is this redundant?
        self.metadata[msg_id] = md

        e_outstanding = self._outstanding_dict[md['engine_uuid']]
        if msg_id in e_outstanding:
            e_outstanding.remove(msg_id)

        # construct result:
        if content['status'] == 'ok':
            self.results[msg_id] = ExecuteReply(msg_id, content, md)
        elif content['status'] == 'aborted':
            self.results[msg_id] = error.TaskAborted(msg_id)
        elif content['status'] == 'resubmitted':
            # TODO: handle resubmission
            pass
        else:
            self.results[msg_id] = self._unwrap_exception(content)

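    # The stored replies are what callers of `execute` see; a sketch
    # (assuming this 0.13-era API, where 'ok' results are ExecuteReply
    # objects rather than raw dicts, and `rc` is a connected Client):
    #
    #     ar = rc[:].execute('x = 1 + 1')
    #     ar.get()                        # one ExecuteReply per engine
    #     print rc.results[ar.msg_ids[0]]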
    def _handle_apply_reply(self, msg):
        """Save the reply to an apply_request into our results."""
        parent = msg['parent_header']
        msg_id = parent['msg_id']
        if msg_id not in self.outstanding:
            if msg_id in self.history:
                print ("got stale result: %s"%msg_id)
                print self.results[msg_id]
                print msg
            else:
                print ("got unknown result: %s"%msg_id)
        else:
            self.outstanding.remove(msg_id)
        content = msg['content']
        header = msg['header']

        # construct metadata:
        md = self.metadata[msg_id]
        md.update(self._extract_metadata(header, parent, content))
        # is this redundant?
        self.metadata[msg_id] = md

        e_outstanding = self._outstanding_dict[md['engine_uuid']]
        if msg_id in e_outstanding:
            e_outstanding.remove(msg_id)

        # construct result:
        if content['status'] == 'ok':
            self.results[msg_id] = util.unserialize_object(msg['buffers'])[0]
        elif content['status'] == 'aborted':
            self.results[msg_id] = error.TaskAborted(msg_id)
        elif content['status'] == 'resubmitted':
            # TODO: handle resubmission
            pass
        else:
            self.results[msg_id] = self._unwrap_exception(content)

    def _flush_notifications(self):
        """Flush notifications of engine registrations waiting
        in ZMQ queue."""
        idents,msg = self.session.recv(self._notification_socket, mode=zmq.NOBLOCK)
        while msg is not None:
            if self.debug:
                pprint(msg)
            msg_type = msg['header']['msg_type']
            handler = self._notification_handlers.get(msg_type, None)
            if handler is None:
                # msg is a dict, so use the extracted msg_type
                raise Exception("Unhandled message type: %s" % msg_type)
            else:
                handler(msg)
            idents,msg = self.session.recv(self._notification_socket, mode=zmq.NOBLOCK)

    def _flush_results(self, sock):
        """Flush task or queue results waiting in ZMQ queue."""
        idents,msg = self.session.recv(sock, mode=zmq.NOBLOCK)
        while msg is not None:
            if self.debug:
                pprint(msg)
            msg_type = msg['header']['msg_type']
            handler = self._queue_handlers.get(msg_type, None)
            if handler is None:
                # msg is a dict, so use the extracted msg_type
                raise Exception("Unhandled message type: %s" % msg_type)
            else:
                handler(msg)
            idents,msg = self.session.recv(sock, mode=zmq.NOBLOCK)

    def _flush_control(self, sock):
        """Flush replies from the control channel waiting
        in the ZMQ queue.

        Currently: ignore them."""
        if self._ignored_control_replies <= 0:
            return
        idents,msg = self.session.recv(sock, mode=zmq.NOBLOCK)
        while msg is not None:
            self._ignored_control_replies -= 1
            if self.debug:
                pprint(msg)
            idents,msg = self.session.recv(sock, mode=zmq.NOBLOCK)

    def _flush_ignored_control(self):
        """flush ignored control replies"""
        while self._ignored_control_replies > 0:
            self.session.recv(self._control_socket)
            self._ignored_control_replies -= 1

    def _flush_ignored_hub_replies(self):
        ident,msg = self.session.recv(self._query_socket, mode=zmq.NOBLOCK)
        while msg is not None:
            ident,msg = self.session.recv(self._query_socket, mode=zmq.NOBLOCK)

    def _flush_iopub(self, sock):
        """Flush replies from the iopub channel waiting
        in the ZMQ queue.
        """
        idents,msg = self.session.recv(sock, mode=zmq.NOBLOCK)
        while msg is not None:
            if self.debug:
                pprint(msg)
            parent = msg['parent_header']
            # ignore IOPub messages with no parent.
            # Caused by print statements or warnings from before the first execution.
            if not parent:
                # receive the next message before continuing,
                # otherwise this loop would never terminate
                idents,msg = self.session.recv(sock, mode=zmq.NOBLOCK)
                continue
            msg_id = parent['msg_id']
            content = msg['content']
            header = msg['header']
            msg_type = msg['header']['msg_type']

            # init metadata:
            md = self.metadata[msg_id]

            if msg_type == 'stream':
                name = content['name']
                s = md[name] or ''
                md[name] = s + content['data']
            elif msg_type == 'pyerr':
                md.update({'pyerr' : self._unwrap_exception(content)})
            elif msg_type == 'pyin':
                md.update({'pyin' : content['code']})
            elif msg_type == 'display_data':
                md['outputs'].append(content)
            elif msg_type == 'pyout':
                md['pyout'] = content
            elif msg_type == 'status':
                # idle message comes after all outputs
                if content['execution_state'] == 'idle':
                    md['outputs_ready'] = True
            else:
                # unhandled msg_type
                pass

            # redundant?
            self.metadata[msg_id] = md

            idents,msg = self.session.recv(sock, mode=zmq.NOBLOCK)

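    # The stream data folded into metadata here is how remote print output
    # becomes visible locally; a sketch (with `rc` a connected Client):
    #
    #     ar = rc[:].execute("print 'hello'")
    #     ar.get()
    #     rc.spin()    # flush iopub so the output is recorded
    #     print rc.metadata[ar.msg_ids[0]]['stdout']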
    #--------------------------------------------------------------------------
    # len, getitem
    #--------------------------------------------------------------------------

    def __len__(self):
        """len(client) returns # of engines."""
        return len(self.ids)

    def __getitem__(self, key):
        """index access returns DirectView multiplexer objects

        Must be int, slice, or list/tuple/xrange of ints"""
        if not isinstance(key, (int, slice, tuple, list, xrange)):
            raise TypeError("key by int/slice/iterable of ints only, not %s"%(type(key)))
        else:
            return self.direct_view(key)

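    # Indexing is the usual way to construct views; illustrative forms
    # (at least four engines assumed, with `rc` a connected Client):
    #
    #     e0 = rc[0]        # DirectView on a single engine
    #     even = rc[::2]    # every other engine
    #     dv = rc[:]        # all engines
    #     print len(rc), dv.targets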
    #--------------------------------------------------------------------------
    # Begin public methods
    #--------------------------------------------------------------------------

    @property
    def ids(self):
        """Always up-to-date ids property."""
        self._flush_notifications()
        # always copy:
        return list(self._ids)

    def activate(self, targets='all', suffix=''):
        """Create a DirectView and register it with IPython magics

        Defines the magics `%px, %autopx, %pxresult, %%px`

        Parameters
        ----------

        targets: int, list of ints, or 'all'
            The engines on which the view's magics will run
        suffix: str [default: '']
            The suffix, if any, for the magics.  This allows you to have
            multiple views associated with parallel magics at the same time.

            e.g. ``rc.activate(targets=0, suffix='0')`` will give you
            the magics ``%px0``, ``%pxresult0``, etc. for running magics just
            on engine 0.
        """
        view = self.direct_view(targets)
        view.block = True
        view.activate(suffix)
        return view

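    # For example, from an IPython session, a second suffixed set of magics
    # can coexist with the defaults (sketch; engine id illustrative):
    #
    #     rc.activate()                        # %px, %autopx, %pxresult, %%px
    #     rc.activate(targets=0, suffix='0')   # %px0 etc., engine 0 only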
    def close(self):
        if self._closed:
            return
        self.stop_spin_thread()
        snames = filter(lambda n: n.endswith('socket'), dir(self))
        for socket in map(lambda name: getattr(self, name), snames):
            if isinstance(socket, zmq.Socket) and not socket.closed:
                socket.close()
        self._closed = True

    def _spin_every(self, interval=1):
        """target func for use in spin_thread"""
        while True:
            if self._stop_spinning.is_set():
                return
            time.sleep(interval)
            self.spin()

    def spin_thread(self, interval=1):
        """call Client.spin() in a background thread on some regular interval

        This helps ensure that messages don't pile up too much in the zmq queue
        while you are working on other things, or just leaving an idle terminal.

        It also helps limit potential padding of the `received` timestamp
        on AsyncResult objects, used for timings.

        Parameters
        ----------

        interval : float, optional
            The interval on which to spin the client in the background thread
            (simply passed to time.sleep).

        Notes
        -----

        For precision timing, you may want to use this method to put a bound
        on the jitter (in seconds) in `received` timestamps used
        in AsyncResult.wall_time.

        """
        if self._spin_thread is not None:
            self.stop_spin_thread()
        self._stop_spinning.clear()
        self._spin_thread = Thread(target=self._spin_every, args=(interval,))
        self._spin_thread.daemon = True
        self._spin_thread.start()

    def stop_spin_thread(self):
        """stop background spin_thread, if any"""
        if self._spin_thread is not None:
            self._stop_spinning.set()
            self._spin_thread.join()
            self._spin_thread = None

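    # Sketch of using the background thread to bound timestamp jitter while
    # the main thread stays busy (with `rc` a connected Client):
    #
    #     import time
    #
    #     rc.spin_thread(interval=0.1)            # spin every 100ms
    #     ar = rc[:].apply_async(time.sleep, 1)
    #     ar.get()
    #     print ar.wall_time                      # jitter bounded by ~0.1s
    #     rc.stop_spin_thread()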
    def spin(self):
        """Flush any registration notifications and execution results
        waiting in the ZMQ queue.
        """
        if self._notification_socket:
            self._flush_notifications()
        if self._iopub_socket:
            self._flush_iopub(self._iopub_socket)
        if self._mux_socket:
            self._flush_results(self._mux_socket)
        if self._task_socket:
            self._flush_results(self._task_socket)
        if self._control_socket:
            self._flush_control(self._control_socket)
        if self._query_socket:
            self._flush_ignored_hub_replies()

    def wait(self, jobs=None, timeout=-1):
        """waits on one or more `jobs`, for up to `timeout` seconds.

        Parameters
        ----------

        jobs : int, str, or list of ints and/or strs, or one or more AsyncResult objects
            ints are indices to self.history
            strs are msg_ids
            default: wait on all outstanding messages
        timeout : float
            a time in seconds, after which to give up.
            default is -1, which means no timeout

        Returns
        -------

        True : when all msg_ids are done
        False : timeout reached, some msg_ids still outstanding
        """
        tic = time.time()
        if jobs is None:
            theids = self.outstanding
        else:
            if isinstance(jobs, (int, basestring, AsyncResult)):
                jobs = [jobs]
            theids = set()
            for job in jobs:
                if isinstance(job, int):
                    # index access
                    job = self.history[job]
                elif isinstance(job, AsyncResult):
                    map(theids.add, job.msg_ids)
                    continue
                theids.add(job)
        if not theids.intersection(self.outstanding):
            return True
        self.spin()
        while theids.intersection(self.outstanding):
            if timeout >= 0 and ( time.time()-tic ) > timeout:
                break
            time.sleep(1e-3)
            self.spin()
        return len(theids.intersection(self.outstanding)) == 0

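    # `wait` accepts msg_ids, indices into `history`, or AsyncResults
    # interchangeably; a sketch (with `rc` a connected Client):
    #
    #     ars = [rc[:].apply_async(pow, i, 2) for i in range(4)]
    #     if not rc.wait(ars, timeout=5):         # False if timeout expired
    #         print "still outstanding:", rc.outstanding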
    #--------------------------------------------------------------------------
    # Control methods
    #--------------------------------------------------------------------------

    @spin_first
    def clear(self, targets=None, block=None):
        """Clear the namespace in target(s)."""
        block = self.block if block is None else block
        targets = self._build_targets(targets)[0]
        for t in targets:
            self.session.send(self._control_socket, 'clear_request', content={}, ident=t)
        error = False
        if block:
            self._flush_ignored_control()
            for i in range(len(targets)):
                idents,msg = self.session.recv(self._control_socket, 0)
                if self.debug:
                    pprint(msg)
                if msg['content']['status'] != 'ok':
                    error = self._unwrap_exception(msg['content'])
        else:
            self._ignored_control_replies += len(targets)
        if error:
            raise error


    @spin_first
    def abort(self, jobs=None, targets=None, block=None):
        """Abort specific jobs from the execution queues of target(s).

        This is a mechanism to prevent jobs that have already been submitted
        from executing.

        Parameters
        ----------

        jobs : msg_id, list of msg_ids, or AsyncResult
            The jobs to be aborted

            If unspecified/None: abort all outstanding jobs.

        """
        block = self.block if block is None else block
        jobs = jobs if jobs is not None else list(self.outstanding)
        targets = self._build_targets(targets)[0]

        msg_ids = []
        if isinstance(jobs, (basestring,AsyncResult)):
            jobs = [jobs]
        bad_ids = filter(lambda obj: not isinstance(obj, (basestring, AsyncResult)), jobs)
        if bad_ids:
            raise TypeError("Invalid msg_id type %r, expected str or AsyncResult"%bad_ids[0])
        for j in jobs:
            if isinstance(j, AsyncResult):
                msg_ids.extend(j.msg_ids)
            else:
                msg_ids.append(j)
        content = dict(msg_ids=msg_ids)
        for t in targets:
            self.session.send(self._control_socket, 'abort_request',
                    content=content, ident=t)
        error = False
        if block:
            self._flush_ignored_control()
            for i in range(len(targets)):
                idents,msg = self.session.recv(self._control_socket, 0)
                if self.debug:
                    pprint(msg)
                if msg['content']['status'] != 'ok':
                    error = self._unwrap_exception(msg['content'])
        else:
            self._ignored_control_replies += len(targets)
        if error:
            raise error

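    # Sketch of aborting queued-but-not-yet-running work on the task
    # scheduler (with `rc` a connected Client); aborted AsyncResults raise
    # error.TaskAborted on .get():
    #
    #     import time
    #
    #     view = rc.load_balanced_view()
    #     ars = [view.apply_async(time.sleep, 10) for _ in range(8)]
    #     rc.abort(ars[4:], block=True)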
    @spin_first
    def shutdown(self, targets='all', restart=False, hub=False, block=None):
        """Terminates one or more engine processes, optionally including the hub.

        Parameters
        ----------

        targets: list of ints or 'all' [default: all]
            Which engines to shut down.
        hub: bool [default: False]
            Whether to include the Hub.  hub=True implies targets='all'.
        block: bool [default: self.block]
            Whether to wait for clean shutdown replies or not.
        restart: bool [default: False]
            NOT IMPLEMENTED
            whether to restart engines after shutting them down.
        """

        if restart:
            raise NotImplementedError("Engine restart is not yet implemented")

        block = self.block if block is None else block
        if hub:
            targets = 'all'
        targets = self._build_targets(targets)[0]
        for t in targets:
            self.session.send(self._control_socket, 'shutdown_request',
                    content={'restart':restart}, ident=t)
        error = False
        if block or hub:
            self._flush_ignored_control()
            for i in range(len(targets)):
                idents,msg = self.session.recv(self._control_socket, 0)
                if self.debug:
                    pprint(msg)
                if msg['content']['status'] != 'ok':
                    error = self._unwrap_exception(msg['content'])
        else:
            self._ignored_control_replies += len(targets)

        if hub:
            time.sleep(0.25)
            self.session.send(self._query_socket, 'shutdown_request')
            idents,msg = self.session.recv(self._query_socket, 0)
            if self.debug:
                pprint(msg)
            if msg['content']['status'] != 'ok':
                error = self._unwrap_exception(msg['content'])

        if error:
            raise error

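    # Full teardown, hub included (sketch, with `rc` a connected Client):
    #
    #     rc.shutdown(hub=True, block=True)   # hub=True implies targets='all'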
    #--------------------------------------------------------------------------
    # Execution related methods
    #--------------------------------------------------------------------------

    def _maybe_raise(self, result):
        """wrapper for maybe raising an exception if apply failed."""
        if isinstance(result, error.RemoteError):
            raise result

        return result

1182 def send_apply_request(self, socket, f, args=None, kwargs=None, subheader=None, track=False,
1183 def send_apply_request(self, socket, f, args=None, kwargs=None, subheader=None, track=False,
1183 ident=None):
1184 ident=None):
1184 """construct and send an apply message via a socket.
1185 """construct and send an apply message via a socket.
1185
1186
1186 This is the principal method with which all engine execution is performed by views.
1187 This is the principal method with which all engine execution is performed by views.
1187 """
1188 """
1188
1189
1189 if self._closed:
1190 if self._closed:
1190 raise RuntimeError("Client cannot be used after its sockets have been closed")
1191 raise RuntimeError("Client cannot be used after its sockets have been closed")
1191
1192
1192 # defaults:
1193 # defaults:
1193 args = args if args is not None else []
1194 args = args if args is not None else []
1194 kwargs = kwargs if kwargs is not None else {}
1195 kwargs = kwargs if kwargs is not None else {}
1195 subheader = subheader if subheader is not None else {}
1196 subheader = subheader if subheader is not None else {}
1196
1197
1197 # validate arguments
1198 # validate arguments
1198 if not callable(f) and not isinstance(f, Reference):
1199 if not callable(f) and not isinstance(f, Reference):
1199 raise TypeError("f must be callable, not %s"%type(f))
1200 raise TypeError("f must be callable, not %s"%type(f))
1200 if not isinstance(args, (tuple, list)):
1201 if not isinstance(args, (tuple, list)):
1201 raise TypeError("args must be tuple or list, not %s"%type(args))
1202 raise TypeError("args must be tuple or list, not %s"%type(args))
1202 if not isinstance(kwargs, dict):
1203 if not isinstance(kwargs, dict):
1203 raise TypeError("kwargs must be dict, not %s"%type(kwargs))
1204 raise TypeError("kwargs must be dict, not %s"%type(kwargs))
1204 if not isinstance(subheader, dict):
1205 if not isinstance(subheader, dict):
1205 raise TypeError("subheader must be dict, not %s"%type(subheader))
1206 raise TypeError("subheader must be dict, not %s"%type(subheader))
1206
1207
1207 bufs = util.pack_apply_message(f,args,kwargs)
1208 bufs = util.pack_apply_message(f,args,kwargs)
1208
1209
1209 msg = self.session.send(socket, "apply_request", buffers=bufs, ident=ident,
1210 msg = self.session.send(socket, "apply_request", buffers=bufs, ident=ident,
1210 subheader=subheader, track=track)
1211 subheader=subheader, track=track)
1211
1212
1212 msg_id = msg['header']['msg_id']
1213 msg_id = msg['header']['msg_id']
1213 self.outstanding.add(msg_id)
1214 self.outstanding.add(msg_id)
1214 if ident:
1215 if ident:
1215 # possibly routed to a specific engine
1216 # possibly routed to a specific engine
1216 if isinstance(ident, list):
1217 if isinstance(ident, list):
1217 ident = ident[-1]
1218 ident = ident[-1]
1218 if ident in self._engines.values():
1219 if ident in self._engines.values():
1219 # save for later, in case of engine death
1220 # save for later, in case of engine death
1220 self._outstanding_dict[ident].add(msg_id)
1221 self._outstanding_dict[ident].add(msg_id)
1221 self.history.append(msg_id)
1222 self.history.append(msg_id)
1222 self.metadata[msg_id]['submitted'] = datetime.now()
1223 self.metadata[msg_id]['submitted'] = datetime.now()
1223
1224
1224 return msg
1225 return msg
1225
1226
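send_apply_request is the primitive beneath View.apply rather than something user code normally calls. A minimal sketch of the usual indirect path, assuming a running ipcontroller with at least one engine::

    from IPython.parallel import Client

    rc = Client()                  # connect with the default profile
    dv = rc.direct_view('all')     # DirectView over the MUX socket

    # View.apply_async ends up calling Client.send_apply_request
    ar = dv.apply_async(lambda x: x * 2, 21)
    print(ar.get())                # one result per engine, e.g. [42, 42]
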
1226 def send_execute_request(self, socket, code, silent=True, subheader=None, ident=None):
1227 def send_execute_request(self, socket, code, silent=True, subheader=None, ident=None):
1227 """construct and send an execute request via a socket.
1228 """construct and send an execute request via a socket.
1228
1229
1229 """
1230 """
1230
1231
1231 if self._closed:
1232 if self._closed:
1232 raise RuntimeError("Client cannot be used after its sockets have been closed")
1233 raise RuntimeError("Client cannot be used after its sockets have been closed")
1233
1234
1234 # defaults:
1235 # defaults:
1235 subheader = subheader if subheader is not None else {}
1236 subheader = subheader if subheader is not None else {}
1236
1237
1237 # validate arguments
1238 # validate arguments
1238 if not isinstance(code, basestring):
1239 if not isinstance(code, basestring):
1239 raise TypeError("code must be text, not %s" % type(code))
1240 raise TypeError("code must be text, not %s" % type(code))
1240 if not isinstance(subheader, dict):
1241 if not isinstance(subheader, dict):
1241 raise TypeError("subheader must be dict, not %s" % type(subheader))
1242 raise TypeError("subheader must be dict, not %s" % type(subheader))
1242
1243
1243 content = dict(code=code, silent=bool(silent), user_variables=[], user_expressions={})
1244 content = dict(code=code, silent=bool(silent), user_variables=[], user_expressions={})
1244
1245
1245
1246
1246 msg = self.session.send(socket, "execute_request", content=content, ident=ident,
1247 msg = self.session.send(socket, "execute_request", content=content, ident=ident,
1247 subheader=subheader)
1248 subheader=subheader)
1248
1249
1249 msg_id = msg['header']['msg_id']
1250 msg_id = msg['header']['msg_id']
1250 self.outstanding.add(msg_id)
1251 self.outstanding.add(msg_id)
1251 if ident:
1252 if ident:
1252 # possibly routed to a specific engine
1253 # possibly routed to a specific engine
1253 if isinstance(ident, list):
1254 if isinstance(ident, list):
1254 ident = ident[-1]
1255 ident = ident[-1]
1255 if ident in self._engines.values():
1256 if ident in self._engines.values():
1256 # save for later, in case of engine death
1257 # save for later, in case of engine death
1257 self._outstanding_dict[ident].add(msg_id)
1258 self._outstanding_dict[ident].add(msg_id)
1258 self.history.append(msg_id)
1259 self.history.append(msg_id)
1259 self.metadata[msg_id]['submitted'] = datetime.now()
1260 self.metadata[msg_id]['submitted'] = datetime.now()
1260
1261
1261 return msg
1262 return msg
1262
1263
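send_execute_request is likewise normally reached through a view. A sketch using DirectView.execute, under the same running-cluster assumption::

    from IPython.parallel import Client

    rc = Client()
    dv = rc[:]                          # all engines currently registered

    # DirectView.execute builds on Client.send_execute_request
    ar = dv.execute('a = 2 + 2', silent=False)
    ar.wait()
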
1263 #--------------------------------------------------------------------------
1264 #--------------------------------------------------------------------------
1264 # construct a View object
1265 # construct a View object
1265 #--------------------------------------------------------------------------
1266 #--------------------------------------------------------------------------
1266
1267
1267 def load_balanced_view(self, targets=None):
1268 def load_balanced_view(self, targets=None):
1268 """construct a DirectView object.
1269 """construct a DirectView object.
1269
1270
1270 If no arguments are specified, create a LoadBalancedView
1271 If no arguments are specified, create a LoadBalancedView
1271 using all engines.
1272 using all engines.
1272
1273
1273 Parameters
1274 Parameters
1274 ----------
1275 ----------
1275
1276
1276 targets : list, slice, int, etc. [default: use all engines]
1277 targets : list, slice, int, etc. [default: use all engines]
1277 The subset of engines across which to load-balance
1278 The subset of engines across which to load-balance
1278 """
1279 """
1279 if targets == 'all':
1280 if targets == 'all':
1280 targets = None
1281 targets = None
1281 if targets is not None:
1282 if targets is not None:
1282 targets = self._build_targets(targets)[1]
1283 targets = self._build_targets(targets)[1]
1283 return LoadBalancedView(client=self, socket=self._task_socket, targets=targets)
1284 return LoadBalancedView(client=self, socket=self._task_socket, targets=targets)
1284
1285
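A sketch of constructing load-balanced views; the targets=[0, 2] line assumes those engine ids exist::

    from IPython.parallel import Client

    rc = Client()
    lview = rc.load_balanced_view()               # balance across all engines
    sub = rc.load_balanced_view(targets=[0, 2])   # restrict to a subset

    ar = lview.map_async(lambda x: x ** 2, range(8))
    print(ar.get())                               # [0, 1, 4, ..., 49]
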
1285 def direct_view(self, targets='all'):
1286 def direct_view(self, targets='all'):
1286 """construct a DirectView object.
1287 """construct a DirectView object.
1287
1288
1288 If no targets are specified, create a DirectView using all engines.
1289 If no targets are specified, create a DirectView using all engines.
1289
1290
1290 rc.direct_view('all') is distinguished from rc[:] in that 'all' will
1291 rc.direct_view('all') is distinguished from rc[:] in that 'all' will
1291 evaluate the target engines at each execution, whereas rc[:] will connect to
1292 evaluate the target engines at each execution, whereas rc[:] will connect to
1292 all *current* engines, and that list will not change.
1293 all *current* engines, and that list will not change.
1293
1294
1294 That is, 'all' will always use all engines, whereas rc[:] will not use
1295 That is, 'all' will always use all engines, whereas rc[:] will not use
1295 engines added after the DirectView is constructed.
1296 engines added after the DirectView is constructed.
1296
1297
1297 Parameters
1298 Parameters
1298 ----------
1299 ----------
1299
1300
1300 targets : list, slice, int, etc. [default: use all engines]
1301 targets : list, slice, int, etc. [default: use all engines]
1301 The engines to use for the View
1302 The engines to use for the View
1302 """
1303 """
1303 single = isinstance(targets, int)
1304 single = isinstance(targets, int)
1304 # allow 'all' to be lazily evaluated at each execution
1305 # allow 'all' to be lazily evaluated at each execution
1305 if targets != 'all':
1306 if targets != 'all':
1306 targets = self._build_targets(targets)[1]
1307 targets = self._build_targets(targets)[1]
1307 if single:
1308 if single:
1308 targets = targets[0]
1309 targets = targets[0]
1309 return DirectView(client=self, socket=self._mux_socket, targets=targets)
1310 return DirectView(client=self, socket=self._mux_socket, targets=targets)
1310
1311
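The distinction is easiest to see side by side; a sketch::

    from IPython.parallel import Client

    rc = Client()
    fixed = rc[:]                  # snapshot: engines registered right now
    lazy = rc.direct_view('all')   # 'all' is re-evaluated on each execution

    # if more engines register later, only `lazy` will pick them up
    fixed.apply_sync(lambda: 'hello')
    lazy.apply_sync(lambda: 'hello')
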
1311 #--------------------------------------------------------------------------
1312 #--------------------------------------------------------------------------
1312 # Query methods
1313 # Query methods
1313 #--------------------------------------------------------------------------
1314 #--------------------------------------------------------------------------
1314
1315
1315 @spin_first
1316 @spin_first
1316 def get_result(self, indices_or_msg_ids=None, block=None):
1317 def get_result(self, indices_or_msg_ids=None, block=None):
1317 """Retrieve a result by msg_id or history index, wrapped in an AsyncResult object.
1318 """Retrieve a result by msg_id or history index, wrapped in an AsyncResult object.
1318
1319
1319 If the client already has the results, no request to the Hub will be made.
1320 If the client already has the results, no request to the Hub will be made.
1320
1321
1321 This is a convenient way to construct AsyncResult objects, which are wrappers
1322 This is a convenient way to construct AsyncResult objects, which are wrappers
1322 that include metadata about execution, and allow for awaiting results that
1323 that include metadata about execution, and allow for awaiting results that
1323 were not submitted by this Client.
1324 were not submitted by this Client.
1324
1325
1325 It can also be a convenient way to retrieve the metadata associated with
1326 It can also be a convenient way to retrieve the metadata associated with
1326 blocking execution, since it always retrieves the metadata along with the result.
1327 blocking execution, since it always retrieves the metadata along with the result.
1327
1328
1328 Examples
1329 Examples
1329 --------
1330 --------
1330 ::
1331 ::
1331
1332
1332 In [10]: ar = client.get_result(msg_id)
1333 In [10]: ar = client.get_result(msg_id)
1333
1334
1334 Parameters
1335 Parameters
1335 ----------
1336 ----------
1336
1337
1337 indices_or_msg_ids : integer history index, str msg_id, or list of either
1338 indices_or_msg_ids : integer history index, str msg_id, or list of either
1338 The indices or msg_ids of the results to be retrieved
1339 The indices or msg_ids of the results to be retrieved
1339
1340
1340 block : bool
1341 block : bool
1341 Whether to wait for the result to be done
1342 Whether to wait for the result to be done
1342
1343
1343 Returns
1344 Returns
1344 -------
1345 -------
1345
1346
1346 AsyncResult
1347 AsyncResult
1347 A single AsyncResult object will always be returned.
1348 A single AsyncResult object will always be returned.
1348
1349
1349 AsyncHubResult
1350 AsyncHubResult
1350 A subclass of AsyncResult that retrieves results from the Hub
1351 A subclass of AsyncResult that retrieves results from the Hub
1351
1352
1352 """
1353 """
1353 block = self.block if block is None else block
1354 block = self.block if block is None else block
1354 if indices_or_msg_ids is None:
1355 if indices_or_msg_ids is None:
1355 indices_or_msg_ids = -1
1356 indices_or_msg_ids = -1
1356
1357
1357 if not isinstance(indices_or_msg_ids, (list,tuple)):
1358 if not isinstance(indices_or_msg_ids, (list,tuple)):
1358 indices_or_msg_ids = [indices_or_msg_ids]
1359 indices_or_msg_ids = [indices_or_msg_ids]
1359
1360
1360 theids = []
1361 theids = []
1361 for id in indices_or_msg_ids:
1362 for id in indices_or_msg_ids:
1362 if isinstance(id, int):
1363 if isinstance(id, int):
1363 id = self.history[id]
1364 id = self.history[id]
1364 if not isinstance(id, basestring):
1365 if not isinstance(id, basestring):
1365 raise TypeError("indices must be str or int, not %r"%id)
1366 raise TypeError("indices must be str or int, not %r"%id)
1366 theids.append(id)
1367 theids.append(id)
1367
1368
1368 local_ids = filter(lambda msg_id: msg_id in self.history or msg_id in self.results, theids)
1369 local_ids = filter(lambda msg_id: msg_id in self.history or msg_id in self.results, theids)
1369 remote_ids = filter(lambda msg_id: msg_id not in local_ids, theids)
1370 remote_ids = filter(lambda msg_id: msg_id not in local_ids, theids)
1370
1371
1371 if remote_ids:
1372 if remote_ids:
1372 ar = AsyncHubResult(self, msg_ids=theids)
1373 ar = AsyncHubResult(self, msg_ids=theids)
1373 else:
1374 else:
1374 ar = AsyncResult(self, msg_ids=theids)
1375 ar = AsyncResult(self, msg_ids=theids)
1375
1376
1376 if block:
1377 if block:
1377 ar.wait()
1378 ar.wait()
1378
1379
1379 return ar
1380 return ar
1380
1381
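A sketch of fetching a result by msg_id, including from a Client other than the one that submitted it::

    from IPython.parallel import Client

    rc = Client()
    ar = rc[:].apply_async(sum, [1, 2, 3])
    msg_id = ar.msg_ids[0]

    # later: any Client connected to the same Hub can fetch it
    ar2 = rc.get_result(msg_id, block=True)
    print(ar2.get(), ar2.metadata)
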
1381 @spin_first
1382 @spin_first
1382 def resubmit(self, indices_or_msg_ids=None, subheader=None, block=None):
1383 def resubmit(self, indices_or_msg_ids=None, subheader=None, block=None):
1383 """Resubmit one or more tasks.
1384 """Resubmit one or more tasks.
1384
1385
1385 In-flight tasks may not be resubmitted.
1386 In-flight tasks may not be resubmitted.
1386
1387
1387 Parameters
1388 Parameters
1388 ----------
1389 ----------
1389
1390
1390 indices_or_msg_ids : integer history index, str msg_id, or list of either
1391 indices_or_msg_ids : integer history index, str msg_id, or list of either
1391 The indices or msg_ids of the tasks to be resubmitted
1392 The indices or msg_ids of the tasks to be resubmitted
1392
1393
1393 block : bool
1394 block : bool
1394 Whether to wait for the result to be done
1395 Whether to wait for the result to be done
1395
1396
1396 Returns
1397 Returns
1397 -------
1398 -------
1398
1399
1399 AsyncHubResult
1400 AsyncHubResult
1400 A subclass of AsyncResult that retrieves results from the Hub
1401 A subclass of AsyncResult that retrieves results from the Hub
1401
1402
1402 """
1403 """
1403 block = self.block if block is None else block
1404 block = self.block if block is None else block
1404 if indices_or_msg_ids is None:
1405 if indices_or_msg_ids is None:
1405 indices_or_msg_ids = -1
1406 indices_or_msg_ids = -1
1406
1407
1407 if not isinstance(indices_or_msg_ids, (list,tuple)):
1408 if not isinstance(indices_or_msg_ids, (list,tuple)):
1408 indices_or_msg_ids = [indices_or_msg_ids]
1409 indices_or_msg_ids = [indices_or_msg_ids]
1409
1410
1410 theids = []
1411 theids = []
1411 for id in indices_or_msg_ids:
1412 for id in indices_or_msg_ids:
1412 if isinstance(id, int):
1413 if isinstance(id, int):
1413 id = self.history[id]
1414 id = self.history[id]
1414 if not isinstance(id, basestring):
1415 if not isinstance(id, basestring):
1415 raise TypeError("indices must be str or int, not %r"%id)
1416 raise TypeError("indices must be str or int, not %r"%id)
1416 theids.append(id)
1417 theids.append(id)
1417
1418
1418 content = dict(msg_ids = theids)
1419 content = dict(msg_ids = theids)
1419
1420
1420 self.session.send(self._query_socket, 'resubmit_request', content)
1421 self.session.send(self._query_socket, 'resubmit_request', content)
1421
1422
1422 zmq.select([self._query_socket], [], [])
1423 zmq.select([self._query_socket], [], [])
1423 idents,msg = self.session.recv(self._query_socket, zmq.NOBLOCK)
1424 idents,msg = self.session.recv(self._query_socket, zmq.NOBLOCK)
1424 if self.debug:
1425 if self.debug:
1425 pprint(msg)
1426 pprint(msg)
1426 content = msg['content']
1427 content = msg['content']
1427 if content['status'] != 'ok':
1428 if content['status'] != 'ok':
1428 raise self._unwrap_exception(content)
1429 raise self._unwrap_exception(content)
1429 mapping = content['resubmitted']
1430 mapping = content['resubmitted']
1430 new_ids = [ mapping[msg_id] for msg_id in theids ]
1431 new_ids = [ mapping[msg_id] for msg_id in theids ]
1431
1432
1432 ar = AsyncHubResult(self, msg_ids=new_ids)
1433 ar = AsyncHubResult(self, msg_ids=new_ids)
1433
1434
1434 if block:
1435 if block:
1435 ar.wait()
1436 ar.wait()
1436
1437
1437 return ar
1438 return ar
1438
1439
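A sketch of resubmitting a finished task; the get() ensures the task is no longer in flight::

    from IPython.parallel import Client

    rc = Client()
    lview = rc.load_balanced_view()

    ar = lview.apply_async(lambda: 42)
    ar.get()                          # finished: now safe to resubmit

    ar2 = rc.resubmit(ar.msg_ids)     # AsyncHubResult with *new* msg_ids
    print(ar2.get())
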
1439 @spin_first
1440 @spin_first
1440 def result_status(self, msg_ids, status_only=True):
1441 def result_status(self, msg_ids, status_only=True):
1441 """Check on the status of the result(s) of the apply request with `msg_ids`.
1442 """Check on the status of the result(s) of the apply request with `msg_ids`.
1442
1443
1443 If status_only is False, then the actual results will be retrieved, else
1444 If status_only is False, then the actual results will be retrieved, else
1444 only the status of the results will be checked.
1445 only the status of the results will be checked.
1445
1446
1446 Parameters
1447 Parameters
1447 ----------
1448 ----------
1448
1449
1449 msg_ids : list of msg_ids
1450 msg_ids : list of msg_ids
1450 if int:
1451 if int:
1451 Passed as index to self.history for convenience.
1452 Passed as index to self.history for convenience.
1452 status_only : bool (default: True)
1453 status_only : bool (default: True)
1453 if False:
1454 if False:
1454 Retrieve the actual results of completed tasks.
1455 Retrieve the actual results of completed tasks.
1455
1456
1456 Returns
1457 Returns
1457 -------
1458 -------
1458
1459
1459 results : dict
1460 results : dict
1460 There will always be the keys 'pending' and 'completed', which will
1461 There will always be the keys 'pending' and 'completed', which will
1461 be lists of msg_ids that are incomplete or complete. If `status_only`
1462 be lists of msg_ids that are incomplete or complete. If `status_only`
1462 is False, then completed results will be keyed by their `msg_id`.
1463 is False, then completed results will be keyed by their `msg_id`.
1463 """
1464 """
1464 if not isinstance(msg_ids, (list,tuple)):
1465 if not isinstance(msg_ids, (list,tuple)):
1465 msg_ids = [msg_ids]
1466 msg_ids = [msg_ids]
1466
1467
1467 theids = []
1468 theids = []
1468 for msg_id in msg_ids:
1469 for msg_id in msg_ids:
1469 if isinstance(msg_id, int):
1470 if isinstance(msg_id, int):
1470 msg_id = self.history[msg_id]
1471 msg_id = self.history[msg_id]
1471 if not isinstance(msg_id, basestring):
1472 if not isinstance(msg_id, basestring):
1472 raise TypeError("msg_ids must be str, not %r"%msg_id)
1473 raise TypeError("msg_ids must be str, not %r"%msg_id)
1473 theids.append(msg_id)
1474 theids.append(msg_id)
1474
1475
1475 completed = []
1476 completed = []
1476 local_results = {}
1477 local_results = {}
1477
1478
1478 # comment this block out to temporarily disable local shortcut:
1479 # comment this block out to temporarily disable local shortcut:
1479 for msg_id in list(theids): # iterate over a copy; theids is mutated below
1480 for msg_id in list(theids): # iterate over a copy; theids is mutated below
1480 if msg_id in self.results:
1481 if msg_id in self.results:
1481 completed.append(msg_id)
1482 completed.append(msg_id)
1482 local_results[msg_id] = self.results[msg_id]
1483 local_results[msg_id] = self.results[msg_id]
1483 theids.remove(msg_id)
1484 theids.remove(msg_id)
1484
1485
1485 if theids: # some not locally cached
1486 if theids: # some not locally cached
1486 content = dict(msg_ids=theids, status_only=status_only)
1487 content = dict(msg_ids=theids, status_only=status_only)
1487 msg = self.session.send(self._query_socket, "result_request", content=content)
1488 msg = self.session.send(self._query_socket, "result_request", content=content)
1488 zmq.select([self._query_socket], [], [])
1489 zmq.select([self._query_socket], [], [])
1489 idents,msg = self.session.recv(self._query_socket, zmq.NOBLOCK)
1490 idents,msg = self.session.recv(self._query_socket, zmq.NOBLOCK)
1490 if self.debug:
1491 if self.debug:
1491 pprint(msg)
1492 pprint(msg)
1492 content = msg['content']
1493 content = msg['content']
1493 if content['status'] != 'ok':
1494 if content['status'] != 'ok':
1494 raise self._unwrap_exception(content)
1495 raise self._unwrap_exception(content)
1495 buffers = msg['buffers']
1496 buffers = msg['buffers']
1496 else:
1497 else:
1497 content = dict(completed=[],pending=[])
1498 content = dict(completed=[],pending=[])
1498
1499
1499 content['completed'].extend(completed)
1500 content['completed'].extend(completed)
1500
1501
1501 if status_only:
1502 if status_only:
1502 return content
1503 return content
1503
1504
1504 failures = []
1505 failures = []
1505 # load cached results into result:
1506 # load cached results into result:
1506 content.update(local_results)
1507 content.update(local_results)
1507
1508
1508 # update cache with results:
1509 # update cache with results:
1509 for msg_id in sorted(theids):
1510 for msg_id in sorted(theids):
1510 if msg_id in content['completed']:
1511 if msg_id in content['completed']:
1511 rec = content[msg_id]
1512 rec = content[msg_id]
1512 parent = rec['header']
1513 parent = rec['header']
1513 header = rec['result_header']
1514 header = rec['result_header']
1514 rcontent = rec['result_content']
1515 rcontent = rec['result_content']
1515 iodict = rec['io']
1516 iodict = rec['io']
1516 if isinstance(rcontent, str):
1517 if isinstance(rcontent, str):
1517 rcontent = self.session.unpack(rcontent)
1518 rcontent = self.session.unpack(rcontent)
1518
1519
1519 md = self.metadata[msg_id]
1520 md = self.metadata[msg_id]
1520 md.update(self._extract_metadata(header, parent, rcontent))
1521 md.update(self._extract_metadata(header, parent, rcontent))
1521 if rec.get('received'):
1522 if rec.get('received'):
1522 md['received'] = rec['received']
1523 md['received'] = rec['received']
1523 md.update(iodict)
1524 md.update(iodict)
1524
1525
1525 if rcontent['status'] == 'ok':
1526 if rcontent['status'] == 'ok':
1526 if header['msg_type'] == 'apply_reply':
1527 if header['msg_type'] == 'apply_reply':
1527 res,buffers = util.unserialize_object(buffers)
1528 res,buffers = util.unserialize_object(buffers)
1528 elif header['msg_type'] == 'execute_reply':
1529 elif header['msg_type'] == 'execute_reply':
1529 res = ExecuteReply(msg_id, rcontent, md)
1530 res = ExecuteReply(msg_id, rcontent, md)
1530 else:
1531 else:
1531 raise KeyError("unhandled msg type: %r" % header['msg_type'])
1532 raise KeyError("unhandled msg type: %r" % header['msg_type'])
1532 else:
1533 else:
1533 res = self._unwrap_exception(rcontent)
1534 res = self._unwrap_exception(rcontent)
1534 failures.append(res)
1535 failures.append(res)
1535
1536
1536 self.results[msg_id] = res
1537 self.results[msg_id] = res
1537 content[msg_id] = res
1538 content[msg_id] = res
1538
1539
1539 if len(theids) == 1 and failures:
1540 if len(theids) == 1 and failures:
1540 raise failures[0]
1541 raise failures[0]
1541
1542
1542 error.collect_exceptions(failures, "result_status")
1543 error.collect_exceptions(failures, "result_status")
1543 return content
1544 return content
1544
1545
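A sketch of both modes::

    from IPython.parallel import Client

    rc = Client()
    ar = rc[:].apply_async(lambda: 1)

    status = rc.result_status(ar.msg_ids)       # status only (the default)
    print(status['pending'], status['completed'])

    full = rc.result_status(ar.msg_ids, status_only=False)
    # completed results are additionally keyed by msg_id in `full`
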
1545 @spin_first
1546 @spin_first
1546 def queue_status(self, targets='all', verbose=False):
1547 def queue_status(self, targets='all', verbose=False):
1547 """Fetch the status of engine queues.
1548 """Fetch the status of engine queues.
1548
1549
1549 Parameters
1550 Parameters
1550 ----------
1551 ----------
1551
1552
1552 targets : int/str/list of ints/strs
1553 targets : int/str/list of ints/strs
1553 the engines whose states are to be queried.
1554 the engines whose states are to be queried.
1554 default : all
1555 default : all
1555 verbose : bool
1556 verbose : bool
1556 Whether to return lengths only, or lists of ids for each element
1557 Whether to return lengths only, or lists of ids for each element
1557 """
1558 """
1558 if targets == 'all':
1559 if targets == 'all':
1559 # allow 'all' to be evaluated on the engine
1560 # allow 'all' to be evaluated on the engine
1560 engine_ids = None
1561 engine_ids = None
1561 else:
1562 else:
1562 engine_ids = self._build_targets(targets)[1]
1563 engine_ids = self._build_targets(targets)[1]
1563 content = dict(targets=engine_ids, verbose=verbose)
1564 content = dict(targets=engine_ids, verbose=verbose)
1564 self.session.send(self._query_socket, "queue_request", content=content)
1565 self.session.send(self._query_socket, "queue_request", content=content)
1565 idents,msg = self.session.recv(self._query_socket, 0)
1566 idents,msg = self.session.recv(self._query_socket, 0)
1566 if self.debug:
1567 if self.debug:
1567 pprint(msg)
1568 pprint(msg)
1568 content = msg['content']
1569 content = msg['content']
1569 status = content.pop('status')
1570 status = content.pop('status')
1570 if status != 'ok':
1571 if status != 'ok':
1571 raise self._unwrap_exception(content)
1572 raise self._unwrap_exception(content)
1572 content = rekey(content)
1573 content = rekey(content)
1573 if isinstance(targets, int):
1574 if isinstance(targets, int):
1574 return content[targets]
1575 return content[targets]
1575 else:
1576 else:
1576 return content
1577 return content
1577
1578
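A sketch; the targets=0 line assumes that engine id exists::

    from IPython.parallel import Client

    rc = Client()
    print(rc.queue_status())              # queue lengths per engine
    print(rc.queue_status(targets=0))     # dict for engine 0 alone
    print(rc.queue_status(verbose=True))  # msg_id lists instead of lengths
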
1578 @spin_first
1579 @spin_first
1579 def purge_results(self, jobs=[], targets=[]):
1580 def purge_results(self, jobs=[], targets=[]):
1580 """Tell the Hub to forget results.
1581 """Tell the Hub to forget results.
1581
1582
1582 Individual results can be purged by msg_id, or the entire
1583 Individual results can be purged by msg_id, or the entire
1583 history of specific targets can be purged.
1584 history of specific targets can be purged.
1584
1585
1585 Use `purge_results('all')` to scrub everything from the Hub's db.
1586 Use `purge_results('all')` to scrub everything from the Hub's db.
1586
1587
1587 Parameters
1588 Parameters
1588 ----------
1589 ----------
1589
1590
1590 jobs : str or list of str or AsyncResult objects
1591 jobs : str or list of str or AsyncResult objects
1591 the msg_ids whose results should be forgotten.
1592 the msg_ids whose results should be forgotten.
1592 targets : int/str/list of ints/strs
1593 targets : int/str/list of ints/strs
1593 The targets, by int_id, whose entire history is to be purged.
1594 The targets, by int_id, whose entire history is to be purged.
1594
1595
1595 default : None
1596 default : None
1596 """
1597 """
1597 if not targets and not jobs:
1598 if not targets and not jobs:
1598 raise ValueError("Must specify at least one of `targets` and `jobs`")
1599 raise ValueError("Must specify at least one of `targets` and `jobs`")
1599 if targets:
1600 if targets:
1600 targets = self._build_targets(targets)[1]
1601 targets = self._build_targets(targets)[1]
1601
1602
1602 # construct msg_ids from jobs
1603 # construct msg_ids from jobs
1603 if jobs == 'all':
1604 if jobs == 'all':
1604 msg_ids = jobs
1605 msg_ids = jobs
1605 else:
1606 else:
1606 msg_ids = []
1607 msg_ids = []
1607 if isinstance(jobs, (basestring,AsyncResult)):
1608 if isinstance(jobs, (basestring,AsyncResult)):
1608 jobs = [jobs]
1609 jobs = [jobs]
1609 bad_ids = filter(lambda obj: not isinstance(obj, (basestring, AsyncResult)), jobs)
1610 bad_ids = filter(lambda obj: not isinstance(obj, (basestring, AsyncResult)), jobs)
1610 if bad_ids:
1611 if bad_ids:
1611 raise TypeError("Invalid msg_id type %r, expected str or AsyncResult"%bad_ids[0])
1612 raise TypeError("Invalid msg_id type %r, expected str or AsyncResult"%bad_ids[0])
1612 for j in jobs:
1613 for j in jobs:
1613 if isinstance(j, AsyncResult):
1614 if isinstance(j, AsyncResult):
1614 msg_ids.extend(j.msg_ids)
1615 msg_ids.extend(j.msg_ids)
1615 else:
1616 else:
1616 msg_ids.append(j)
1617 msg_ids.append(j)
1617
1618
1618 content = dict(engine_ids=targets, msg_ids=msg_ids)
1619 content = dict(engine_ids=targets, msg_ids=msg_ids)
1619 self.session.send(self._query_socket, "purge_request", content=content)
1620 self.session.send(self._query_socket, "purge_request", content=content)
1620 idents, msg = self.session.recv(self._query_socket, 0)
1621 idents, msg = self.session.recv(self._query_socket, 0)
1621 if self.debug:
1622 if self.debug:
1622 pprint(msg)
1623 pprint(msg)
1623 content = msg['content']
1624 content = msg['content']
1624 if content['status'] != 'ok':
1625 if content['status'] != 'ok':
1625 raise self._unwrap_exception(content)
1626 raise self._unwrap_exception(content)
1626
1627
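A sketch of the three purge granularities; targets=[0] assumes that engine id exists::

    from IPython.parallel import Client

    rc = Client()
    ar = rc[:].apply_async(lambda: 'done')
    ar.get()

    rc.purge_results(jobs=ar)         # forget just these msg_ids
    rc.purge_results(targets=[0])     # forget engine 0's entire history
    rc.purge_results('all')           # scrub everything from the Hub's db
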
1627 @spin_first
1628 @spin_first
1628 def hub_history(self):
1629 def hub_history(self):
1629 """Get the Hub's history
1630 """Get the Hub's history
1630
1631
1631 Just like the Client, the Hub has a history, which is a list of msg_ids.
1632 Just like the Client, the Hub has a history, which is a list of msg_ids.
1632 This will contain the history of all clients, and, depending on configuration,
1633 This will contain the history of all clients, and, depending on configuration,
1633 may contain history across multiple cluster sessions.
1634 may contain history across multiple cluster sessions.
1634
1635
1635 Any msg_id returned here is a valid argument to `get_result`.
1636 Any msg_id returned here is a valid argument to `get_result`.
1636
1637
1637 Returns
1638 Returns
1638 -------
1639 -------
1639
1640
1640 msg_ids : list of strs
1641 msg_ids : list of strs
1641 list of all msg_ids, ordered by task submission time.
1642 list of all msg_ids, ordered by task submission time.
1642 """
1643 """
1643
1644
1644 self.session.send(self._query_socket, "history_request", content={})
1645 self.session.send(self._query_socket, "history_request", content={})
1645 idents, msg = self.session.recv(self._query_socket, 0)
1646 idents, msg = self.session.recv(self._query_socket, 0)
1646
1647
1647 if self.debug:
1648 if self.debug:
1648 pprint(msg)
1649 pprint(msg)
1649 content = msg['content']
1650 content = msg['content']
1650 if content['status'] != 'ok':
1651 if content['status'] != 'ok':
1651 raise self._unwrap_exception(content)
1652 raise self._unwrap_exception(content)
1652 else:
1653 else:
1653 return content['history']
1654 return content['history']
1654
1655
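A sketch that inspects the most recently submitted tasks known to the Hub::

    from IPython.parallel import Client

    rc = Client()
    for msg_id in rc.hub_history()[-5:]:   # five most recently submitted
        ar = rc.get_result(msg_id)
        print(msg_id, ar.ready())
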
1655 @spin_first
1656 @spin_first
1656 def db_query(self, query, keys=None):
1657 def db_query(self, query, keys=None):
1657 """Query the Hub's TaskRecord database
1658 """Query the Hub's TaskRecord database
1658
1659
1659 This will return a list of task record dicts that match `query`
1660 This will return a list of task record dicts that match `query`
1660
1661
1661 Parameters
1662 Parameters
1662 ----------
1663 ----------
1663
1664
1664 query : mongodb query dict
1665 query : mongodb query dict
1665 The search dict. See mongodb query docs for details.
1666 The search dict. See mongodb query docs for details.
1666 keys : list of strs [optional]
1667 keys : list of strs [optional]
1667 The subset of keys to be returned. The default is to fetch everything but buffers.
1668 The subset of keys to be returned. The default is to fetch everything but buffers.
1668 'msg_id' will *always* be included.
1669 'msg_id' will *always* be included.
1669 """
1670 """
1670 if isinstance(keys, basestring):
1671 if isinstance(keys, basestring):
1671 keys = [keys]
1672 keys = [keys]
1672 content = dict(query=query, keys=keys)
1673 content = dict(query=query, keys=keys)
1673 self.session.send(self._query_socket, "db_request", content=content)
1674 self.session.send(self._query_socket, "db_request", content=content)
1674 idents, msg = self.session.recv(self._query_socket, 0)
1675 idents, msg = self.session.recv(self._query_socket, 0)
1675 if self.debug:
1676 if self.debug:
1676 pprint(msg)
1677 pprint(msg)
1677 content = msg['content']
1678 content = msg['content']
1678 if content['status'] != 'ok':
1679 if content['status'] != 'ok':
1679 raise self._unwrap_exception(content)
1680 raise self._unwrap_exception(content)
1680
1681
1681 records = content['records']
1682 records = content['records']
1682
1683
1683 buffer_lens = content['buffer_lens']
1684 buffer_lens = content['buffer_lens']
1684 result_buffer_lens = content['result_buffer_lens']
1685 result_buffer_lens = content['result_buffer_lens']
1685 buffers = msg['buffers']
1686 buffers = msg['buffers']
1686 has_bufs = buffer_lens is not None
1687 has_bufs = buffer_lens is not None
1687 has_rbufs = result_buffer_lens is not None
1688 has_rbufs = result_buffer_lens is not None
1688 for i,rec in enumerate(records):
1689 for i,rec in enumerate(records):
1689 # relink buffers
1690 # relink buffers
1690 if has_bufs:
1691 if has_bufs:
1691 blen = buffer_lens[i]
1692 blen = buffer_lens[i]
1692 rec['buffers'], buffers = buffers[:blen],buffers[blen:]
1693 rec['buffers'], buffers = buffers[:blen],buffers[blen:]
1693 if has_rbufs:
1694 if has_rbufs:
1694 blen = result_buffer_lens[i]
1695 blen = result_buffer_lens[i]
1695 rec['result_buffers'], buffers = buffers[:blen],buffers[blen:]
1696 rec['result_buffers'], buffers = buffers[:blen],buffers[blen:]
1696
1697
1697 return records
1698 return records
1698
1699
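A sketch of a time-windowed query, using the MongoDB-style '$gt' operator the docstring refers to::

    from datetime import datetime, timedelta
    from IPython.parallel import Client

    rc = Client()
    cutoff = datetime.now() - timedelta(hours=1)

    # records completed in the last hour; 'msg_id' is always included
    recs = rc.db_query({'completed': {'$gt': cutoff}},
                       keys=['started', 'completed'])
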
1699 __all__ = [ 'Client' ]
1700 __all__ = [ 'Client' ]
@@ -1,1341 +1,1401 b''
1 """The IPython Controller Hub with 0MQ
1 """The IPython Controller Hub with 0MQ
2 This is the master object that handles connections from engines and clients,
2 This is the master object that handles connections from engines and clients,
3 and monitors traffic through the various queues.
3 and monitors traffic through the various queues.
4
4
5 Authors:
5 Authors:
6
6
7 * Min RK
7 * Min RK
8 """
8 """
9 #-----------------------------------------------------------------------------
9 #-----------------------------------------------------------------------------
10 # Copyright (C) 2010-2011 The IPython Development Team
10 # Copyright (C) 2010-2011 The IPython Development Team
11 #
11 #
12 # Distributed under the terms of the BSD License. The full license is in
12 # Distributed under the terms of the BSD License. The full license is in
13 # the file COPYING, distributed as part of this software.
13 # the file COPYING, distributed as part of this software.
14 #-----------------------------------------------------------------------------
14 #-----------------------------------------------------------------------------
15
15
16 #-----------------------------------------------------------------------------
16 #-----------------------------------------------------------------------------
17 # Imports
17 # Imports
18 #-----------------------------------------------------------------------------
18 #-----------------------------------------------------------------------------
19 from __future__ import print_function
19 from __future__ import print_function
20
20
21 import json
22 import os
21 import sys
23 import sys
22 import time
24 import time
23 from datetime import datetime
25 from datetime import datetime
24
26
25 import zmq
27 import zmq
26 from zmq.eventloop import ioloop
28 from zmq.eventloop import ioloop
27 from zmq.eventloop.zmqstream import ZMQStream
29 from zmq.eventloop.zmqstream import ZMQStream
28
30
29 # internal:
31 # internal:
30 from IPython.utils.importstring import import_item
32 from IPython.utils.importstring import import_item
31 from IPython.utils.py3compat import cast_bytes
33 from IPython.utils.py3compat import cast_bytes
32 from IPython.utils.traitlets import (
34 from IPython.utils.traitlets import (
33 HasTraits, Instance, Integer, Unicode, Dict, Set, Tuple, CBytes, DottedObjectName
35 HasTraits, Instance, Integer, Unicode, Dict, Set, Tuple, CBytes, DottedObjectName
34 )
36 )
35
37
36 from IPython.parallel import error, util
38 from IPython.parallel import error, util
37 from IPython.parallel.factory import RegistrationFactory
39 from IPython.parallel.factory import RegistrationFactory
38
40
39 from IPython.zmq.session import SessionFactory
41 from IPython.zmq.session import SessionFactory
40
42
41 from .heartmonitor import HeartMonitor
43 from .heartmonitor import HeartMonitor
42
44
43 #-----------------------------------------------------------------------------
45 #-----------------------------------------------------------------------------
44 # Code
46 # Code
45 #-----------------------------------------------------------------------------
47 #-----------------------------------------------------------------------------
46
48
47 def _passer(*args, **kwargs):
49 def _passer(*args, **kwargs):
48 return
50 return
49
51
50 def _printer(*args, **kwargs):
52 def _printer(*args, **kwargs):
51 print (args)
53 print (args)
52 print (kwargs)
54 print (kwargs)
53
55
54 def empty_record():
56 def empty_record():
55 """Return an empty dict with all record keys."""
57 """Return an empty dict with all record keys."""
56 return {
58 return {
57 'msg_id' : None,
59 'msg_id' : None,
58 'header' : None,
60 'header' : None,
59 'content': None,
61 'content': None,
60 'buffers': None,
62 'buffers': None,
61 'submitted': None,
63 'submitted': None,
62 'client_uuid' : None,
64 'client_uuid' : None,
63 'engine_uuid' : None,
65 'engine_uuid' : None,
64 'started': None,
66 'started': None,
65 'completed': None,
67 'completed': None,
66 'resubmitted': None,
68 'resubmitted': None,
67 'received': None,
69 'received': None,
68 'result_header' : None,
70 'result_header' : None,
69 'result_content' : None,
71 'result_content' : None,
70 'result_buffers' : None,
72 'result_buffers' : None,
71 'queue' : None,
73 'queue' : None,
72 'pyin' : None,
74 'pyin' : None,
73 'pyout': None,
75 'pyout': None,
74 'pyerr': None,
76 'pyerr': None,
75 'stdout': '',
77 'stdout': '',
76 'stderr': '',
78 'stderr': '',
77 }
79 }
78
80
79 def init_record(msg):
81 def init_record(msg):
80 """Initialize a TaskRecord based on a request."""
82 """Initialize a TaskRecord based on a request."""
81 header = msg['header']
83 header = msg['header']
82 return {
84 return {
83 'msg_id' : header['msg_id'],
85 'msg_id' : header['msg_id'],
84 'header' : header,
86 'header' : header,
85 'content': msg['content'],
87 'content': msg['content'],
86 'buffers': msg['buffers'],
88 'buffers': msg['buffers'],
87 'submitted': header['date'],
89 'submitted': header['date'],
88 'client_uuid' : None,
90 'client_uuid' : None,
89 'engine_uuid' : None,
91 'engine_uuid' : None,
90 'started': None,
92 'started': None,
91 'completed': None,
93 'completed': None,
92 'resubmitted': None,
94 'resubmitted': None,
93 'received': None,
95 'received': None,
94 'result_header' : None,
96 'result_header' : None,
95 'result_content' : None,
97 'result_content' : None,
96 'result_buffers' : None,
98 'result_buffers' : None,
97 'queue' : None,
99 'queue' : None,
98 'pyin' : None,
100 'pyin' : None,
99 'pyout': None,
101 'pyout': None,
100 'pyerr': None,
102 'pyerr': None,
101 'stdout': '',
103 'stdout': '',
102 'stderr': '',
104 'stderr': '',
103 }
105 }
104
106
105
107
106 class EngineConnector(HasTraits):
108 class EngineConnector(HasTraits):
107 """A simple object for accessing the various zmq connections of an object.
109 """A simple object for accessing the various zmq connections of an object.
108 Attributes are:
110 Attributes are:
109 id (int): engine ID
111 id (int): engine ID
110 uuid (str): uuid (unused?)
112 uuid (unicode): engine UUID
111 queue (str): identity of queue's DEALER socket
113 pending: set of msg_ids
112 registration (str): identity of registration DEALER socket
114 stallback: DelayedCallback for stalled registration
113 heartbeat (str): identity of heartbeat DEALER socket
114 """
115 """
116
115 id=Integer(0)
117 id = Integer(0)
116 queue=CBytes()
118 uuid = Unicode()
117 control=CBytes()
118 registration=CBytes()
119 heartbeat=CBytes()
120 pending=Set()
119 pending = Set()
120 stallback = Instance(ioloop.DelayedCallback)
121
121
122
122 _db_shortcuts = {
123 _db_shortcuts = {
123 'sqlitedb' : 'IPython.parallel.controller.sqlitedb.SQLiteDB',
124 'sqlitedb' : 'IPython.parallel.controller.sqlitedb.SQLiteDB',
124 'mongodb' : 'IPython.parallel.controller.mongodb.MongoDB',
125 'mongodb' : 'IPython.parallel.controller.mongodb.MongoDB',
125 'dictdb' : 'IPython.parallel.controller.dictdb.DictDB',
126 'dictdb' : 'IPython.parallel.controller.dictdb.DictDB',
126 'nodb' : 'IPython.parallel.controller.dictdb.NoDB',
127 'nodb' : 'IPython.parallel.controller.dictdb.NoDB',
127 }
128 }
128
129
129 class HubFactory(RegistrationFactory):
130 class HubFactory(RegistrationFactory):
130 """The Configurable for setting up a Hub."""
131 """The Configurable for setting up a Hub."""
131
132
132 # port-pairs for monitoredqueues:
133 # port-pairs for monitoredqueues:
133 hb = Tuple(Integer,Integer,config=True,
134 hb = Tuple(Integer,Integer,config=True,
134 help="""PUB/ROUTER Port pair for Engine heartbeats""")
135 help="""PUB/ROUTER Port pair for Engine heartbeats""")
135 def _hb_default(self):
136 def _hb_default(self):
136 return tuple(util.select_random_ports(2))
137 return tuple(util.select_random_ports(2))
137
138
138 mux = Tuple(Integer,Integer,config=True,
139 mux = Tuple(Integer,Integer,config=True,
139 help="""Client/Engine Port pair for MUX queue""")
140 help="""Client/Engine Port pair for MUX queue""")
140
141
141 def _mux_default(self):
142 def _mux_default(self):
142 return tuple(util.select_random_ports(2))
143 return tuple(util.select_random_ports(2))
143
144
144 task = Tuple(Integer,Integer,config=True,
145 task = Tuple(Integer,Integer,config=True,
145 help="""Client/Engine Port pair for Task queue""")
146 help="""Client/Engine Port pair for Task queue""")
146 def _task_default(self):
147 def _task_default(self):
147 return tuple(util.select_random_ports(2))
148 return tuple(util.select_random_ports(2))
148
149
149 control = Tuple(Integer,Integer,config=True,
150 control = Tuple(Integer,Integer,config=True,
150 help="""Client/Engine Port pair for Control queue""")
151 help="""Client/Engine Port pair for Control queue""")
151
152
152 def _control_default(self):
153 def _control_default(self):
153 return tuple(util.select_random_ports(2))
154 return tuple(util.select_random_ports(2))
154
155
155 iopub = Tuple(Integer,Integer,config=True,
156 iopub = Tuple(Integer,Integer,config=True,
156 help="""Client/Engine Port pair for IOPub relay""")
157 help="""Client/Engine Port pair for IOPub relay""")
157
158
158 def _iopub_default(self):
159 def _iopub_default(self):
159 return tuple(util.select_random_ports(2))
160 return tuple(util.select_random_ports(2))
160
161
161 # single ports:
162 # single ports:
162 mon_port = Integer(config=True,
163 mon_port = Integer(config=True,
163 help="""Monitor (SUB) port for queue traffic""")
164 help="""Monitor (SUB) port for queue traffic""")
164
165
165 def _mon_port_default(self):
166 def _mon_port_default(self):
166 return util.select_random_ports(1)[0]
167 return util.select_random_ports(1)[0]
167
168
168 notifier_port = Integer(config=True,
169 notifier_port = Integer(config=True,
169 help="""PUB port for sending engine status notifications""")
170 help="""PUB port for sending engine status notifications""")
170
171
171 def _notifier_port_default(self):
172 def _notifier_port_default(self):
172 return util.select_random_ports(1)[0]
173 return util.select_random_ports(1)[0]
173
174
174 engine_ip = Unicode('127.0.0.1', config=True,
175 engine_ip = Unicode('127.0.0.1', config=True,
175 help="IP on which to listen for engine connections. [default: loopback]")
176 help="IP on which to listen for engine connections. [default: loopback]")
176 engine_transport = Unicode('tcp', config=True,
177 engine_transport = Unicode('tcp', config=True,
177 help="0MQ transport for engine connections. [default: tcp]")
178 help="0MQ transport for engine connections. [default: tcp]")
178
179
179 client_ip = Unicode('127.0.0.1', config=True,
180 client_ip = Unicode('127.0.0.1', config=True,
180 help="IP on which to listen for client connections. [default: loopback]")
181 help="IP on which to listen for client connections. [default: loopback]")
181 client_transport = Unicode('tcp', config=True,
182 client_transport = Unicode('tcp', config=True,
182 help="0MQ transport for client connections. [default : tcp]")
183 help="0MQ transport for client connections. [default : tcp]")
183
184
184 monitor_ip = Unicode('127.0.0.1', config=True,
185 monitor_ip = Unicode('127.0.0.1', config=True,
185 help="IP on which to listen for monitor messages. [default: loopback]")
186 help="IP on which to listen for monitor messages. [default: loopback]")
186 monitor_transport = Unicode('tcp', config=True,
187 monitor_transport = Unicode('tcp', config=True,
187 help="0MQ transport for monitor messages. [default : tcp]")
188 help="0MQ transport for monitor messages. [default : tcp]")
188
189
189 monitor_url = Unicode('')
190 monitor_url = Unicode('')
190
191
191 db_class = DottedObjectName('NoDB',
192 db_class = DottedObjectName('NoDB',
192 config=True, help="""The class to use for the DB backend
193 config=True, help="""The class to use for the DB backend
193
194
194 Options include:
195 Options include:
195
196
196 SQLiteDB: SQLite
197 SQLiteDB: SQLite
197 MongoDB : use MongoDB
198 MongoDB : use MongoDB
198 DictDB : in-memory storage (fastest, but be mindful of memory growth of the Hub)
199 DictDB : in-memory storage (fastest, but be mindful of memory growth of the Hub)
199 NoDB : disable database altogether (default)
200 NoDB : disable database altogether (default)
200
201
201 """)
202 """)
202
203
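The shortcut names above are resolved through _db_shortcuts. A sketch of selecting a backend in ipcontroller_config.py (the same trait can be set on the command line as --HubFactory.db_class=sqlitedb)::

    # ipcontroller_config.py -- sketch, assuming the standard profile layout
    c = get_config()
    c.HubFactory.db_class = 'sqlitedb'   # shorthand for
                                         # IPython.parallel.controller.sqlitedb.SQLiteDB
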
203 # not configurable
204 # not configurable
204 db = Instance('IPython.parallel.controller.dictdb.BaseDB')
205 db = Instance('IPython.parallel.controller.dictdb.BaseDB')
205 heartmonitor = Instance('IPython.parallel.controller.heartmonitor.HeartMonitor')
206 heartmonitor = Instance('IPython.parallel.controller.heartmonitor.HeartMonitor')
206
207
207 def _ip_changed(self, name, old, new):
208 def _ip_changed(self, name, old, new):
208 self.engine_ip = new
209 self.engine_ip = new
209 self.client_ip = new
210 self.client_ip = new
210 self.monitor_ip = new
211 self.monitor_ip = new
211 self._update_monitor_url()
212 self._update_monitor_url()
212
213
213 def _update_monitor_url(self):
214 def _update_monitor_url(self):
214 self.monitor_url = "%s://%s:%i" % (self.monitor_transport, self.monitor_ip, self.mon_port)
215 self.monitor_url = "%s://%s:%i" % (self.monitor_transport, self.monitor_ip, self.mon_port)
215
216
216 def _transport_changed(self, name, old, new):
217 def _transport_changed(self, name, old, new):
217 self.engine_transport = new
218 self.engine_transport = new
218 self.client_transport = new
219 self.client_transport = new
219 self.monitor_transport = new
220 self.monitor_transport = new
220 self._update_monitor_url()
221 self._update_monitor_url()
221
222
222 def __init__(self, **kwargs):
223 def __init__(self, **kwargs):
223 super(HubFactory, self).__init__(**kwargs)
224 super(HubFactory, self).__init__(**kwargs)
224 self._update_monitor_url()
225 self._update_monitor_url()
225
226
226
227
227 def construct(self):
228 def construct(self):
228 self.init_hub()
229 self.init_hub()
229
230
230 def start(self):
231 def start(self):
231 self.heartmonitor.start()
232 self.heartmonitor.start()
232 self.log.info("Heartmonitor started")
233 self.log.info("Heartmonitor started")
233
234
234 def client_url(self, channel):
235 def client_url(self, channel):
235 """return full zmq url for a named client channel"""
236 """return full zmq url for a named client channel"""
236 return "%s://%s:%i" % (self.client_transport, self.client_ip, self.client_info[channel])
237 return "%s://%s:%i" % (self.client_transport, self.client_ip, self.client_info[channel])
237
238
238 def engine_url(self, channel):
239 def engine_url(self, channel):
239 """return full zmq url for a named engine channel"""
240 """return full zmq url for a named engine channel"""
240 return "%s://%s:%i" % (self.engine_transport, self.engine_ip, self.engine_info[channel])
241 return "%s://%s:%i" % (self.engine_transport, self.engine_ip, self.engine_info[channel])
241
242
242 def init_hub(self):
243 def init_hub(self):
243 """construct Hub object"""
244 """construct Hub object"""
244
245
245 ctx = self.context
246 ctx = self.context
246 loop = self.loop
247 loop = self.loop
247
248
248 try:
249 try:
249 scheme = self.config.TaskScheduler.scheme_name
250 scheme = self.config.TaskScheduler.scheme_name
250 except AttributeError:
251 except AttributeError:
251 from .scheduler import TaskScheduler
252 from .scheduler import TaskScheduler
252 scheme = TaskScheduler.scheme_name.get_default_value()
253 scheme = TaskScheduler.scheme_name.get_default_value()
253
254
254 # build connection dicts
255 # build connection dicts
255 engine = self.engine_info = {
256 engine = self.engine_info = {
256 'interface' : "%s://%s" % (self.engine_transport, self.engine_ip),
257 'interface' : "%s://%s" % (self.engine_transport, self.engine_ip),
257 'registration' : self.regport,
258 'registration' : self.regport,
258 'control' : self.control[1],
259 'control' : self.control[1],
259 'mux' : self.mux[1],
260 'mux' : self.mux[1],
260 'hb_ping' : self.hb[0],
261 'hb_ping' : self.hb[0],
261 'hb_pong' : self.hb[1],
262 'hb_pong' : self.hb[1],
262 'task' : self.task[1],
263 'task' : self.task[1],
263 'iopub' : self.iopub[1],
264 'iopub' : self.iopub[1],
264 }
265 }
265
266
266 client = self.client_info = {
267 client = self.client_info = {
267 'interface' : "%s://%s" % (self.client_transport, self.client_ip),
268 'interface' : "%s://%s" % (self.client_transport, self.client_ip),
268 'registration' : self.regport,
269 'registration' : self.regport,
269 'control' : self.control[0],
270 'control' : self.control[0],
270 'mux' : self.mux[0],
271 'mux' : self.mux[0],
271 'task' : self.task[0],
272 'task' : self.task[0],
272 'task_scheme' : scheme,
273 'task_scheme' : scheme,
273 'iopub' : self.iopub[0],
274 'iopub' : self.iopub[0],
274 'notification' : self.notifier_port,
275 'notification' : self.notifier_port,
275 }
276 }
276
277
277 self.log.debug("Hub engine addrs: %s", self.engine_info)
278 self.log.debug("Hub engine addrs: %s", self.engine_info)
278 self.log.debug("Hub client addrs: %s", self.client_info)
279 self.log.debug("Hub client addrs: %s", self.client_info)
279
280
280 # Registrar socket
281 # Registrar socket
281 q = ZMQStream(ctx.socket(zmq.ROUTER), loop)
282 q = ZMQStream(ctx.socket(zmq.ROUTER), loop)
282 q.bind(self.client_url('registration'))
283 q.bind(self.client_url('registration'))
283 self.log.info("Hub listening on %s for registration.", self.client_url('registration'))
284 self.log.info("Hub listening on %s for registration.", self.client_url('registration'))
284 if self.client_ip != self.engine_ip:
285 if self.client_ip != self.engine_ip:
285 q.bind(self.engine_url('registration'))
286 q.bind(self.engine_url('registration'))
286 self.log.info("Hub listening on %s for registration.", self.engine_url('registration'))
287 self.log.info("Hub listening on %s for registration.", self.engine_url('registration'))
287
288
288 ### Engine connections ###
289 ### Engine connections ###
289
290
290 # heartbeat
291 # heartbeat
291 hpub = ctx.socket(zmq.PUB)
292 hpub = ctx.socket(zmq.PUB)
292 hpub.bind(self.engine_url('hb_ping'))
293 hpub.bind(self.engine_url('hb_ping'))
293 hrep = ctx.socket(zmq.ROUTER)
294 hrep = ctx.socket(zmq.ROUTER)
294 hrep.bind(self.engine_url('hb_pong'))
295 hrep.bind(self.engine_url('hb_pong'))
295 self.heartmonitor = HeartMonitor(loop=loop, config=self.config, log=self.log,
296 self.heartmonitor = HeartMonitor(loop=loop, config=self.config, log=self.log,
296 pingstream=ZMQStream(hpub,loop),
297 pingstream=ZMQStream(hpub,loop),
297 pongstream=ZMQStream(hrep,loop)
298 pongstream=ZMQStream(hrep,loop)
298 )
299 )
299
300
300 ### Client connections ###
301 ### Client connections ###
301
302
302 # Notifier socket
303 # Notifier socket
303 n = ZMQStream(ctx.socket(zmq.PUB), loop)
304 n = ZMQStream(ctx.socket(zmq.PUB), loop)
304 n.bind(self.client_url('notification'))
305 n.bind(self.client_url('notification'))
305
306
306 ### build and launch the queues ###
307 ### build and launch the queues ###
307
308
308 # monitor socket
309 # monitor socket
309 sub = ctx.socket(zmq.SUB)
310 sub = ctx.socket(zmq.SUB)
310 sub.setsockopt(zmq.SUBSCRIBE, b"")
311 sub.setsockopt(zmq.SUBSCRIBE, b"")
311 sub.bind(self.monitor_url)
312 sub.bind(self.monitor_url)
312 sub.bind('inproc://monitor')
313 sub.bind('inproc://monitor')
313 sub = ZMQStream(sub, loop)
314 sub = ZMQStream(sub, loop)
314
315
315 # connect the db
316 # connect the db
316 db_class = _db_shortcuts.get(self.db_class.lower(), self.db_class)
317 db_class = _db_shortcuts.get(self.db_class.lower(), self.db_class)
317 self.log.info('Hub using DB backend: %r', (db_class.split('.')[-1]))
318 self.log.info('Hub using DB backend: %r', (db_class.split('.')[-1]))
318 self.db = import_item(str(db_class))(session=self.session.session,
319 self.db = import_item(str(db_class))(session=self.session.session,
319 config=self.config, log=self.log)
320 config=self.config, log=self.log)
320 time.sleep(.25)
321 time.sleep(.25)
321
322
322 # resubmit stream
323 # resubmit stream
323 r = ZMQStream(ctx.socket(zmq.DEALER), loop)
324 r = ZMQStream(ctx.socket(zmq.DEALER), loop)
324 url = util.disambiguate_url(self.client_url('task'))
325 url = util.disambiguate_url(self.client_url('task'))
325 r.connect(url)
326 r.connect(url)
326
327
327 self.hub = Hub(loop=loop, session=self.session, monitor=sub, heartmonitor=self.heartmonitor,
328 self.hub = Hub(loop=loop, session=self.session, monitor=sub, heartmonitor=self.heartmonitor,
328 query=q, notifier=n, resubmit=r, db=self.db,
329 query=q, notifier=n, resubmit=r, db=self.db,
329 engine_info=self.engine_info, client_info=self.client_info,
330 engine_info=self.engine_info, client_info=self.client_info,
330 log=self.log)
331 log=self.log)
331
332
332
333
333 class Hub(SessionFactory):
334 class Hub(SessionFactory):
334 """The IPython Controller Hub with 0MQ connections
335 """The IPython Controller Hub with 0MQ connections
335
336
336 Parameters
337 Parameters
337 ----------
338 ----------
338 loop: zmq IOLoop instance
339 loop: zmq IOLoop instance
339 session: Session object
340 session: Session object
340 <removed> context: zmq context for creating new connections (?)
341 <removed> context: zmq context for creating new connections (?)
341 queue: ZMQStream for monitoring the command queue (SUB)
342 queue: ZMQStream for monitoring the command queue (SUB)
342 query: ZMQStream for engine registration and client query requests (ROUTER)
343 query: ZMQStream for engine registration and client query requests (ROUTER)
343 heartbeat: HeartMonitor object checking the pulse of the engines
344 heartbeat: HeartMonitor object checking the pulse of the engines
344 notifier: ZMQStream for broadcasting engine registration changes (PUB)
345 notifier: ZMQStream for broadcasting engine registration changes (PUB)
345 db: connection to db for out of memory logging of commands
346 db: connection to db for out of memory logging of commands
346 NotImplemented
347 NotImplemented
347 engine_info: dict of zmq connection information for engines to connect
348 engine_info: dict of zmq connection information for engines to connect
348 to the queues.
349 to the queues.
349 client_info: dict of zmq connection information for clients to connect
350 client_info: dict of zmq connection information for clients to connect
350 to the queues.
351 to the queues.
351 """
352 """
353
354 engine_state_file = Unicode()
355
352 # internal data structures:
356 # internal data structures:
353 ids=Set() # engine IDs
357 ids=Set() # engine IDs
354 keytable=Dict()
358 keytable=Dict()
355 by_ident=Dict()
359 by_ident=Dict()
356 engines=Dict()
360 engines=Dict()
357 clients=Dict()
361 clients=Dict()
358 hearts=Dict()
362 hearts=Dict()
359 pending=Set()
363 pending=Set()
360 queues=Dict() # pending msg_ids keyed by engine_id
364 queues=Dict() # pending msg_ids keyed by engine_id
361 tasks=Dict() # pending msg_ids submitted as tasks, keyed by client_id
365 tasks=Dict() # pending msg_ids submitted as tasks, keyed by client_id
362 completed=Dict() # completed msg_ids keyed by engine_id
366 completed=Dict() # completed msg_ids keyed by engine_id
363 all_completed=Set() # set of all completed msg_ids, across all engines
367 all_completed=Set() # set of all completed msg_ids, across all engines
364 dead_engines=Set() # uuids of engines that have died
368 dead_engines=Set() # uuids of engines that have died
365 unassigned=Set() # set of task msg_ids not yet assigned a destination
369 unassigned=Set() # set of task msg_ids not yet assigned a destination
366 incoming_registrations=Dict()
370 incoming_registrations=Dict()
367 registration_timeout=Integer()
371 registration_timeout=Integer()
368 _idcounter=Integer(0)
372 _idcounter=Integer(0)
369
373
370 # objects from constructor:
374 # objects from constructor:
371 query=Instance(ZMQStream)
375 query=Instance(ZMQStream)
372 monitor=Instance(ZMQStream)
376 monitor=Instance(ZMQStream)
373 notifier=Instance(ZMQStream)
377 notifier=Instance(ZMQStream)
374 resubmit=Instance(ZMQStream)
378 resubmit=Instance(ZMQStream)
375 heartmonitor=Instance(HeartMonitor)
379 heartmonitor=Instance(HeartMonitor)
376 db=Instance(object)
380 db=Instance(object)
377 client_info=Dict()
381 client_info=Dict()
378 engine_info=Dict()
382 engine_info=Dict()
379
383
380
384
381 def __init__(self, **kwargs):
385 def __init__(self, **kwargs):
382 """
386 """
383 # universal:
387 # universal:
384 loop: IOLoop for creating future connections
388 loop: IOLoop for creating future connections
385 session: streamsession for sending serialized data
389 session: streamsession for sending serialized data
386 # engine:
390 # engine:
387 queue: ZMQStream for monitoring queue messages
391 queue: ZMQStream for monitoring queue messages
388 query: ZMQStream for engine+client registration and client requests
392 query: ZMQStream for engine+client registration and client requests
389 heartbeat: HeartMonitor object for tracking engines
393 heartbeat: HeartMonitor object for tracking engines
390 # extra:
394 # extra:
391 db: ZMQStream for db connection (NotImplemented)
395 db: ZMQStream for db connection (NotImplemented)
392 engine_info: zmq address/protocol dict for engine connections
396 engine_info: zmq address/protocol dict for engine connections
393 client_info: zmq address/protocol dict for client connections
397 client_info: zmq address/protocol dict for client connections
394 """
398 """
395
399
396 super(Hub, self).__init__(**kwargs)
400 super(Hub, self).__init__(**kwargs)
397 self.registration_timeout = max(5000, 2*self.heartmonitor.period)
401 self.registration_timeout = max(5000, 2*self.heartmonitor.period)
398
402
399 # register our callbacks
403 # register our callbacks
400 self.query.on_recv(self.dispatch_query)
404 self.query.on_recv(self.dispatch_query)
401 self.monitor.on_recv(self.dispatch_monitor_traffic)
405 self.monitor.on_recv(self.dispatch_monitor_traffic)
402
406
403 self.heartmonitor.add_heart_failure_handler(self.handle_heart_failure)
407 self.heartmonitor.add_heart_failure_handler(self.handle_heart_failure)
404 self.heartmonitor.add_new_heart_handler(self.handle_new_heart)
408 self.heartmonitor.add_new_heart_handler(self.handle_new_heart)
405
409
406 self.monitor_handlers = {b'in' : self.save_queue_request,
410 self.monitor_handlers = {b'in' : self.save_queue_request,
407 b'out': self.save_queue_result,
411 b'out': self.save_queue_result,
408 b'intask': self.save_task_request,
412 b'intask': self.save_task_request,
409 b'outtask': self.save_task_result,
413 b'outtask': self.save_task_result,
410 b'tracktask': self.save_task_destination,
414 b'tracktask': self.save_task_destination,
411 b'incontrol': _passer,
415 b'incontrol': _passer,
412 b'outcontrol': _passer,
416 b'outcontrol': _passer,
413 b'iopub': self.save_iopub_message,
417 b'iopub': self.save_iopub_message,
414 }
418 }
415
419
416 self.query_handlers = {'queue_request': self.queue_status,
420 self.query_handlers = {'queue_request': self.queue_status,
417 'result_request': self.get_results,
421 'result_request': self.get_results,
418 'history_request': self.get_history,
422 'history_request': self.get_history,
419 'db_request': self.db_query,
423 'db_request': self.db_query,
420 'purge_request': self.purge_results,
424 'purge_request': self.purge_results,
421 'load_request': self.check_load,
425 'load_request': self.check_load,
422 'resubmit_request': self.resubmit_task,
426 'resubmit_request': self.resubmit_task,
423 'shutdown_request': self.shutdown_request,
427 'shutdown_request': self.shutdown_request,
424 'registration_request' : self.register_engine,
428 'registration_request' : self.register_engine,
425 'unregistration_request' : self.unregister_engine,
429 'unregistration_request' : self.unregister_engine,
426 'connection_request': self.connection_request,
430 'connection_request': self.connection_request,
427 }
431 }
428
432
429 # ignore resubmit replies
433 # ignore resubmit replies
430 self.resubmit.on_recv(lambda msg: None, copy=False)
434 self.resubmit.on_recv(lambda msg: None, copy=False)
431
435
432 self.log.info("hub::created hub")
436 self.log.info("hub::created hub")
433
437
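# A minimal, self-contained sketch of the dispatch pattern wired up above.
# The msg dict shape here is assumed; real messages carry more header fields.
def _demo_dispatch(msg, handlers, on_error):
    """Route a deserialized message by its header's msg_type."""
    msg_type = msg['header']['msg_type']
    handler = handlers.get(msg_type, None)
    if handler is None:
        on_error("Bad Message Type: %r" % msg_type)
    else:
        handler(msg)

_demo_dispatch({'header': {'msg_type': 'queue_request'}, 'content': {}},
               {'queue_request': lambda m: None},
               lambda err: None)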
434 @property
438 @property
435 def _next_id(self):
439 def _next_id(self):
436 """gemerate a new ID.
440 """gemerate a new ID.
437
441
438 No longer reuse old ids, just count from 0."""
442 No longer reuse old ids, just count from 0."""
439 newid = self._idcounter
443 newid = self._idcounter
440 self._idcounter += 1
444 self._idcounter += 1
441 return newid
445 return newid
442 # newid = 0
446 # newid = 0
443 # incoming = [id[0] for id in self.incoming_registrations.itervalues()]
447 # incoming = [id[0] for id in self.incoming_registrations.itervalues()]
444 # # print newid, self.ids, self.incoming_registrations
448 # # print newid, self.ids, self.incoming_registrations
445 # while newid in self.ids or newid in incoming:
449 # while newid in self.ids or newid in incoming:
446 # newid += 1
450 # newid += 1
447 # return newid
451 # return newid
448
452
449 #-----------------------------------------------------------------------------
453 #-----------------------------------------------------------------------------
450 # message validation
454 # message validation
451 #-----------------------------------------------------------------------------
455 #-----------------------------------------------------------------------------
452
456
453 def _validate_targets(self, targets):
457 def _validate_targets(self, targets):
454 """turn any valid targets argument into a list of integer ids"""
458 """turn any valid targets argument into a list of integer ids"""
455 if targets is None:
459 if targets is None:
456 # default to all
460 # default to all
457 return self.ids
461 return self.ids
458
462
459 if isinstance(targets, (int,str,unicode)):
463 if isinstance(targets, (int,str,unicode)):
460 # only one target specified
464 # only one target specified
461 targets = [targets]
465 targets = [targets]
462 _targets = []
466 _targets = []
463 for t in targets:
467 for t in targets:
464 # map raw identities to ids
468 # map raw identities to ids
465 if isinstance(t, (str,unicode)):
469 if isinstance(t, (str,unicode)):
466 t = self.by_ident.get(cast_bytes(t), t)
470 t = self.by_ident.get(cast_bytes(t), t)
467 _targets.append(t)
471 _targets.append(t)
468 targets = _targets
472 targets = _targets
469 bad_targets = [ t for t in targets if t not in self.ids ]
473 bad_targets = [ t for t in targets if t not in self.ids ]
470 if bad_targets:
474 if bad_targets:
471 raise IndexError("No Such Engine: %r" % bad_targets)
475 raise IndexError("No Such Engine: %r" % bad_targets)
472 if not targets:
476 if not targets:
473 raise IndexError("No Engines Registered")
477 raise IndexError("No Engines Registered")
474 return targets
478 return targets
475
479
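# For illustration (assumed values): one raw uuid string is wrapped in a
# list and mapped through a by_ident-style table to an integer engine id.
_by_ident = {b'engine-uuid-0': 0}
_targets = 'engine-uuid-0'
if isinstance(_targets, (int, str)):
    _targets = [_targets]
_ids = [_by_ident.get(t.encode('ascii'), t) if isinstance(t, str) else t
        for t in _targets]
assert _ids == [0]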
476 #-----------------------------------------------------------------------------
480 #-----------------------------------------------------------------------------
477 # dispatch methods (1 per stream)
481 # dispatch methods (1 per stream)
478 #-----------------------------------------------------------------------------
482 #-----------------------------------------------------------------------------
479
483
480
484
481 @util.log_errors
485 @util.log_errors
482 def dispatch_monitor_traffic(self, msg):
486 def dispatch_monitor_traffic(self, msg):
483 """all ME and Task queue messages come through here, as well as
487 """all ME and Task queue messages come through here, as well as
484 IOPub traffic."""
488 IOPub traffic."""
485 self.log.debug("monitor traffic: %r", msg[0])
489 self.log.debug("monitor traffic: %r", msg[0])
486 switch = msg[0]
490 switch = msg[0]
487 try:
491 try:
488 idents, msg = self.session.feed_identities(msg[1:])
492 idents, msg = self.session.feed_identities(msg[1:])
489 except ValueError:
493 except ValueError:
490 idents=[]
494 idents=[]
491 if not idents:
495 if not idents:
492 self.log.error("Monitor message without topic: %r", msg)
496 self.log.error("Monitor message without topic: %r", msg)
493 return
497 return
494 handler = self.monitor_handlers.get(switch, None)
498 handler = self.monitor_handlers.get(switch, None)
495 if handler is not None:
499 if handler is not None:
496 handler(idents, msg)
500 handler(idents, msg)
497 else:
501 else:
498 self.log.error("Unrecognized monitor topic: %r", switch)
502 self.log.error("Unrecognized monitor topic: %r", switch)
499
503
500
504
501 @util.log_errors
505 @util.log_errors
502 def dispatch_query(self, msg):
506 def dispatch_query(self, msg):
503 """Route registration requests and queries from clients."""
507 """Route registration requests and queries from clients."""
504 try:
508 try:
505 idents, msg = self.session.feed_identities(msg)
509 idents, msg = self.session.feed_identities(msg)
506 except ValueError:
510 except ValueError:
507 idents = []
511 idents = []
508 if not idents:
512 if not idents:
509 self.log.error("Bad Query Message: %r", msg)
513 self.log.error("Bad Query Message: %r", msg)
510 return
514 return
511 client_id = idents[0]
515 client_id = idents[0]
512 try:
516 try:
513 msg = self.session.unserialize(msg, content=True)
517 msg = self.session.unserialize(msg, content=True)
514 except Exception:
518 except Exception:
515 content = error.wrap_exception()
519 content = error.wrap_exception()
516 self.log.error("Bad Query Message: %r", msg, exc_info=True)
520 self.log.error("Bad Query Message: %r", msg, exc_info=True)
517 self.session.send(self.query, "hub_error", ident=client_id,
521 self.session.send(self.query, "hub_error", ident=client_id,
518 content=content)
522 content=content)
519 return
523 return
520 # print client_id, header, parent, content
524 # print client_id, header, parent, content
521 #switch on message type:
525 #switch on message type:
522 msg_type = msg['header']['msg_type']
526 msg_type = msg['header']['msg_type']
523 self.log.info("client::client %r requested %r", client_id, msg_type)
527 self.log.info("client::client %r requested %r", client_id, msg_type)
524 handler = self.query_handlers.get(msg_type, None)
528 handler = self.query_handlers.get(msg_type, None)
525 try:
529 try:
526 assert handler is not None, "Bad Message Type: %r" % msg_type
530 assert handler is not None, "Bad Message Type: %r" % msg_type
527 except:
531 except:
528 content = error.wrap_exception()
532 content = error.wrap_exception()
529 self.log.error("Bad Message Type: %r", msg_type, exc_info=True)
533 self.log.error("Bad Message Type: %r", msg_type, exc_info=True)
530 self.session.send(self.query, "hub_error", ident=client_id,
534 self.session.send(self.query, "hub_error", ident=client_id,
531 content=content)
535 content=content)
532 return
536 return
533
537
534 else:
538 else:
535 handler(idents, msg)
539 handler(idents, msg)
536
540
537 def dispatch_db(self, msg):
541 def dispatch_db(self, msg):
538 """"""
542 """"""
539 raise NotImplementedError
543 raise NotImplementedError
540
544
541 #---------------------------------------------------------------------------
545 #---------------------------------------------------------------------------
542 # handler methods (1 per event)
546 # handler methods (1 per event)
543 #---------------------------------------------------------------------------
547 #---------------------------------------------------------------------------
544
548
545 #----------------------- Heartbeat --------------------------------------
549 #----------------------- Heartbeat --------------------------------------
546
550
547 def handle_new_heart(self, heart):
551 def handle_new_heart(self, heart):
548 """handler to attach to heartbeater.
552 """handler to attach to heartbeater.
549 Called when a new heart starts to beat.
553 Called when a new heart starts to beat.
550 Triggers completion of registration."""
554 Triggers completion of registration."""
551 self.log.debug("heartbeat::handle_new_heart(%r)", heart)
555 self.log.debug("heartbeat::handle_new_heart(%r)", heart)
552 if heart not in self.incoming_registrations:
556 if heart not in self.incoming_registrations:
553 self.log.info("heartbeat::ignoring new heart: %r", heart)
557 self.log.info("heartbeat::ignoring new heart: %r", heart)
554 else:
558 else:
555 self.finish_registration(heart)
559 self.finish_registration(heart)
556
560
557
561
558 def handle_heart_failure(self, heart):
562 def handle_heart_failure(self, heart):
559 """handler to attach to heartbeater.
563 """handler to attach to heartbeater.
560 called when a previously registered heart fails to respond to beat request.
564 called when a previously registered heart fails to respond to beat request.
561 triggers unregistration"""
565 triggers unregistration"""
562 self.log.debug("heartbeat::handle_heart_failure(%r)", heart)
566 self.log.debug("heartbeat::handle_heart_failure(%r)", heart)
563 eid = self.hearts.get(heart, None)
567 eid = self.hearts.get(heart, None)
564 if eid is None or self.keytable[eid] in self.dead_engines:
568 if eid is None or self.keytable[eid] in self.dead_engines:
565 self.log.info("heartbeat::ignoring heart failure %r (not an engine or already dead)", heart)
569 self.log.info("heartbeat::ignoring heart failure %r (not an engine or already dead)", heart)
566 else:
570 else:
567 queue = self.engines[eid].queue
571 uuid = self.engines[eid].uuid
568 self.unregister_engine(heart, dict(content=dict(id=eid, queue=queue)))
572 self.unregister_engine(heart, dict(content=dict(id=eid, queue=uuid)))
569
573
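# Assumed bookkeeping behind the failure handler above: hearts maps a
# heartbeat ident to an engine id, and a failure is turned into the same
# unregistration request an engine would send for itself.
_hearts = {b'a3f0c2d4e5b6': 0}
_heart = b'a3f0c2d4e5b6'
_eid = _hearts.get(_heart, None)
_fake_unregistration = dict(content=dict(id=_eid, queue='a3f0c2d4e5b6'))
assert _fake_unregistration['content']['id'] == 0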
570 #----------------------- MUX Queue Traffic ------------------------------
574 #----------------------- MUX Queue Traffic ------------------------------
571
575
572 def save_queue_request(self, idents, msg):
576 def save_queue_request(self, idents, msg):
573 if len(idents) < 2:
577 if len(idents) < 2:
574 self.log.error("invalid identity prefix: %r", idents)
578 self.log.error("invalid identity prefix: %r", idents)
575 return
579 return
576 queue_id, client_id = idents[:2]
580 queue_id, client_id = idents[:2]
577 try:
581 try:
578 msg = self.session.unserialize(msg)
582 msg = self.session.unserialize(msg)
579 except Exception:
583 except Exception:
580 self.log.error("queue::client %r sent invalid message to %r: %r", client_id, queue_id, msg, exc_info=True)
584 self.log.error("queue::client %r sent invalid message to %r: %r", client_id, queue_id, msg, exc_info=True)
581 return
585 return
582
586
583 eid = self.by_ident.get(queue_id, None)
587 eid = self.by_ident.get(queue_id, None)
584 if eid is None:
588 if eid is None:
585 self.log.error("queue::target %r not registered", queue_id)
589 self.log.error("queue::target %r not registered", queue_id)
586 self.log.debug("queue:: valid are: %r", self.by_ident.keys())
590 self.log.debug("queue:: valid are: %r", self.by_ident.keys())
587 return
591 return
588 record = init_record(msg)
592 record = init_record(msg)
589 msg_id = record['msg_id']
593 msg_id = record['msg_id']
590 self.log.info("queue::client %r submitted request %r to %s", client_id, msg_id, eid)
594 self.log.info("queue::client %r submitted request %r to %s", client_id, msg_id, eid)
591 # Unicode in records
595 # Unicode in records
592 record['engine_uuid'] = queue_id.decode('ascii')
596 record['engine_uuid'] = queue_id.decode('ascii')
593 record['client_uuid'] = msg['header']['session']
597 record['client_uuid'] = msg['header']['session']
594 record['queue'] = 'mux'
598 record['queue'] = 'mux'
595
599
596 try:
600 try:
600 # it's possible iopub arrived first:
604 # it's possible iopub arrived first:
598 existing = self.db.get_record(msg_id)
602 existing = self.db.get_record(msg_id)
599 for key,evalue in existing.iteritems():
603 for key,evalue in existing.iteritems():
600 rvalue = record.get(key, None)
604 rvalue = record.get(key, None)
601 if evalue and rvalue and evalue != rvalue:
605 if evalue and rvalue and evalue != rvalue:
602 self.log.warn("conflicting initial state for record: %r:%r <%r> %r", msg_id, rvalue, key, evalue)
606 self.log.warn("conflicting initial state for record: %r:%r <%r> %r", msg_id, rvalue, key, evalue)
603 elif evalue and not rvalue:
607 elif evalue and not rvalue:
604 record[key] = evalue
608 record[key] = evalue
605 try:
609 try:
606 self.db.update_record(msg_id, record)
610 self.db.update_record(msg_id, record)
607 except Exception:
611 except Exception:
608 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
612 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
609 except KeyError:
613 except KeyError:
610 try:
614 try:
611 self.db.add_record(msg_id, record)
615 self.db.add_record(msg_id, record)
612 except Exception:
616 except Exception:
613 self.log.error("DB Error adding record %r", msg_id, exc_info=True)
617 self.log.error("DB Error adding record %r", msg_id, exc_info=True)
614
618
615
619
616 self.pending.add(msg_id)
620 self.pending.add(msg_id)
617 self.queues[eid].append(msg_id)
621 self.queues[eid].append(msg_id)
618
622
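# The record-merge policy used above, distilled (hedged sketch; real task
# records carry many more keys): non-empty values already in the db win
# over empty incoming ones, and conflicting non-empty values keep the
# incoming value but are logged.
def _merge_record(existing, record, warn):
    for key, evalue in existing.items():
        rvalue = record.get(key, None)
        if evalue and rvalue and evalue != rvalue:
            warn(key)  # conflict: keep record[key], but complain
        elif evalue and not rvalue:
            record[key] = evalue
    return record

assert _merge_record({'queue': 'mux', 'stdout': ''},
                     {'queue': '', 'stdout': ''},
                     lambda key: None) == {'queue': 'mux', 'stdout': ''}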
619 def save_queue_result(self, idents, msg):
623 def save_queue_result(self, idents, msg):
620 if len(idents) < 2:
624 if len(idents) < 2:
621 self.log.error("invalid identity prefix: %r", idents)
625 self.log.error("invalid identity prefix: %r", idents)
622 return
626 return
623
627
624 client_id, queue_id = idents[:2]
628 client_id, queue_id = idents[:2]
625 try:
629 try:
626 msg = self.session.unserialize(msg)
630 msg = self.session.unserialize(msg)
627 except Exception:
631 except Exception:
628 self.log.error("queue::engine %r sent invalid message to %r: %r",
632 self.log.error("queue::engine %r sent invalid message to %r: %r",
629 queue_id, client_id, msg, exc_info=True)
633 queue_id, client_id, msg, exc_info=True)
630 return
634 return
631
635
632 eid = self.by_ident.get(queue_id, None)
636 eid = self.by_ident.get(queue_id, None)
633 if eid is None:
637 if eid is None:
634 self.log.error("queue::unknown engine %r is sending a reply: ", queue_id)
638 self.log.error("queue::unknown engine %r is sending a reply: ", queue_id)
635 return
639 return
636
640
637 parent = msg['parent_header']
641 parent = msg['parent_header']
638 if not parent:
642 if not parent:
639 return
643 return
640 msg_id = parent['msg_id']
644 msg_id = parent['msg_id']
641 if msg_id in self.pending:
645 if msg_id in self.pending:
642 self.pending.remove(msg_id)
646 self.pending.remove(msg_id)
643 self.all_completed.add(msg_id)
647 self.all_completed.add(msg_id)
644 self.queues[eid].remove(msg_id)
648 self.queues[eid].remove(msg_id)
645 self.completed[eid].append(msg_id)
649 self.completed[eid].append(msg_id)
646 self.log.info("queue::request %r completed on %s", msg_id, eid)
650 self.log.info("queue::request %r completed on %s", msg_id, eid)
647 elif msg_id not in self.all_completed:
651 elif msg_id not in self.all_completed:
648 # it could be a result from a dead engine that died before delivering the
652 # it could be a result from a dead engine that died before delivering the
649 # result
653 # result
650 self.log.warn("queue:: unknown msg finished %r", msg_id)
654 self.log.warn("queue:: unknown msg finished %r", msg_id)
651 return
655 return
652 # update record anyway, because the unregistration could have been premature
656 # update record anyway, because the unregistration could have been premature
653 rheader = msg['header']
657 rheader = msg['header']
654 completed = rheader['date']
658 completed = rheader['date']
655 started = rheader.get('started', None)
659 started = rheader.get('started', None)
656 result = {
660 result = {
657 'result_header' : rheader,
661 'result_header' : rheader,
658 'result_content': msg['content'],
662 'result_content': msg['content'],
659 'received': datetime.now(),
663 'received': datetime.now(),
660 'started' : started,
664 'started' : started,
661 'completed' : completed
665 'completed' : completed
662 }
666 }
663
667
664 result['result_buffers'] = msg['buffers']
668 result['result_buffers'] = msg['buffers']
665 try:
669 try:
666 self.db.update_record(msg_id, result)
670 self.db.update_record(msg_id, result)
667 except Exception:
671 except Exception:
668 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
672 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
669
673
670
674
671 #--------------------- Task Queue Traffic ------------------------------
675 #--------------------- Task Queue Traffic ------------------------------
672
676
673 def save_task_request(self, idents, msg):
677 def save_task_request(self, idents, msg):
674 """Save the submission of a task."""
678 """Save the submission of a task."""
675 client_id = idents[0]
679 client_id = idents[0]
676
680
677 try:
681 try:
678 msg = self.session.unserialize(msg)
682 msg = self.session.unserialize(msg)
679 except Exception:
683 except Exception:
680 self.log.error("task::client %r sent invalid task message: %r",
684 self.log.error("task::client %r sent invalid task message: %r",
681 client_id, msg, exc_info=True)
685 client_id, msg, exc_info=True)
682 return
686 return
683 record = init_record(msg)
687 record = init_record(msg)
684
688
685 record['client_uuid'] = msg['header']['session']
689 record['client_uuid'] = msg['header']['session']
686 record['queue'] = 'task'
690 record['queue'] = 'task'
687 header = msg['header']
691 header = msg['header']
688 msg_id = header['msg_id']
692 msg_id = header['msg_id']
689 self.pending.add(msg_id)
693 self.pending.add(msg_id)
690 self.unassigned.add(msg_id)
694 self.unassigned.add(msg_id)
691 try:
695 try:
692 # it's possible iopub arrived first:
696 # it's possible iopub arrived first:
693 existing = self.db.get_record(msg_id)
697 existing = self.db.get_record(msg_id)
694 if existing['resubmitted']:
698 if existing['resubmitted']:
695 for key in ('submitted', 'client_uuid', 'buffers'):
699 for key in ('submitted', 'client_uuid', 'buffers'):
696 # don't clobber these keys on resubmit
700 # don't clobber these keys on resubmit
697 # submitted and client_uuid should be different
701 # submitted and client_uuid should be different
698 # and buffers might be big, and shouldn't have changed
702 # and buffers might be big, and shouldn't have changed
699 record.pop(key)
703 record.pop(key)
700 # still check content and header, which should not change
704 # still check content and header, which should not change
701 # but are not as expensive to compare as buffers
705 # but are not as expensive to compare as buffers
702
706
703 for key,evalue in existing.iteritems():
707 for key,evalue in existing.iteritems():
704 if key.endswith('buffers'):
708 if key.endswith('buffers'):
705 # don't compare buffers
709 # don't compare buffers
706 continue
710 continue
707 rvalue = record.get(key, None)
711 rvalue = record.get(key, None)
708 if evalue and rvalue and evalue != rvalue:
712 if evalue and rvalue and evalue != rvalue:
709 self.log.warn("conflicting initial state for record: %r:%r <%r> %r", msg_id, rvalue, key, evalue)
713 self.log.warn("conflicting initial state for record: %r:%r <%r> %r", msg_id, rvalue, key, evalue)
710 elif evalue and not rvalue:
714 elif evalue and not rvalue:
711 record[key] = evalue
715 record[key] = evalue
712 try:
716 try:
713 self.db.update_record(msg_id, record)
717 self.db.update_record(msg_id, record)
714 except Exception:
718 except Exception:
715 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
719 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
716 except KeyError:
720 except KeyError:
717 try:
721 try:
718 self.db.add_record(msg_id, record)
722 self.db.add_record(msg_id, record)
719 except Exception:
723 except Exception:
720 self.log.error("DB Error adding record %r", msg_id, exc_info=True)
724 self.log.error("DB Error adding record %r", msg_id, exc_info=True)
721 except Exception:
725 except Exception:
722 self.log.error("DB Error saving task request %r", msg_id, exc_info=True)
726 self.log.error("DB Error saving task request %r", msg_id, exc_info=True)
723
727
724 def save_task_result(self, idents, msg):
728 def save_task_result(self, idents, msg):
725 """save the result of a completed task."""
729 """save the result of a completed task."""
726 client_id = idents[0]
730 client_id = idents[0]
727 try:
731 try:
728 msg = self.session.unserialize(msg)
732 msg = self.session.unserialize(msg)
729 except Exception:
733 except Exception:
730 self.log.error("task::invalid task result message send to %r: %r",
734 self.log.error("task::invalid task result message send to %r: %r",
731 client_id, msg, exc_info=True)
735 client_id, msg, exc_info=True)
732 return
736 return
733
737
734 parent = msg['parent_header']
738 parent = msg['parent_header']
735 if not parent:
739 if not parent:
736 # print msg
740 # print msg
737 self.log.warn("Task %r had no parent!", msg)
741 self.log.warn("Task %r had no parent!", msg)
738 return
742 return
739 msg_id = parent['msg_id']
743 msg_id = parent['msg_id']
740 if msg_id in self.unassigned:
744 if msg_id in self.unassigned:
741 self.unassigned.remove(msg_id)
745 self.unassigned.remove(msg_id)
742
746
743 header = msg['header']
747 header = msg['header']
744 engine_uuid = header.get('engine', u'')
748 engine_uuid = header.get('engine', u'')
745 eid = self.by_ident.get(cast_bytes(engine_uuid), None)
749 eid = self.by_ident.get(cast_bytes(engine_uuid), None)
746
750
747 status = header.get('status', None)
751 status = header.get('status', None)
748
752
749 if msg_id in self.pending:
753 if msg_id in self.pending:
750 self.log.info("task::task %r finished on %s", msg_id, eid)
754 self.log.info("task::task %r finished on %s", msg_id, eid)
751 self.pending.remove(msg_id)
755 self.pending.remove(msg_id)
752 self.all_completed.add(msg_id)
756 self.all_completed.add(msg_id)
753 if eid is not None:
757 if eid is not None:
754 if status != 'aborted':
758 if status != 'aborted':
755 self.completed[eid].append(msg_id)
759 self.completed[eid].append(msg_id)
756 if msg_id in self.tasks[eid]:
760 if msg_id in self.tasks[eid]:
757 self.tasks[eid].remove(msg_id)
761 self.tasks[eid].remove(msg_id)
758 completed = header['date']
762 completed = header['date']
759 started = header.get('started', None)
763 started = header.get('started', None)
760 result = {
764 result = {
761 'result_header' : header,
765 'result_header' : header,
762 'result_content': msg['content'],
766 'result_content': msg['content'],
763 'started' : started,
767 'started' : started,
764 'completed' : completed,
768 'completed' : completed,
765 'received' : datetime.now(),
769 'received' : datetime.now(),
766 'engine_uuid': engine_uuid,
770 'engine_uuid': engine_uuid,
767 }
771 }
768
772
769 result['result_buffers'] = msg['buffers']
773 result['result_buffers'] = msg['buffers']
770 try:
774 try:
771 self.db.update_record(msg_id, result)
775 self.db.update_record(msg_id, result)
772 except Exception:
776 except Exception:
773 self.log.error("DB Error saving task request %r", msg_id, exc_info=True)
777 self.log.error("DB Error saving task request %r", msg_id, exc_info=True)
774
778
775 else:
779 else:
776 self.log.debug("task::unknown task %r finished", msg_id)
780 self.log.debug("task::unknown task %r finished", msg_id)
777
781
778 def save_task_destination(self, idents, msg):
782 def save_task_destination(self, idents, msg):
779 try:
783 try:
780 msg = self.session.unserialize(msg, content=True)
784 msg = self.session.unserialize(msg, content=True)
781 except Exception:
785 except Exception:
782 self.log.error("task::invalid task tracking message", exc_info=True)
786 self.log.error("task::invalid task tracking message", exc_info=True)
783 return
787 return
784 content = msg['content']
788 content = msg['content']
785 # print (content)
789 # print (content)
786 msg_id = content['msg_id']
790 msg_id = content['msg_id']
787 engine_uuid = content['engine_id']
791 engine_uuid = content['engine_id']
788 eid = self.by_ident[cast_bytes(engine_uuid)]
792 eid = self.by_ident[cast_bytes(engine_uuid)]
789
793
790 self.log.info("task::task %r arrived on %r", msg_id, eid)
794 self.log.info("task::task %r arrived on %r", msg_id, eid)
791 if msg_id in self.unassigned:
795 if msg_id in self.unassigned:
792 self.unassigned.remove(msg_id)
796 self.unassigned.remove(msg_id)
793 # else:
797 # else:
794 # self.log.debug("task::task %r not listed as MIA?!"%(msg_id))
798 # self.log.debug("task::task %r not listed as MIA?!"%(msg_id))
795
799
796 self.tasks[eid].append(msg_id)
800 self.tasks[eid].append(msg_id)
797 # self.pending[msg_id][1].update(received=datetime.now(),engine=(eid,engine_uuid))
801 # self.pending[msg_id][1].update(received=datetime.now(),engine=(eid,engine_uuid))
798 try:
802 try:
799 self.db.update_record(msg_id, dict(engine_uuid=engine_uuid))
803 self.db.update_record(msg_id, dict(engine_uuid=engine_uuid))
800 except Exception:
804 except Exception:
801 self.log.error("DB Error saving task destination %r", msg_id, exc_info=True)
805 self.log.error("DB Error saving task destination %r", msg_id, exc_info=True)
802
806
803
807
804 def mia_task_request(self, idents, msg):
808 def mia_task_request(self, idents, msg):
805 raise NotImplementedError
809 raise NotImplementedError
806 # client_id = idents[0]
810 # client_id = idents[0]
807 # content = dict(mia=self.mia,status='ok')
811 # content = dict(mia=self.mia,status='ok')
808 # self.session.send('mia_reply', content=content, idents=client_id)
812 # self.session.send('mia_reply', content=content, idents=client_id)
809
813
810
814
811 #--------------------- IOPub Traffic ------------------------------
815 #--------------------- IOPub Traffic ------------------------------
812
816
813 def save_iopub_message(self, topics, msg):
817 def save_iopub_message(self, topics, msg):
814 """save an iopub message into the db"""
818 """save an iopub message into the db"""
815 # print (topics)
819 # print (topics)
816 try:
820 try:
817 msg = self.session.unserialize(msg, content=True)
821 msg = self.session.unserialize(msg, content=True)
818 except Exception:
822 except Exception:
819 self.log.error("iopub::invalid IOPub message", exc_info=True)
823 self.log.error("iopub::invalid IOPub message", exc_info=True)
820 return
824 return
821
825
822 parent = msg['parent_header']
826 parent = msg['parent_header']
823 if not parent:
827 if not parent:
824 self.log.warn("iopub::IOPub message lacks parent: %r", msg)
828 self.log.warn("iopub::IOPub message lacks parent: %r", msg)
825 return
829 return
826 msg_id = parent['msg_id']
830 msg_id = parent['msg_id']
827 msg_type = msg['header']['msg_type']
831 msg_type = msg['header']['msg_type']
828 content = msg['content']
832 content = msg['content']
829
833
830 # ensure msg_id is in db
834 # ensure msg_id is in db
831 try:
835 try:
832 rec = self.db.get_record(msg_id)
836 rec = self.db.get_record(msg_id)
833 except KeyError:
837 except KeyError:
834 rec = empty_record()
838 rec = empty_record()
835 rec['msg_id'] = msg_id
839 rec['msg_id'] = msg_id
836 self.db.add_record(msg_id, rec)
840 self.db.add_record(msg_id, rec)
837 # stream
841 # stream
838 d = {}
842 d = {}
839 if msg_type == 'stream':
843 if msg_type == 'stream':
840 name = content['name']
844 name = content['name']
841 s = rec[name] or ''
845 s = rec[name] or ''
842 d[name] = s + content['data']
846 d[name] = s + content['data']
843
847
844 elif msg_type == 'pyerr':
848 elif msg_type == 'pyerr':
845 d['pyerr'] = content
849 d['pyerr'] = content
846 elif msg_type == 'pyin':
850 elif msg_type == 'pyin':
847 d['pyin'] = content['code']
851 d['pyin'] = content['code']
848 elif msg_type in ('display_data', 'pyout'):
852 elif msg_type in ('display_data', 'pyout'):
849 d[msg_type] = content
853 d[msg_type] = content
850 elif msg_type == 'status':
854 elif msg_type == 'status':
851 pass
855 pass
852 else:
856 else:
853 self.log.warn("unhandled iopub msg_type: %r", msg_type)
857 self.log.warn("unhandled iopub msg_type: %r", msg_type)
854
858
855 if not d:
859 if not d:
856 return
860 return
857
861
858 try:
862 try:
859 self.db.update_record(msg_id, d)
863 self.db.update_record(msg_id, d)
860 except Exception:
864 except Exception:
861 self.log.error("DB Error saving iopub message %r", msg_id, exc_info=True)
865 self.log.error("DB Error saving iopub message %r", msg_id, exc_info=True)
862
866
863
867
864
868
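# How consecutive 'stream' iopub messages accumulate under the handler
# above (assumed starting record with an empty stdout field):
_rec = {'stdout': ''}
for _chunk in ('hello ', 'world\n'):
    _rec['stdout'] = (_rec['stdout'] or '') + _chunk
assert _rec['stdout'] == 'hello world\n'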
865 #-------------------------------------------------------------------------
869 #-------------------------------------------------------------------------
866 # Registration requests
870 # Registration requests
867 #-------------------------------------------------------------------------
871 #-------------------------------------------------------------------------
868
872
869 def connection_request(self, client_id, msg):
873 def connection_request(self, client_id, msg):
870 """Reply with connection addresses for clients."""
874 """Reply with connection addresses for clients."""
871 self.log.info("client::client %r connected", client_id)
875 self.log.info("client::client %r connected", client_id)
872 content = dict(status='ok')
876 content = dict(status='ok')
873 jsonable = {}
877 jsonable = {}
874 for k,v in self.keytable.iteritems():
878 for k,v in self.keytable.iteritems():
875 if v not in self.dead_engines:
879 if v not in self.dead_engines:
876 jsonable[str(k)] = v.decode('ascii')
880 jsonable[str(k)] = v
877 content['engines'] = jsonable
881 content['engines'] = jsonable
878 self.session.send(self.query, 'connection_reply', content, parent=msg, ident=client_id)
882 self.session.send(self.query, 'connection_reply', content, parent=msg, ident=client_id)
879
883
880 def register_engine(self, reg, msg):
884 def register_engine(self, reg, msg):
881 """Register a new engine."""
885 """Register a new engine."""
882 content = msg['content']
886 content = msg['content']
883 try:
887 try:
884 queue = cast_bytes(content['queue'])
888 uuid = content['uuid']
885 except KeyError:
889 except KeyError:
886 self.log.error("registration::queue not specified", exc_info=True)
890 self.log.error("registration::queue not specified", exc_info=True)
887 return
891 return
888 heart = content.get('heartbeat', None)
892
889 if heart:
890 heart = cast_bytes(heart)
891 """register a new engine, and create the socket(s) necessary"""
892 eid = self._next_id
893 eid = self._next_id
893 # print (eid, queue, reg, heart)
894
894
895 self.log.debug("registration::register_engine(%i, %r, %r, %r)", eid, queue, reg, heart)
895 self.log.debug("registration::register_engine(%i, %r)", eid, uuid)
896
896
897 content = dict(id=eid,status='ok')
897 content = dict(id=eid,status='ok')
898 # check if requesting available IDs:
898 # check if requesting available IDs:
899 if queue in self.by_ident:
899 if uuid in self.by_ident:
900 try:
900 try:
901 raise KeyError("queue_id %r in use" % queue)
901 raise KeyError("uuid %r in use" % uuid)
902 except:
902 except:
903 content = error.wrap_exception()
903 content = error.wrap_exception()
904 self.log.error("queue_id %r in use", queue, exc_info=True)
904 self.log.error("uuid %r in use", uuid, exc_info=True)
905 elif heart in self.hearts: # need to check unique hearts?
906 try:
907 raise KeyError("heart_id %r in use" % heart)
908 except:
909 self.log.error("heart_id %r in use", heart, exc_info=True)
910 content = error.wrap_exception()
911 else:
905 else:
912 for h, pack in self.incoming_registrations.iteritems():
906 for h, ec in self.incoming_registrations.iteritems():
913 if heart == h:
907 if uuid == h:
914 try:
908 try:
915 raise KeyError("heart_id %r in use" % heart)
909 raise KeyError("heart_id %r in use" % uuid)
916 except:
910 except:
917 self.log.error("heart_id %r in use", heart, exc_info=True)
911 self.log.error("heart_id %r in use", uuid, exc_info=True)
918 content = error.wrap_exception()
912 content = error.wrap_exception()
919 break
913 break
920 elif queue == pack[1]:
914 elif uuid == ec.uuid:
921 try:
915 try:
922 raise KeyError("queue_id %r in use" % queue)
916 raise KeyError("uuid %r in use" % uuid)
923 except:
917 except:
924 self.log.error("queue_id %r in use", queue, exc_info=True)
918 self.log.error("uuid %r in use", uuid, exc_info=True)
925 content = error.wrap_exception()
919 content = error.wrap_exception()
926 break
920 break
927
921
928 msg = self.session.send(self.query, "registration_reply",
922 msg = self.session.send(self.query, "registration_reply",
929 content=content,
923 content=content,
930 ident=reg)
924 ident=reg)
931
925
926 heart = util.asbytes(uuid)
927
932 if content['status'] == 'ok':
928 if content['status'] == 'ok':
933 if heart in self.heartmonitor.hearts:
929 if heart in self.heartmonitor.hearts:
934 # already beating
930 # already beating
935 self.incoming_registrations[heart] = (eid,queue,reg[0],None)
931 self.incoming_registrations[heart] = EngineConnector(id=eid,uuid=uuid)
936 self.finish_registration(heart)
932 self.finish_registration(heart)
937 else:
933 else:
938 purge = lambda : self._purge_stalled_registration(heart)
934 purge = lambda : self._purge_stalled_registration(heart)
939 dc = ioloop.DelayedCallback(purge, self.registration_timeout, self.loop)
935 dc = ioloop.DelayedCallback(purge, self.registration_timeout, self.loop)
940 dc.start()
936 dc.start()
941 self.incoming_registrations[heart] = (eid,queue,reg[0],dc)
937 self.incoming_registrations[heart] = EngineConnector(id=eid,uuid=uuid,stallback=dc)
942 else:
938 else:
943 self.log.error("registration::registration %i failed: %r", eid, content['evalue'])
939 self.log.error("registration::registration %i failed: %r", eid, content['evalue'])
940
944 return eid
941 return eid
945
942
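# EngineConnector is defined elsewhere in this module; judging only from
# its use here, a minimal stand-in would look roughly like this (the
# attribute names are taken from the calls above, everything else assumed):
class _EngineConnectorSketch(object):
    def __init__(self, id, uuid, stallback=None):
        self.id = id                # integer engine id
        self.uuid = uuid            # identity shared by queue and heartbeat
        self.stallback = stallback  # DelayedCallback purging a stalled registration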
946 def unregister_engine(self, ident, msg):
943 def unregister_engine(self, ident, msg):
947 """Unregister an engine that explicitly requested to leave."""
944 """Unregister an engine that explicitly requested to leave."""
948 try:
945 try:
949 eid = msg['content']['id']
946 eid = msg['content']['id']
950 except:
947 except:
951 self.log.error("registration::bad engine id for unregistration: %r", ident, exc_info=True)
948 self.log.error("registration::bad engine id for unregistration: %r", ident, exc_info=True)
952 return
949 return
953 self.log.info("registration::unregister_engine(%r)", eid)
950 self.log.info("registration::unregister_engine(%r)", eid)
954 # print (eid)
951 # print (eid)
955 uuid = self.keytable[eid]
952 uuid = self.keytable[eid]
956 content=dict(id=eid, queue=uuid.decode('ascii'))
953 content=dict(id=eid, uuid=uuid)
957 self.dead_engines.add(uuid)
954 self.dead_engines.add(uuid)
958 # self.ids.remove(eid)
955 # self.ids.remove(eid)
959 # uuid = self.keytable.pop(eid)
956 # uuid = self.keytable.pop(eid)
960 #
957 #
961 # ec = self.engines.pop(eid)
958 # ec = self.engines.pop(eid)
962 # self.hearts.pop(ec.heartbeat)
959 # self.hearts.pop(ec.heartbeat)
963 # self.by_ident.pop(ec.queue)
960 # self.by_ident.pop(ec.queue)
964 # self.completed.pop(eid)
961 # self.completed.pop(eid)
965 handleit = lambda : self._handle_stranded_msgs(eid, uuid)
962 handleit = lambda : self._handle_stranded_msgs(eid, uuid)
966 dc = ioloop.DelayedCallback(handleit, self.registration_timeout, self.loop)
963 dc = ioloop.DelayedCallback(handleit, self.registration_timeout, self.loop)
967 dc.start()
964 dc.start()
968 ############## TODO: HANDLE IT ################
965 ############## TODO: HANDLE IT ################
969
966
967 self._save_engine_state()
968
970 if self.notifier:
969 if self.notifier:
971 self.session.send(self.notifier, "unregistration_notification", content=content)
970 self.session.send(self.notifier, "unregistration_notification", content=content)
972
971
973 def _handle_stranded_msgs(self, eid, uuid):
972 def _handle_stranded_msgs(self, eid, uuid):
974 """Handle messages known to be on an engine when the engine unregisters.
973 """Handle messages known to be on an engine when the engine unregisters.
975
974
976 It is possible that this will fire prematurely - that is, an engine will
975 It is possible that this will fire prematurely - that is, an engine will
977 go down after completing a result, and the client will be notified
976 go down after completing a result, and the client will be notified
978 that the result failed and later receive the actual result.
977 that the result failed and later receive the actual result.
979 """
978 """
980
979
981 outstanding = self.queues[eid]
980 outstanding = self.queues[eid]
982
981
983 for msg_id in outstanding:
982 for msg_id in outstanding:
984 self.pending.remove(msg_id)
983 self.pending.remove(msg_id)
985 self.all_completed.add(msg_id)
984 self.all_completed.add(msg_id)
986 try:
985 try:
987 raise error.EngineError("Engine %r died while running task %r" % (eid, msg_id))
986 raise error.EngineError("Engine %r died while running task %r" % (eid, msg_id))
988 except:
987 except:
989 content = error.wrap_exception()
988 content = error.wrap_exception()
990 # build a fake header:
989 # build a fake header:
991 header = {}
990 header = {}
992 header['engine'] = uuid
991 header['engine'] = uuid
993 header['date'] = datetime.now()
992 header['date'] = datetime.now()
994 rec = dict(result_content=content, result_header=header, result_buffers=[])
993 rec = dict(result_content=content, result_header=header, result_buffers=[])
995 rec['completed'] = header['date']
994 rec['completed'] = header['date']
996 rec['engine_uuid'] = uuid
995 rec['engine_uuid'] = uuid
997 try:
996 try:
998 self.db.update_record(msg_id, rec)
997 self.db.update_record(msg_id, rec)
999 except Exception:
998 except Exception:
1000 self.log.error("DB Error handling stranded msg %r", msg_id, exc_info=True)
999 self.log.error("DB Error handling stranded msg %r", msg_id, exc_info=True)
1001
1000
1002
1001
1003 def finish_registration(self, heart):
1002 def finish_registration(self, heart):
1004 """Second half of engine registration, called after our HeartMonitor
1003 """Second half of engine registration, called after our HeartMonitor
1005 has received a beat from the Engine's Heart."""
1004 has received a beat from the Engine's Heart."""
1006 try:
1005 try:
1007 (eid,queue,reg,purge) = self.incoming_registrations.pop(heart)
1006 ec = self.incoming_registrations.pop(heart)
1008 except KeyError:
1007 except KeyError:
1009 self.log.error("registration::tried to finish nonexistent registration", exc_info=True)
1008 self.log.error("registration::tried to finish nonexistent registration", exc_info=True)
1010 return
1009 return
1011 self.log.info("registration::finished registering engine %i:%r", eid, queue)
1010 self.log.info("registration::finished registering engine %i:%s", ec.id, ec.uuid)
1012 if purge is not None:
1011 if ec.stallback is not None:
1013 purge.stop()
1012 ec.stallback.stop()
1014 control = queue
1013 eid = ec.id
1015 self.ids.add(eid)
1014 self.ids.add(eid)
1016 self.keytable[eid] = queue
1015 self.keytable[eid] = ec.uuid
1017 self.engines[eid] = EngineConnector(id=eid, queue=queue, registration=reg,
1016 self.engines[eid] = ec
1018 control=control, heartbeat=heart)
1017 self.by_ident[ec.uuid] = ec.id
1019 self.by_ident[queue] = eid
1020 self.queues[eid] = list()
1018 self.queues[eid] = list()
1021 self.tasks[eid] = list()
1019 self.tasks[eid] = list()
1022 self.completed[eid] = list()
1020 self.completed[eid] = list()
1023 self.hearts[heart] = eid
1021 self.hearts[heart] = eid
1024 content = dict(id=eid, queue=self.engines[eid].queue.decode('ascii'))
1022 content = dict(id=eid, uuid=self.engines[eid].uuid)
1025 if self.notifier:
1023 if self.notifier:
1026 self.session.send(self.notifier, "registration_notification", content=content)
1024 self.session.send(self.notifier, "registration_notification", content=content)
1027 self.log.info("engine::Engine Connected: %i", eid)
1025 self.log.info("engine::Engine Connected: %i", eid)
1028
1026
1027 self._save_engine_state()
1028
1029 def _purge_stalled_registration(self, heart):
1029 def _purge_stalled_registration(self, heart):
1030 if heart in self.incoming_registrations:
1030 if heart in self.incoming_registrations:
1031 eid = self.incoming_registrations.pop(heart)[0]
1031 ec = self.incoming_registrations.pop(heart)
1032 self.log.info("registration::purging stalled registration: %i", eid)
1032 self.log.info("registration::purging stalled registration: %i", ec.id)
1033 else:
1033 else:
1034 pass
1034 pass
1035
1035
1036 #-------------------------------------------------------------------------
1036 #-------------------------------------------------------------------------
1037 # Engine State
1038 #-------------------------------------------------------------------------
1039
1040
1041 def _cleanup_engine_state_file(self):
1042 """cleanup engine state mapping"""
1043
1044 if os.path.exists(self.engine_state_file):
1045 self.log.debug("cleaning up engine state: %s", self.engine_state_file)
1046 try:
1047 os.remove(self.engine_state_file)
1048 except (IOError, OSError):
1049 self.log.error("Couldn't cleanup file: %s", self.engine_state_file, exc_info=True)
1050
1051
1052 def _save_engine_state(self):
1053 """save engine mapping to JSON file"""
1054 if not self.engine_state_file:
1055 return
1056 self.log.debug("save engine state to %s", self.engine_state_file)
1057 state = {}
1058 engines = {}
1059 for eid, ec in self.engines.iteritems():
1060 if ec.uuid not in self.dead_engines:
1061 engines[eid] = ec.uuid
1062
1063 state['engines'] = engines
1064
1065 state['next_id'] = self._idcounter
1066
1067 with open(self.engine_state_file, 'w') as f:
1068 json.dump(state, f)
1069
1070
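# Hypothetical contents of engine_state_file as written above. JSON object
# keys are always strings, so engine ids round-trip as text and are cast
# back with int() in _load_engine_state below.
_example_state = {
    "engines": {"0": "0c1e4b7a-aaaa-bbbb-cccc-000000000000",
                "1": "0c1e4b7a-aaaa-bbbb-cccc-000000000001"},
    "next_id": 2,
}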
1071 def _load_engine_state(self):
1072 """load engine mapping from JSON file"""
1073 if not os.path.exists(self.engine_state_file):
1074 return
1075
1076 self.log.info("loading engine state from %s", self.engine_state_file)
1077
1078 with open(self.engine_state_file) as f:
1079 state = json.load(f)
1080
1081 save_notifier = self.notifier
1082 self.notifier = None
1083 for eid, uuid in state['engines'].iteritems():
1084 heart = uuid.encode('ascii')
1085 # start with this heart as current and beating:
1086 self.heartmonitor.responses.add(heart)
1087 self.heartmonitor.hearts.add(heart)
1088
1089 self.incoming_registrations[heart] = EngineConnector(id=int(eid), uuid=uuid)
1090 self.finish_registration(heart)
1091
1092 self.notifier = save_notifier
1093
1094 self._idcounter = state['next_id']
1095
1096 #-------------------------------------------------------------------------
1037 # Client Requests
1097 # Client Requests
1038 #-------------------------------------------------------------------------
1098 #-------------------------------------------------------------------------
1039
1099
1040 def shutdown_request(self, client_id, msg):
1100 def shutdown_request(self, client_id, msg):
1041 """handle shutdown request."""
1101 """handle shutdown request."""
1042 self.session.send(self.query, 'shutdown_reply', content={'status': 'ok'}, ident=client_id)
1102 self.session.send(self.query, 'shutdown_reply', content={'status': 'ok'}, ident=client_id)
1043 # also notify other clients of shutdown
1103 # also notify other clients of shutdown
1044 self.session.send(self.notifier, 'shutdown_notice', content={'status': 'ok'})
1104 self.session.send(self.notifier, 'shutdown_notice', content={'status': 'ok'})
1045 dc = ioloop.DelayedCallback(lambda : self._shutdown(), 1000, self.loop)
1105 dc = ioloop.DelayedCallback(lambda : self._shutdown(), 1000, self.loop)
1046 dc.start()
1106 dc.start()
1047
1107
1048 def _shutdown(self):
1108 def _shutdown(self):
1049 self.log.info("hub::hub shutting down.")
1109 self.log.info("hub::hub shutting down.")
1050 time.sleep(0.1)
1110 time.sleep(0.1)
1051 sys.exit(0)
1111 sys.exit(0)
1052
1112
1053
1113
1054 def check_load(self, client_id, msg):
1114 def check_load(self, client_id, msg):
1055 content = msg['content']
1115 content = msg['content']
1056 try:
1116 try:
1057 targets = content['targets']
1117 targets = content['targets']
1058 targets = self._validate_targets(targets)
1118 targets = self._validate_targets(targets)
1059 except:
1119 except:
1060 content = error.wrap_exception()
1120 content = error.wrap_exception()
1061 self.session.send(self.query, "hub_error",
1121 self.session.send(self.query, "hub_error",
1062 content=content, ident=client_id)
1122 content=content, ident=client_id)
1063 return
1123 return
1064
1124
1065 content = dict(status='ok')
1125 content = dict(status='ok')
1066 # loads = {}
1126 # loads = {}
1067 for t in targets:
1127 for t in targets:
1068 content[bytes(t)] = len(self.queues[t])+len(self.tasks[t])
1128 content[bytes(t)] = len(self.queues[t])+len(self.tasks[t])
1069 self.session.send(self.query, "load_reply", content=content, ident=client_id)
1129 self.session.send(self.query, "load_reply", content=content, ident=client_id)
1070
1130
1071
1131
1072 def queue_status(self, client_id, msg):
1132 def queue_status(self, client_id, msg):
1073 """Return the Queue status of one or more targets.
1133 """Return the Queue status of one or more targets.
1074 if verbose: return the msg_ids
1134 if verbose: return the msg_ids
1075 else: return len of each type.
1135 else: return len of each type.
1076 keys: queue (pending MUX jobs)
1136 keys: queue (pending MUX jobs)
1077 tasks (pending Task jobs)
1137 tasks (pending Task jobs)
1078 completed (finished jobs from both queues)"""
1138 completed (finished jobs from both queues)"""
1079 content = msg['content']
1139 content = msg['content']
1080 targets = content['targets']
1140 targets = content['targets']
1081 try:
1141 try:
1082 targets = self._validate_targets(targets)
1142 targets = self._validate_targets(targets)
1083 except:
1143 except:
1084 content = error.wrap_exception()
1144 content = error.wrap_exception()
1085 self.session.send(self.query, "hub_error",
1145 self.session.send(self.query, "hub_error",
1086 content=content, ident=client_id)
1146 content=content, ident=client_id)
1087 return
1147 return
1088 verbose = content.get('verbose', False)
1148 verbose = content.get('verbose', False)
1089 content = dict(status='ok')
1149 content = dict(status='ok')
1090 for t in targets:
1150 for t in targets:
1091 queue = self.queues[t]
1151 queue = self.queues[t]
1092 completed = self.completed[t]
1152 completed = self.completed[t]
1093 tasks = self.tasks[t]
1153 tasks = self.tasks[t]
1094 if not verbose:
1154 if not verbose:
1095 queue = len(queue)
1155 queue = len(queue)
1096 completed = len(completed)
1156 completed = len(completed)
1097 tasks = len(tasks)
1157 tasks = len(tasks)
1098 content[str(t)] = {'queue': queue, 'completed': completed, 'tasks': tasks}
1158 content[str(t)] = {'queue': queue, 'completed': completed, 'tasks': tasks}
1099 content['unassigned'] = list(self.unassigned) if verbose else len(self.unassigned)
1159 content['unassigned'] = list(self.unassigned) if verbose else len(self.unassigned)
1100 # print (content)
1160 # print (content)
1101 self.session.send(self.query, "queue_reply", content=content, ident=client_id)
1161 self.session.send(self.query, "queue_reply", content=content, ident=client_id)
1102
1162
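# A hypothetical queue_reply content for two engines with verbose=False;
# per-engine keys are stringified engine ids, the values are counts.
_reply = {
    'status': 'ok',
    '0': {'queue': 2, 'completed': 10, 'tasks': 1},
    '1': {'queue': 0, 'completed': 12, 'tasks': 0},
    'unassigned': 0,
}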
1103 def purge_results(self, client_id, msg):
1163 def purge_results(self, client_id, msg):
1104 """Purge results from memory. This method is more valuable before we move
1164 """Purge results from memory. This method is more valuable before we move
1105 to a DB based message storage mechanism."""
1165 to a DB based message storage mechanism."""
1106 content = msg['content']
1166 content = msg['content']
1107 self.log.info("Dropping records with %s", content)
1167 self.log.info("Dropping records with %s", content)
1108 msg_ids = content.get('msg_ids', [])
1168 msg_ids = content.get('msg_ids', [])
1109 reply = dict(status='ok')
1169 reply = dict(status='ok')
1110 if msg_ids == 'all':
1170 if msg_ids == 'all':
1111 try:
1171 try:
1112 self.db.drop_matching_records(dict(completed={'$ne':None}))
1172 self.db.drop_matching_records(dict(completed={'$ne':None}))
1113 except Exception:
1173 except Exception:
1114 reply = error.wrap_exception()
1174 reply = error.wrap_exception()
1115 else:
1175 else:
1116 pending = filter(lambda m: m in self.pending, msg_ids)
1176 pending = filter(lambda m: m in self.pending, msg_ids)
1117 if pending:
1177 if pending:
1118 try:
1178 try:
1119 raise IndexError("msg pending: %r" % pending[0])
1179 raise IndexError("msg pending: %r" % pending[0])
1120 except:
1180 except:
1121 reply = error.wrap_exception()
1181 reply = error.wrap_exception()
1122 else:
1182 else:
1123 try:
1183 try:
1124 self.db.drop_matching_records(dict(msg_id={'$in':msg_ids}))
1184 self.db.drop_matching_records(dict(msg_id={'$in':msg_ids}))
1125 except Exception:
1185 except Exception:
1126 reply = error.wrap_exception()
1186 reply = error.wrap_exception()
1127
1187
1128 if reply['status'] == 'ok':
1188 if reply['status'] == 'ok':
1129 eids = content.get('engine_ids', [])
1189 eids = content.get('engine_ids', [])
1130 for eid in eids:
1190 for eid in eids:
1131 if eid not in self.engines:
1191 if eid not in self.engines:
1132 try:
1192 try:
1133 raise IndexError("No such engine: %i" % eid)
1193 raise IndexError("No such engine: %i" % eid)
1134 except:
1194 except:
1135 reply = error.wrap_exception()
1195 reply = error.wrap_exception()
1136 break
1196 break
1137 uid = self.engines[eid].queue
1197 uid = self.engines[eid].uuid
1138 try:
1198 try:
1139 self.db.drop_matching_records(dict(engine_uuid=uid, completed={'$ne':None}))
1199 self.db.drop_matching_records(dict(engine_uuid=uid, completed={'$ne':None}))
1140 except Exception:
1200 except Exception:
1141 reply = error.wrap_exception()
1201 reply = error.wrap_exception()
1142 break
1202 break
1143
1203
1144 self.session.send(self.query, 'purge_reply', content=reply, ident=client_id)
1204 self.session.send(self.query, 'purge_reply', content=reply, ident=client_id)
1145
1205
1146 def resubmit_task(self, client_id, msg):
1206 def resubmit_task(self, client_id, msg):
1147 """Resubmit one or more tasks."""
1207 """Resubmit one or more tasks."""
1148 def finish(reply):
1208 def finish(reply):
1149 self.session.send(self.query, 'resubmit_reply', content=reply, ident=client_id)
1209 self.session.send(self.query, 'resubmit_reply', content=reply, ident=client_id)
1150
1210
1151 content = msg['content']
1211 content = msg['content']
1152 msg_ids = content['msg_ids']
1212 msg_ids = content['msg_ids']
1153 reply = dict(status='ok')
1213 reply = dict(status='ok')
1154 try:
1214 try:
1155 records = self.db.find_records({'msg_id' : {'$in' : msg_ids}}, keys=[
1215 records = self.db.find_records({'msg_id' : {'$in' : msg_ids}}, keys=[
1156 'header', 'content', 'buffers'])
1216 'header', 'content', 'buffers'])
1157 except Exception:
1217 except Exception:
1158 self.log.error('db::db error finding tasks to resubmit', exc_info=True)
1218 self.log.error('db::db error finding tasks to resubmit', exc_info=True)
1159 return finish(error.wrap_exception())
1219 return finish(error.wrap_exception())
1160
1220
1161 # validate msg_ids
1221 # validate msg_ids
1162 found_ids = [ rec['msg_id'] for rec in records ]
1222 found_ids = [ rec['msg_id'] for rec in records ]
1163 pending_ids = [ msg_id for msg_id in found_ids if msg_id in self.pending ]
1223 pending_ids = [ msg_id for msg_id in found_ids if msg_id in self.pending ]
1164 if len(records) > len(msg_ids):
1224 if len(records) > len(msg_ids):
1165 try:
1225 try:
1166 raise RuntimeError("DB appears to be in an inconsistent state. "
1226 raise RuntimeError("DB appears to be in an inconsistent state. "
1167 "More matching records were found than should exist")
1227 "More matching records were found than should exist")
1168 except Exception:
1228 except Exception:
1169 return finish(error.wrap_exception())
1229 return finish(error.wrap_exception())
1170 elif len(records) < len(msg_ids):
1230 elif len(records) < len(msg_ids):
1171 missing = [ m for m in msg_ids if m not in found_ids ]
1231 missing = [ m for m in msg_ids if m not in found_ids ]
1172 try:
1232 try:
1173 raise KeyError("No such msg(s): %r" % missing)
1233 raise KeyError("No such msg(s): %r" % missing)
1174 except KeyError:
1234 except KeyError:
1175 return finish(error.wrap_exception())
1235 return finish(error.wrap_exception())
1176 elif pending_ids:
1236 elif pending_ids:
1177 pass
1237 pass
1178 # no need to raise on resubmit of pending task, now that we
1238 # no need to raise on resubmit of pending task, now that we
1179 # resubmit under new ID, but do we want to raise anyway?
1239 # resubmit under new ID, but do we want to raise anyway?
1180 # msg_id = invalid_ids[0]
1240 # msg_id = invalid_ids[0]
1181 # try:
1241 # try:
1182 # raise ValueError("Task(s) %r appears to be inflight" % )
1242 # raise ValueError("Task(s) %r appears to be inflight" % )
1183 # except Exception:
1243 # except Exception:
1184 # return finish(error.wrap_exception())
1244 # return finish(error.wrap_exception())
1185
1245
1186 # mapping of original IDs to resubmitted IDs
1246 # mapping of original IDs to resubmitted IDs
1187 resubmitted = {}
1247 resubmitted = {}
1188
1248
1189 # send the messages
1249 # send the messages
1190 for rec in records:
1250 for rec in records:
1191 header = rec['header']
1251 header = rec['header']
1192 msg = self.session.msg(header['msg_type'], parent=header)
1252 msg = self.session.msg(header['msg_type'], parent=header)
1193 msg_id = msg['msg_id']
1253 msg_id = msg['msg_id']
1194 msg['content'] = rec['content']
1254 msg['content'] = rec['content']
1195
1255
1196 # use the old header, but update msg_id and timestamp
1256 # use the old header, but update msg_id and timestamp
1197 fresh = msg['header']
1257 fresh = msg['header']
1198 header['msg_id'] = fresh['msg_id']
1258 header['msg_id'] = fresh['msg_id']
1199 header['date'] = fresh['date']
1259 header['date'] = fresh['date']
1200 msg['header'] = header
1260 msg['header'] = header
1201
1261
1202 self.session.send(self.resubmit, msg, buffers=rec['buffers'])
1262 self.session.send(self.resubmit, msg, buffers=rec['buffers'])
1203
1263
1204 resubmitted[rec['msg_id']] = msg_id
1264 resubmitted[rec['msg_id']] = msg_id
1205 self.pending.add(msg_id)
1265 self.pending.add(msg_id)
1206 msg['buffers'] = rec['buffers']
1266 msg['buffers'] = rec['buffers']
1207 try:
1267 try:
1208 self.db.add_record(msg_id, init_record(msg))
1268 self.db.add_record(msg_id, init_record(msg))
1209 except Exception:
1269 except Exception:
1210 self.log.error("db::DB Error updating record: %s", msg_id, exc_info=True)
1270 self.log.error("db::DB Error updating record: %s", msg_id, exc_info=True)
1211 return finish(error.wrap_exception())
1271 return finish(error.wrap_exception())
1212
1272
1213 finish(dict(status='ok', resubmitted=resubmitted))
1273 finish(dict(status='ok', resubmitted=resubmitted))
1214
1274
1215 # store the new IDs in the Task DB
1275 # store the new IDs in the Task DB
1216 for msg_id, resubmit_id in resubmitted.iteritems():
1276 for msg_id, resubmit_id in resubmitted.iteritems():
1217 try:
1277 try:
1218 self.db.update_record(msg_id, {'resubmitted' : resubmit_id})
1278 self.db.update_record(msg_id, {'resubmitted' : resubmit_id})
1219 except Exception:
1279 except Exception:
1220 self.log.error("db::DB Error updating record: %s", msg_id, exc_info=True)
1280 self.log.error("db::DB Error updating record: %s", msg_id, exc_info=True)
1221
1281
1222
1282
    def _extract_record(self, rec):
        """decompose a TaskRecord dict into subsection of reply for get_result"""
        io_dict = {}
        for key in ('pyin', 'pyout', 'pyerr', 'stdout', 'stderr'):
            io_dict[key] = rec[key]
        content = { 'result_content': rec['result_content'],
                    'header': rec['header'],
                    'result_header' : rec['result_header'],
                    'received' : rec['received'],
                    'io' : io_dict,
                  }
        if rec['result_buffers']:
            buffers = map(bytes, rec['result_buffers'])
        else:
            buffers = []

        return content, buffers

    def get_results(self, client_id, msg):
        """Get the result of 1 or more messages."""
        content = msg['content']
        msg_ids = sorted(set(content['msg_ids']))
        statusonly = content.get('status_only', False)
        pending = []
        completed = []
        content = dict(status='ok')
        content['pending'] = pending
        content['completed'] = completed
        buffers = []
        if not statusonly:
            try:
                matches = self.db.find_records(dict(msg_id={'$in':msg_ids}))
                # turn match list into dict, for faster lookup
                records = {}
                for rec in matches:
                    records[rec['msg_id']] = rec
            except Exception:
                content = error.wrap_exception()
                self.session.send(self.query, "result_reply", content=content,
                                    parent=msg, ident=client_id)
                return
        else:
            records = {}
        for msg_id in msg_ids:
            if msg_id in self.pending:
                pending.append(msg_id)
            elif msg_id in self.all_completed:
                completed.append(msg_id)
                if not statusonly:
                    c,bufs = self._extract_record(records[msg_id])
                    content[msg_id] = c
                    buffers.extend(bufs)
            elif msg_id in records:
                # check the record itself; `rec` here would be a stale
                # leftover from the loop above
                if records[msg_id]['completed']:
                    completed.append(msg_id)
                    c,bufs = self._extract_record(records[msg_id])
                    content[msg_id] = c
                    buffers.extend(bufs)
                else:
                    pending.append(msg_id)
            else:
                try:
                    raise KeyError('No such message: '+msg_id)
                except:
                    content = error.wrap_exception()
                break
        self.session.send(self.query, "result_reply", content=content,
                            parent=msg, ident=client_id,
                            buffers=buffers)

    def get_history(self, client_id, msg):
        """Get a list of all msg_ids in our DB records"""
        try:
            msg_ids = self.db.get_history()
        except Exception as e:
            content = error.wrap_exception()
        else:
            content = dict(status='ok', history=msg_ids)

        self.session.send(self.query, "history_reply", content=content,
                            parent=msg, ident=client_id)

    def db_query(self, client_id, msg):
        """Perform a raw query on the task record database."""
        content = msg['content']
        query = content.get('query', {})
        keys = content.get('keys', None)
        buffers = []
        empty = list()
        try:
            records = self.db.find_records(query, keys)
        except Exception as e:
            content = error.wrap_exception()
        else:
            # extract buffers from reply content:
            if keys is not None:
                buffer_lens = [] if 'buffers' in keys else None
                result_buffer_lens = [] if 'result_buffers' in keys else None
            else:
                buffer_lens = None
                result_buffer_lens = None

            for rec in records:
                # buffers may be None, so double check
                b = rec.pop('buffers', empty) or empty
                if buffer_lens is not None:
                    buffer_lens.append(len(b))
                    buffers.extend(b)
                rb = rec.pop('result_buffers', empty) or empty
                if result_buffer_lens is not None:
                    result_buffer_lens.append(len(rb))
                    buffers.extend(rb)
            content = dict(status='ok', records=records, buffer_lens=buffer_lens,
                            result_buffer_lens=result_buffer_lens)
        # self.log.debug (content)
        self.session.send(self.query, "db_reply", content=content,
                            parent=msg, ident=client_id,
                            buffers=buffers)

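    # For illustration (hypothetical message content, shaped by the code
    # above, not an official protocol sample): a db_query request might carry
    #
    #     content = {'query': {'completed': {'$ne': None}},
    #                'keys': ['msg_id', 'completed', 'engine_uuid']}
    #
    # and the reply flattens any requested 'buffers'/'result_buffers' into
    # the message's buffer list, with per-record lengths recorded in
    # buffer_lens / result_buffer_lens so the client can reassemble them.
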
@@ -1,768 +1,794 @@
1 """The Python scheduler for rich scheduling.
1 """The Python scheduler for rich scheduling.
2
2
3 The Pure ZMQ scheduler does not allow routing schemes other than LRU,
3 The Pure ZMQ scheduler does not allow routing schemes other than LRU,
4 nor does it check msg_id DAG dependencies. For those, a slightly slower
4 nor does it check msg_id DAG dependencies. For those, a slightly slower
5 Python Scheduler exists.
5 Python Scheduler exists.
6
6
7 Authors:
7 Authors:
8
8
9 * Min RK
9 * Min RK
10 """
10 """
11 #-----------------------------------------------------------------------------
11 #-----------------------------------------------------------------------------
12 # Copyright (C) 2010-2011 The IPython Development Team
12 # Copyright (C) 2010-2011 The IPython Development Team
13 #
13 #
14 # Distributed under the terms of the BSD License. The full license is in
14 # Distributed under the terms of the BSD License. The full license is in
15 # the file COPYING, distributed as part of this software.
15 # the file COPYING, distributed as part of this software.
16 #-----------------------------------------------------------------------------
16 #-----------------------------------------------------------------------------
17
17
18 #----------------------------------------------------------------------
18 #----------------------------------------------------------------------
19 # Imports
19 # Imports
20 #----------------------------------------------------------------------
20 #----------------------------------------------------------------------
21
21
22 from __future__ import print_function
22 from __future__ import print_function
23
23
24 import logging
24 import logging
25 import sys
25 import sys
26 import time
26 import time
27
27
28 from datetime import datetime, timedelta
28 from datetime import datetime, timedelta
29 from random import randint, random
29 from random import randint, random
30 from types import FunctionType
30 from types import FunctionType
31
31
32 try:
32 try:
33 import numpy
33 import numpy
34 except ImportError:
34 except ImportError:
35 numpy = None
35 numpy = None
36
36
37 import zmq
37 import zmq
38 from zmq.eventloop import ioloop, zmqstream
38 from zmq.eventloop import ioloop, zmqstream
39
39
40 # local imports
40 # local imports
41 from IPython.external.decorator import decorator
41 from IPython.external.decorator import decorator
42 from IPython.config.application import Application
42 from IPython.config.application import Application
43 from IPython.config.loader import Config
43 from IPython.config.loader import Config
44 from IPython.utils.traitlets import Instance, Dict, List, Set, Integer, Enum, CBytes
44 from IPython.utils.traitlets import Instance, Dict, List, Set, Integer, Enum, CBytes
45 from IPython.utils.py3compat import cast_bytes
45 from IPython.utils.py3compat import cast_bytes
46
46
47 from IPython.parallel import error, util
47 from IPython.parallel import error, util
48 from IPython.parallel.factory import SessionFactory
48 from IPython.parallel.factory import SessionFactory
49 from IPython.parallel.util import connect_logger, local_logger
49 from IPython.parallel.util import connect_logger, local_logger
50
50
51 from .dependency import Dependency
51 from .dependency import Dependency
52
52
53 @decorator
53 @decorator
54 def logged(f,self,*args,**kwargs):
54 def logged(f,self,*args,**kwargs):
55 # print ("#--------------------")
55 # print ("#--------------------")
56 self.log.debug("scheduler::%s(*%s,**%s)", f.func_name, args, kwargs)
56 self.log.debug("scheduler::%s(*%s,**%s)", f.func_name, args, kwargs)
57 # print ("#--")
57 # print ("#--")
58 return f(self,*args, **kwargs)
58 return f(self,*args, **kwargs)
59
59
#----------------------------------------------------------------------
# Chooser functions
#----------------------------------------------------------------------

def plainrandom(loads):
    """Plain random pick."""
    n = len(loads)
    return randint(0,n-1)

def lru(loads):
    """Always pick the front of the line.

    The content of `loads` is ignored.

    Assumes LRU ordering of loads, with oldest first.
    """
    return 0

def twobin(loads):
    """Pick two at random, use the LRU of the two.

    The content of loads is ignored.

    Assumes LRU ordering of loads, with oldest first.
    """
    n = len(loads)
    a = randint(0,n-1)
    b = randint(0,n-1)
    return min(a,b)

def weighted(loads):
    """Pick two at random using inverse load as weight.

    Return the less loaded of the two.
    """
    # weight 0 a million times more than 1:
    weights = 1./(1e-6+numpy.array(loads))
    sums = weights.cumsum()
    t = sums[-1]
    x = random()*t
    y = random()*t
    idx = 0
    idy = 0
    while sums[idx] < x:
        idx += 1
    while sums[idy] < y:
        idy += 1
    if weights[idy] > weights[idx]:
        return idy
    else:
        return idx

def leastload(loads):
    """Always choose the lowest load.

    If the lowest load occurs more than once, the first
    occurrence will be used. If loads has LRU ordering, this means
    the LRU of those with the lowest load is chosen.
    """
    return loads.index(min(loads))

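# An illustrative sketch of how these choosers behave (values made up for
# the example): each takes the current list of engine loads and returns the
# index of the chosen engine.
#
#     loads = [2, 0, 1]
#     leastload(loads)    # -> 1 (index of the lowest load)
#     lru(loads)          # -> 0 (front of the LRU-ordered line)
#     plainrandom(loads)  # -> any of 0, 1, 2
#
# `twobin` and `weighted` are variants of the "power of two choices"
# strategy: sample two candidates at random and keep the better one, which
# avoids scanning all loads while still balancing well.
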
#---------------------------------------------------------------------
# Classes
#---------------------------------------------------------------------


# store empty default dependency:
MET = Dependency([])


class Job(object):
    """Simple container for a job"""
    def __init__(self, msg_id, raw_msg, idents, msg, header, targets, after, follow, timeout):
        self.msg_id = msg_id
        self.raw_msg = raw_msg
        self.idents = idents
        self.msg = msg
        self.header = header
        self.targets = targets
        self.after = after
        self.follow = follow
        self.timeout = timeout

        self.timestamp = time.time()
        self.blacklist = set()

    @property
    def dependents(self):
        return self.follow.union(self.after)

class TaskScheduler(SessionFactory):
    """Python TaskScheduler object.

    This is the simplest object that supports msg_id based
    DAG dependencies. *Only* task msg_ids are checked, not
    msg_ids of jobs submitted via the MUX queue.

    """

    hwm = Integer(1, config=True,
        help="""specify the High Water Mark (HWM) for the downstream
        socket in the Task scheduler. This is the maximum number
        of allowed outstanding tasks on each engine.

        The default (1) means that only one task can be outstanding on each
        engine. Setting TaskScheduler.hwm=0 means there is no limit, and the
        engines continue to be assigned tasks while they are working,
        effectively hiding network latency behind computation, but can result
        in an imbalance of work when submitting many heterogeneous tasks all at
        once. Any positive value greater than one is a compromise between the
        two.

        """
    )
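    # For illustration (a sketch; the exact config file is assumed): since
    # hwm is a config=True traitlet, it can be set like any other trait,
    # e.g. in ipcontroller_config.py:
    #
    #     c.TaskScheduler.hwm = 0  # no limit: greedy assignment, hides latency
    #     c.TaskScheduler.hwm = 1  # default: fully load-balanced, 1 task/engine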
    scheme_name = Enum(('leastload', 'pure', 'lru', 'plainrandom', 'weighted', 'twobin'),
        'leastload', config=True, allow_none=False,
        help="""select the task scheduler scheme [default: leastload]
        Options are: 'pure', 'lru', 'plainrandom', 'weighted', 'twobin', 'leastload'"""
    )
    def _scheme_name_changed(self, old, new):
        self.log.debug("Using scheme %r"%new)
        self.scheme = globals()[new]

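    # For illustration (hypothetical invocation, consistent with the Enum
    # above): the scheme is chosen by name, and the changed-handler swaps in
    # the matching module-level chooser function, e.g.
    #
    #     ipcontroller --TaskScheduler.scheme_name=weighted
    #
    # 'pure' selects the pure-ZMQ device instead of this Python scheduler,
    # so it has no chooser function here.
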
    # input arguments:
    scheme = Instance(FunctionType) # function for determining the destination
    def _scheme_default(self):
        return leastload
    client_stream = Instance(zmqstream.ZMQStream) # client-facing stream
    engine_stream = Instance(zmqstream.ZMQStream) # engine-facing stream
    notifier_stream = Instance(zmqstream.ZMQStream) # hub-facing sub stream
    mon_stream = Instance(zmqstream.ZMQStream) # hub-facing pub stream
    query_stream = Instance(zmqstream.ZMQStream) # hub-facing DEALER stream

    # internals:
    graph = Dict() # dict by msg_id of [ msg_ids that depend on key ]
    retries = Dict() # dict by msg_id of retries remaining (non-neg ints)
    # waiting = List() # list of msg_ids ready to run, but haven't due to HWM
    depending = Dict() # dict by msg_id of Jobs
    pending = Dict() # dict by engine_uuid of submitted tasks
    completed = Dict() # dict by engine_uuid of completed tasks
    failed = Dict() # dict by engine_uuid of failed tasks
    destinations = Dict() # dict by msg_id of engine_uuids where jobs ran (reverse of completed+failed)
    clients = Dict() # dict by msg_id for who submitted the task
    targets = List() # list of target IDENTs
    loads = List() # list of engine loads
    # full = Set() # set of IDENTs that have HWM outstanding tasks
    all_completed = Set() # set of all completed tasks
    all_failed = Set() # set of all failed tasks
    all_done = Set() # set of all finished tasks=union(completed,failed)
    all_ids = Set() # set of all submitted task IDs

    auditor = Instance('zmq.eventloop.ioloop.PeriodicCallback')

    ident = CBytes() # ZMQ identity. This should just be self.session.session
                     # but ensure Bytes
    def _ident_default(self):
        return self.session.bsession

    def start(self):
        self.query_stream.on_recv(self.dispatch_query_reply)
        self.session.send(self.query_stream, "connection_request", {})

        self.engine_stream.on_recv(self.dispatch_result, copy=False)
        self.client_stream.on_recv(self.dispatch_submission, copy=False)

        self._notification_handlers = dict(
            registration_notification = self._register_engine,
            unregistration_notification = self._unregister_engine
        )
        self.notifier_stream.on_recv(self.dispatch_notification)
        self.auditor = ioloop.PeriodicCallback(self.audit_timeouts, 2e3, self.loop) # every 2s
        self.auditor.start()
        self.log.info("Scheduler started [%s]"%self.scheme_name)

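    # A note on the connection_request handshake above (a reading of the
    # code, not an official protocol description): on startup the scheduler
    # sends "connection_request" to the Hub over the DEALER query_stream and
    # registers dispatch_query_reply for the answer. The reply lists engines
    # that are already registered, so a scheduler attached to a running Hub
    # (e.g. after a controller restart) adopts the existing engines instead
    # of starting from an empty pool.
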
    def resume_receiving(self):
        """Resume accepting jobs."""
        self.client_stream.on_recv(self.dispatch_submission, copy=False)

    def stop_receiving(self):
        """Stop accepting jobs while there are no engines.
        Leave them in the ZMQ queue."""
        self.client_stream.on_recv(None)

    #-----------------------------------------------------------------------
    # [Un]Registration Handling
    #-----------------------------------------------------------------------


    def dispatch_query_reply(self, msg):
        """handle reply to our initial connection request"""
        try:
            idents,msg = self.session.feed_identities(msg)
        except ValueError:
            self.log.warn("task::Invalid Message: %r",msg)
            return
        try:
            msg = self.session.unserialize(msg)
        except ValueError:
            self.log.warn("task::Unauthorized message from: %r"%idents)
            return

        content = msg['content']
        for uuid in content.get('engines', {}).values():
            # cast_bytes is the helper imported above; a bare `asbytes`
            # is not defined in this module
            self._register_engine(cast_bytes(uuid))

    @util.log_errors
    def dispatch_notification(self, msg):
        """dispatch register/unregister events."""
        try:
            idents,msg = self.session.feed_identities(msg)
        except ValueError:
            self.log.warn("task::Invalid Message: %r",msg)
            return
        try:
            msg = self.session.unserialize(msg)
        except ValueError:
            self.log.warn("task::Unauthorized message from: %r"%idents)
            return

        msg_type = msg['header']['msg_type']

        handler = self._notification_handlers.get(msg_type, None)
        if handler is None:
            self.log.error("Unhandled message type: %r"%msg_type)
        else:
            try:
                handler(cast_bytes(msg['content']['uuid']))
            except Exception:
                self.log.error("task::Invalid notification msg: %r", msg, exc_info=True)

    def _register_engine(self, uid):
        """New engine with ident `uid` became available."""
        # head of the line:
        self.targets.insert(0,uid)
        self.loads.insert(0,0)

        # initialize sets
        self.completed[uid] = set()
        self.failed[uid] = set()
        self.pending[uid] = {}

        # rescan the graph:
        self.update_graph(None)

    def _unregister_engine(self, uid):
        """Existing engine with ident `uid` became unavailable."""
        if len(self.targets) == 1:
            # this was our only engine
            pass

        # handle any potentially finished tasks:
        self.engine_stream.flush()

        # don't pop destinations, because they might be used later
        # map(self.destinations.pop, self.completed.pop(uid))
        # map(self.destinations.pop, self.failed.pop(uid))

        # prevent this engine from receiving work
        idx = self.targets.index(uid)
        self.targets.pop(idx)
        self.loads.pop(idx)

        # wait 5 seconds before cleaning up pending jobs, since the results might
        # still be incoming
        if self.pending[uid]:
            dc = ioloop.DelayedCallback(lambda : self.handle_stranded_tasks(uid), 5000, self.loop)
            dc.start()
        else:
            self.completed.pop(uid)
            self.failed.pop(uid)


    def handle_stranded_tasks(self, engine):
        """Deal with jobs resident in an engine that died."""
        lost = self.pending[engine]
        for msg_id in lost.keys():
            if msg_id not in self.pending[engine]:
                # prevent double-handling of messages
                continue

            raw_msg = lost[msg_id].raw_msg
            idents,msg = self.session.feed_identities(raw_msg, copy=False)
            parent = self.session.unpack(msg[1].bytes)
            idents = [engine, idents[0]]

            # build fake error reply
            try:
                raise error.EngineError("Engine %r died while running task %r"%(engine, msg_id))
            except:
                content = error.wrap_exception()
            # build fake header
            header = dict(
                status='error',
                engine=engine,
                date=datetime.now(),
            )
            msg = self.session.msg('apply_reply', content, parent=parent, subheader=header)
            raw_reply = map(zmq.Message, self.session.serialize(msg, ident=idents))
            # and dispatch it
            self.dispatch_result(raw_reply)

        # finally scrub completed/failed lists
        self.completed.pop(engine)
        self.failed.pop(engine)

    #-----------------------------------------------------------------------
    # Job Submission
    #-----------------------------------------------------------------------


    @util.log_errors
    def dispatch_submission(self, raw_msg):
        """Dispatch job submission to appropriate handlers."""
        # ensure targets up to date:
        self.notifier_stream.flush()
        try:
            idents, msg = self.session.feed_identities(raw_msg, copy=False)
            msg = self.session.unserialize(msg, content=False, copy=False)
        except Exception:
            self.log.error("task::Invalid task msg: %r"%raw_msg, exc_info=True)
            return

        # send to monitor
        self.mon_stream.send_multipart([b'intask']+raw_msg, copy=False)

        header = msg['header']
        msg_id = header['msg_id']
        self.all_ids.add(msg_id)

        # get targets as a set of bytes objects
        # from a list of unicode objects
        targets = header.get('targets', [])
        targets = map(cast_bytes, targets)
        targets = set(targets)

        retries = header.get('retries', 0)
        self.retries[msg_id] = retries

        # time dependencies
        after = header.get('after', None)
        if after:
            after = Dependency(after)
            if after.all:
                if after.success:
                    after = Dependency(after.difference(self.all_completed),
                                success=after.success,
                                failure=after.failure,
                                all=after.all,
                    )
                if after.failure:
                    after = Dependency(after.difference(self.all_failed),
                                success=after.success,
                                failure=after.failure,
                                all=after.all,
                    )
            if after.check(self.all_completed, self.all_failed):
                # recast as empty set, if `after` already met,
                # to prevent unnecessary set comparisons
                after = MET
        else:
            after = MET

        # location dependencies
        follow = Dependency(header.get('follow', []))

        # turn timeouts into datetime objects:
        timeout = header.get('timeout', None)
        if timeout:
            # cast to float, because jsonlib returns floats as decimal.Decimal,
            # which timedelta does not accept
            timeout = datetime.now() + timedelta(0,float(timeout),0)

        job = Job(msg_id=msg_id, raw_msg=raw_msg, idents=idents, msg=msg,
                header=header, targets=targets, after=after, follow=follow,
                timeout=timeout,
        )

        # validate and reduce dependencies:
        for dep in after,follow:
            if not dep: # empty dependency
                continue
            # check valid:
            if msg_id in dep or dep.difference(self.all_ids):
                self.depending[msg_id] = job
                return self.fail_unreachable(msg_id, error.InvalidDependency)
            # check if unreachable:
            if dep.unreachable(self.all_completed, self.all_failed):
                self.depending[msg_id] = job
                return self.fail_unreachable(msg_id)

        if after.check(self.all_completed, self.all_failed):
            # time deps already met, try to run
            if not self.maybe_run(job):
                # can't run yet
                if msg_id not in self.all_failed:
                    # could have failed as unreachable
                    self.save_unmet(job)
        else:
            self.save_unmet(job)

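    # For illustration (client-side sketch; the client API is assumed here,
    # not shown in this file): `after` expresses time dependencies ("run only
    # once these msg_ids have finished") and `follow` expresses location
    # dependencies ("run on the engine where these msg_ids ran"), e.g.
    #
    #     ar = view.apply_async(load_data)
    #     with view.temp_flags(follow=ar):
    #         view.apply_async(process_data)  # lands on the same engine
    #
    # Both arrive in the task header and are wrapped in Dependency above.
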
    def audit_timeouts(self):
        """Audit all waiting tasks for expired timeouts."""
        now = datetime.now()
        for msg_id in self.depending.keys():
            # must recheck, in case one failure cascaded to another:
            if msg_id in self.depending:
                job = self.depending[msg_id]
                if job.timeout and job.timeout < now:
                    self.fail_unreachable(msg_id, error.TaskTimeout)

    def fail_unreachable(self, msg_id, why=error.ImpossibleDependency):
        """a task has become unreachable, send a reply with an ImpossibleDependency
        error."""
        if msg_id not in self.depending:
            self.log.error("msg %r already failed!", msg_id)
            return
        job = self.depending.pop(msg_id)
        for mid in job.dependents:
            if mid in self.graph:
                self.graph[mid].remove(msg_id)

        try:
            raise why()
        except:
            content = error.wrap_exception()

        self.all_done.add(msg_id)
        self.all_failed.add(msg_id)

        msg = self.session.send(self.client_stream, 'apply_reply', content,
                                parent=job.header, ident=job.idents)
        self.session.send(self.mon_stream, msg, ident=[b'outtask']+job.idents)

        self.update_graph(msg_id, success=False)

    def maybe_run(self, job):
        """check location dependencies, and run if they are met."""
        msg_id = job.msg_id
        self.log.debug("Attempting to assign task %s", msg_id)
        if not self.targets:
            # no engines, definitely can't run
            return False

        if job.follow or job.targets or job.blacklist or self.hwm:
            # we need a can_run filter
            def can_run(idx):
                # check hwm
                if self.hwm and self.loads[idx] == self.hwm:
                    return False
                target = self.targets[idx]
                # check blacklist
                if target in job.blacklist:
                    return False
                # check targets
                if job.targets and target not in job.targets:
                    return False
                # check follow
                return job.follow.check(self.completed[target], self.failed[target])

            indices = filter(can_run, range(len(self.targets)))

            if not indices:
                # couldn't run
                if job.follow.all:
                    # check follow for impossibility
                    dests = set()
                    relevant = set()
                    if job.follow.success:
                        relevant = self.all_completed
                    if job.follow.failure:
                        relevant = relevant.union(self.all_failed)
                    for m in job.follow.intersection(relevant):
                        dests.add(self.destinations[m])
                    if len(dests) > 1:
                        self.depending[msg_id] = job
                        self.fail_unreachable(msg_id)
                        return False
                if job.targets:
                    # check blacklist+targets for impossibility
                    job.targets.difference_update(job.blacklist)
                    if not job.targets or not job.targets.intersection(self.targets):
                        self.depending[msg_id] = job
                        self.fail_unreachable(msg_id)
                        return False
                return False
        else:
            indices = None

        self.submit_task(job, indices)
        return True

    def save_unmet(self, job):
        """Save a message for later submission when its dependencies are met."""
        msg_id = job.msg_id
        self.depending[msg_id] = job
        # track the ids in follow or after, but not those already finished
        for dep_id in job.after.union(job.follow).difference(self.all_done):
            if dep_id not in self.graph:
                self.graph[dep_id] = set()
            self.graph[dep_id].add(msg_id)

    def submit_task(self, job, indices=None):
        """Submit a task to any of a subset of our targets."""
        if indices:
            loads = [self.loads[i] for i in indices]
        else:
            loads = self.loads
        idx = self.scheme(loads)
        if indices:
            idx = indices[idx]
        target = self.targets[idx]
        # print (target, map(str, msg[:3]))
        # send job to the engine
        self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False)
        self.engine_stream.send_multipart(job.raw_msg, copy=False)
        # update load
        self.add_job(idx)
        self.pending[target][job.msg_id] = job
        # notify Hub
        content = dict(msg_id=job.msg_id, engine_id=target.decode('ascii'))
        self.session.send(self.mon_stream, 'task_destination', content=content,
                        ident=[b'tracktask',self.ident])


    #-----------------------------------------------------------------------
    # Result Handling
    #-----------------------------------------------------------------------


    @util.log_errors
    def dispatch_result(self, raw_msg):
        """dispatch method for result replies"""
        try:
            idents,msg = self.session.feed_identities(raw_msg, copy=False)
            msg = self.session.unserialize(msg, content=False, copy=False)
            engine = idents[0]
            try:
                idx = self.targets.index(engine)
            except ValueError:
                pass # skip load-update for dead engines
            else:
                self.finish_job(idx)
        except Exception:
            self.log.error("task::Invalid result: %r", raw_msg, exc_info=True)
            return

        header = msg['header']
        parent = msg['parent_header']
        if header.get('dependencies_met', True):
            success = (header['status'] == 'ok')
            msg_id = parent['msg_id']
            retries = self.retries[msg_id]
            if not success and retries > 0:
                # failed
                self.retries[msg_id] = retries - 1
                self.handle_unmet_dependency(idents, parent)
            else:
                del self.retries[msg_id]
                # relay to client and update graph
                self.handle_result(idents, parent, raw_msg, success)
                # send to Hub monitor
                self.mon_stream.send_multipart([b'outtask']+raw_msg, copy=False)
        else:
            self.handle_unmet_dependency(idents, parent)

    def handle_result(self, idents, parent, raw_msg, success=True):
        """handle a real task result, either success or failure"""
        # first, relay result to client
        engine = idents[0]
        client = idents[1]
        # swap_ids for ROUTER-ROUTER mirror
        raw_msg[:2] = [client,engine]
        # print (map(str, raw_msg[:4]))
        self.client_stream.send_multipart(raw_msg, copy=False)
        # now, update our data structures
        msg_id = parent['msg_id']
        self.pending[engine].pop(msg_id)
        if success:
            self.completed[engine].add(msg_id)
            self.all_completed.add(msg_id)
        else:
            self.failed[engine].add(msg_id)
            self.all_failed.add(msg_id)
        self.all_done.add(msg_id)
        self.destinations[msg_id] = engine

        self.update_graph(msg_id, success)

    def handle_unmet_dependency(self, idents, parent):
        """handle an unmet dependency"""
        engine = idents[0]
        msg_id = parent['msg_id']

        job = self.pending[engine].pop(msg_id)
        job.blacklist.add(engine)

        if job.blacklist == job.targets:
            self.depending[msg_id] = job
            self.fail_unreachable(msg_id)
        elif not self.maybe_run(job):
            # resubmit failed
            if msg_id not in self.all_failed:
                # put it back in our dependency tree
                self.save_unmet(job)

        if self.hwm:
            try:
                idx = self.targets.index(engine)
            except ValueError:
                pass # skip load-update for dead engines
            else:
                if self.loads[idx] == self.hwm-1:
                    self.update_graph(None)


    def update_graph(self, dep_id=None, success=True):
        """dep_id just finished. Update our dependency
        graph and submit any jobs that just became runnable.

        Called with dep_id=None to update the entire graph for hwm, but
        without finishing a task.
        """
        # print ("\n\n***********")
        # pprint (dep_id)
        # pprint (self.graph)
        # pprint (self.depending)
        # pprint (self.all_completed)
        # pprint (self.all_failed)
        # print ("\n\n***********\n\n")
        # update any jobs that depended on the dependency
        jobs = self.graph.pop(dep_id, [])

        # recheck *all* jobs if
        # a) we have HWM and an engine just became no longer full
        # or b) dep_id was given as None
        if dep_id is None or self.hwm and any( [ load==self.hwm-1 for load in self.loads ]):
            jobs = self.depending.keys()

        for msg_id in sorted(jobs, key=lambda msg_id: self.depending[msg_id].timestamp):
            job = self.depending[msg_id]

            if job.after.unreachable(self.all_completed, self.all_failed)\
                    or job.follow.unreachable(self.all_completed, self.all_failed):
                self.fail_unreachable(msg_id)

            elif job.after.check(self.all_completed, self.all_failed): # time deps met, maybe run
                if self.maybe_run(job):

                    self.depending.pop(msg_id)
                    for mid in job.dependents:
                        if mid in self.graph:
                            self.graph[mid].remove(msg_id)

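    # A concrete shape for `self.graph` (illustrative values): it maps a
    # dependency msg_id to the set of tasks waiting on it, so finishing
    # 'msg_a' pops its entry and rechecks only the waiters, e.g.
    #
    #     graph = {'msg_a': set(['msg_b', 'msg_c'])}
    #
    # means msg_b and msg_c each list msg_a in their `after` or `follow` set.
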
    #----------------------------------------------------------------------
    # methods to be overridden by subclasses
    #----------------------------------------------------------------------

    def add_job(self, idx):
        """Called after self.targets[idx] just got the job with header.
        Override with subclasses. The default ordering is simple LRU.
        The default loads are the number of outstanding jobs."""
        self.loads[idx] += 1
        for lis in (self.targets, self.loads):
            lis.append(lis.pop(idx))


    def finish_job(self, idx):
        """Called after self.targets[idx] just finished a job.
        Override with subclasses."""
        self.loads[idx] -= 1


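    # A small worked example of the LRU rotation in add_job (illustrative
    # values): popping the chosen index and appending it moves that engine to
    # the back of the line, so index 0 is always the least-recently-used one:
    #
    #     targets = ['a', 'b', 'c']; loads = [0, 0, 0]
    #     # assign a task to index 1 ('b'):
    #     #   loads[1] += 1  ->  loads == [0, 1, 0]
    #     #   rotate         ->  targets == ['a', 'c', 'b'], loads == [0, 0, 1]

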
def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, reg_addr, config=None,
                        logname='root', log_url=None, loglevel=logging.DEBUG,
                        identity=b'task', in_thread=False):

    ZMQStream = zmqstream.ZMQStream

    if config:
        # unwrap dict back into Config
        config = Config(config)

    if in_thread:
        # use instance() to get the same Context/Loop as our parent
        ctx = zmq.Context.instance()
        loop = ioloop.IOLoop.instance()
    else:
        # in a process, don't use instance()
        # for safety with multiprocessing
        ctx = zmq.Context()
        loop = ioloop.IOLoop()
    ins = ZMQStream(ctx.socket(zmq.ROUTER),loop)
    # use bytes suffixes, since identity is bytes
    ins.setsockopt(zmq.IDENTITY, identity + b'_in')
    ins.bind(in_addr)

    outs = ZMQStream(ctx.socket(zmq.ROUTER),loop)
    outs.setsockopt(zmq.IDENTITY, identity + b'_out')
    outs.bind(out_addr)
    mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB),loop)
    mons.connect(mon_addr)
    nots = zmqstream.ZMQStream(ctx.socket(zmq.SUB),loop)
    nots.setsockopt(zmq.SUBSCRIBE, b'')
    nots.connect(not_addr)

    querys = ZMQStream(ctx.socket(zmq.DEALER),loop)
    querys.connect(reg_addr)

    # setup logging.
    if in_thread:
        log = Application.instance().log
    else:
        if log_url:
            log = connect_logger(logname, ctx, log_url, root="scheduler", loglevel=loglevel)
        else:
            log = local_logger(logname, loglevel)

    scheduler = TaskScheduler(client_stream=ins, engine_stream=outs,
                            mon_stream=mons, notifier_stream=nots,
                            query_stream=querys,
                            loop=loop, log=log,
                            config=config)
    scheduler.start()
    if not in_thread:
        try:
            loop.start()
        except KeyboardInterrupt:
            scheduler.log.critical("Interrupted, exiting...")

@@ -1,231 +1,231 b''
1 """A simple engine that talks to a controller over 0MQ.
1 """A simple engine that talks to a controller over 0MQ.
2 It handles registration, etc., and launches a kernel
2 It handles registration, etc., and launches a kernel
3 connected to the Controller's Schedulers.
3 connected to the Controller's Schedulers.
4
4
5 Authors:
5 Authors:
6
6
7 * Min RK
7 * Min RK
8 """
8 """
9 #-----------------------------------------------------------------------------
9 #-----------------------------------------------------------------------------
10 # Copyright (C) 2010-2011 The IPython Development Team
10 # Copyright (C) 2010-2011 The IPython Development Team
11 #
11 #
12 # Distributed under the terms of the BSD License. The full license is in
12 # Distributed under the terms of the BSD License. The full license is in
13 # the file COPYING, distributed as part of this software.
13 # the file COPYING, distributed as part of this software.
14 #-----------------------------------------------------------------------------
14 #-----------------------------------------------------------------------------
15
15
16 from __future__ import print_function
16 from __future__ import print_function
17
17
18 import sys
18 import sys
19 import time
19 import time
20 from getpass import getpass
20 from getpass import getpass
21
21
22 import zmq
22 import zmq
23 from zmq.eventloop import ioloop, zmqstream
23 from zmq.eventloop import ioloop, zmqstream
24
24
25 from IPython.external.ssh import tunnel
25 from IPython.external.ssh import tunnel
26 # internal
26 # internal
27 from IPython.utils.traitlets import (
27 from IPython.utils.traitlets import (
28 Instance, Dict, Integer, Type, CFloat, Unicode, CBytes, Bool
28 Instance, Dict, Integer, Type, CFloat, Unicode, CBytes, Bool
29 )
29 )
30 from IPython.utils.py3compat import cast_bytes
30 from IPython.utils.py3compat import cast_bytes
31
31
32 from IPython.parallel.controller.heartmonitor import Heart
32 from IPython.parallel.controller.heartmonitor import Heart
33 from IPython.parallel.factory import RegistrationFactory
33 from IPython.parallel.factory import RegistrationFactory
34 from IPython.parallel.util import disambiguate_url
34 from IPython.parallel.util import disambiguate_url
35
35
36 from IPython.zmq.session import Message
36 from IPython.zmq.session import Message
37 from IPython.zmq.ipkernel import Kernel
37 from IPython.zmq.ipkernel import Kernel
38
38
39 class EngineFactory(RegistrationFactory):
39 class EngineFactory(RegistrationFactory):
40 """IPython engine"""
40 """IPython engine"""
41
41
42 # configurables:
42 # configurables:
43 out_stream_factory=Type('IPython.zmq.iostream.OutStream', config=True,
43 out_stream_factory=Type('IPython.zmq.iostream.OutStream', config=True,
44 help="""The OutStream for handling stdout/err.
44 help="""The OutStream for handling stdout/err.
45 Typically 'IPython.zmq.iostream.OutStream'""")
45 Typically 'IPython.zmq.iostream.OutStream'""")
46 display_hook_factory=Type('IPython.zmq.displayhook.ZMQDisplayHook', config=True,
46 display_hook_factory=Type('IPython.zmq.displayhook.ZMQDisplayHook', config=True,
47 help="""The class for handling displayhook.
47 help="""The class for handling displayhook.
48 Typically 'IPython.zmq.displayhook.ZMQDisplayHook'""")
48 Typically 'IPython.zmq.displayhook.ZMQDisplayHook'""")
49 location=Unicode(config=True,
49 location=Unicode(config=True,
50 help="""The location (an IP address) of the controller. This is
50 help="""The location (an IP address) of the controller. This is
51 used for disambiguating URLs, to determine whether
51 used for disambiguating URLs, to determine whether
52 loopback should be used to connect or the public address.""")
52 loopback should be used to connect or the public address.""")
53 timeout=CFloat(5, config=True,
53 timeout=CFloat(5, config=True,
54 help="""The time (in seconds) to wait for the Controller to respond
54 help="""The time (in seconds) to wait for the Controller to respond
55 to registration requests before giving up.""")
55 to registration requests before giving up.""")
56 sshserver=Unicode(config=True,
56 sshserver=Unicode(config=True,
57 help="""The SSH server to use for tunneling connections to the Controller.""")
57 help="""The SSH server to use for tunneling connections to the Controller.""")
58 sshkey=Unicode(config=True,
58 sshkey=Unicode(config=True,
59 help="""The SSH private key file to use when tunneling connections to the Controller.""")
59 help="""The SSH private key file to use when tunneling connections to the Controller.""")
60 paramiko=Bool(sys.platform == 'win32', config=True,
60 paramiko=Bool(sys.platform == 'win32', config=True,
61 help="""Whether to use paramiko instead of openssh for tunnels.""")
61 help="""Whether to use paramiko instead of openssh for tunnels.""")
62
62
63 # not configurable:
63 # not configurable:
64 connection_info = Dict()
64 connection_info = Dict()
65 user_ns = Dict()
65 user_ns = Dict()
66 id = Integer(allow_none=True)
66 id = Integer(allow_none=True)
67 registrar = Instance('zmq.eventloop.zmqstream.ZMQStream')
67 registrar = Instance('zmq.eventloop.zmqstream.ZMQStream')
68 kernel = Instance(Kernel)
68 kernel = Instance(Kernel)
69
69
70 bident = CBytes()
70 bident = CBytes()
71 ident = Unicode()
71 ident = Unicode()
72 def _ident_changed(self, name, old, new):
72 def _ident_changed(self, name, old, new):
73 self.bident = cast_bytes(new)
73 self.bident = cast_bytes(new)
74 using_ssh=Bool(False)
74 using_ssh=Bool(False)
75
75
76
76
77 def __init__(self, **kwargs):
77 def __init__(self, **kwargs):
78 super(EngineFactory, self).__init__(**kwargs)
78 super(EngineFactory, self).__init__(**kwargs)
79 self.ident = self.session.session
79 self.ident = self.session.session
80
80
81 def init_connector(self):
81 def init_connector(self):
82 """construct connection function, which handles tunnels."""
82 """construct connection function, which handles tunnels."""
83 self.using_ssh = bool(self.sshkey or self.sshserver)
83 self.using_ssh = bool(self.sshkey or self.sshserver)
84
84
85 if self.sshkey and not self.sshserver:
85 if self.sshkey and not self.sshserver:
86 # We are using ssh directly to the controller, tunneling localhost to localhost
86 # We are using ssh directly to the controller, tunneling localhost to localhost
87 self.sshserver = self.url.split('://')[1].split(':')[0]
87 self.sshserver = self.url.split('://')[1].split(':')[0]
88
88
89 if self.using_ssh:
89 if self.using_ssh:
90 if tunnel.try_passwordless_ssh(self.sshserver, self.sshkey, self.paramiko):
90 if tunnel.try_passwordless_ssh(self.sshserver, self.sshkey, self.paramiko):
91 password=False
91 password=False
92 else:
92 else:
93 password = getpass("SSH Password for %s: "%self.sshserver)
93 password = getpass("SSH Password for %s: "%self.sshserver)
94 else:
94 else:
95 password = False
95 password = False
96
96
97 def connect(s, url):
97 def connect(s, url):
98 url = disambiguate_url(url, self.location)
98 url = disambiguate_url(url, self.location)
99 if self.using_ssh:
99 if self.using_ssh:
100 self.log.debug("Tunneling connection to %s via %s", url, self.sshserver)
100 self.log.debug("Tunneling connection to %s via %s", url, self.sshserver)
101 return tunnel.tunnel_connection(s, url, self.sshserver,
101 return tunnel.tunnel_connection(s, url, self.sshserver,
102 keyfile=self.sshkey, paramiko=self.paramiko,
102 keyfile=self.sshkey, paramiko=self.paramiko,
103 password=password,
103 password=password,
104 )
104 )
105 else:
105 else:
106 return s.connect(url)
106 return s.connect(url)
107
107
108 def maybe_tunnel(url):
108 def maybe_tunnel(url):
109 """like connect, but don't complete the connection (for use by heartbeat)"""
109 """like connect, but don't complete the connection (for use by heartbeat)"""
110 url = disambiguate_url(url, self.location)
110 url = disambiguate_url(url, self.location)
111 if self.using_ssh:
111 if self.using_ssh:
112 self.log.debug("Tunneling connection to %s via %s", url, self.sshserver)
112 self.log.debug("Tunneling connection to %s via %s", url, self.sshserver)
113 url,tunnelobj = tunnel.open_tunnel(url, self.sshserver,
113 url,tunnelobj = tunnel.open_tunnel(url, self.sshserver,
114 keyfile=self.sshkey, paramiko=self.paramiko,
114 keyfile=self.sshkey, paramiko=self.paramiko,
115 password=password,
115 password=password,
116 )
116 )
117 return str(url)
117 return str(url)
118 return connect, maybe_tunnel
118 return connect, maybe_tunnel
119
119
120 def register(self):
120 def register(self):
121 """send the registration_request"""
121 """send the registration_request"""
122
122
123 self.log.info("Registering with controller at %s"%self.url)
123 self.log.info("Registering with controller at %s"%self.url)
124 ctx = self.context
124 ctx = self.context
125 connect,maybe_tunnel = self.init_connector()
125 connect,maybe_tunnel = self.init_connector()
126 reg = ctx.socket(zmq.DEALER)
126 reg = ctx.socket(zmq.DEALER)
127 reg.setsockopt(zmq.IDENTITY, self.bident)
127 reg.setsockopt(zmq.IDENTITY, self.bident)
128 connect(reg, self.url)
128 connect(reg, self.url)
129 self.registrar = zmqstream.ZMQStream(reg, self.loop)
129 self.registrar = zmqstream.ZMQStream(reg, self.loop)
130
130
131
131
132 content = dict(queue=self.ident, heartbeat=self.ident, control=self.ident)
132 content = dict(uuid=self.ident)
133 self.registrar.on_recv(lambda msg: self.complete_registration(msg, connect, maybe_tunnel))
133 self.registrar.on_recv(lambda msg: self.complete_registration(msg, connect, maybe_tunnel))
134 # print (self.session.key)
134 # print (self.session.key)
135 self.session.send(self.registrar, "registration_request", content=content)
135 self.session.send(self.registrar, "registration_request", content=content)
136
136
137 def complete_registration(self, msg, connect, maybe_tunnel):
137 def complete_registration(self, msg, connect, maybe_tunnel):
138 # print msg
138 # print msg
139 self._abort_dc.stop()
139 self._abort_dc.stop()
140 ctx = self.context
140 ctx = self.context
141 loop = self.loop
141 loop = self.loop
142 identity = self.bident
142 identity = self.bident
143 idents,msg = self.session.feed_identities(msg)
143 idents,msg = self.session.feed_identities(msg)
144 msg = self.session.unserialize(msg)
144 msg = self.session.unserialize(msg)
145 content = msg['content']
145 content = msg['content']
146 info = self.connection_info
146 info = self.connection_info
147
147
148 def url(key):
148 def url(key):
149 """get zmq url for given channel"""
149 """get zmq url for given channel"""
150 return str(info["interface"] + ":%i" % info[key])
150 return str(info["interface"] + ":%i" % info[key])
151
151
152 if content['status'] == 'ok':
152 if content['status'] == 'ok':
153 self.id = int(content['id'])
153 self.id = int(content['id'])
154
154
155 # launch heartbeat
155 # launch heartbeat
156 # possibly forward hb ports with tunnels
156 # possibly forward hb ports with tunnels
157 hb_ping = maybe_tunnel(url('hb_ping'))
157 hb_ping = maybe_tunnel(url('hb_ping'))
158 hb_pong = maybe_tunnel(url('hb_pong'))
158 hb_pong = maybe_tunnel(url('hb_pong'))
159
159
160 heart = Heart(hb_ping, hb_pong, heart_id=identity)
160 heart = Heart(hb_ping, hb_pong, heart_id=identity)
161 heart.start()
161 heart.start()
162
162
163 # create Shell Connections (MUX, Task, etc.):
163 # create Shell Connections (MUX, Task, etc.):
164 shell_addrs = url('mux'), url('task')
164 shell_addrs = url('mux'), url('task')
165
165
166 # Use only one shell stream for mux and tasks
166 # Use only one shell stream for mux and tasks
167 stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop)
167 stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop)
168 stream.setsockopt(zmq.IDENTITY, identity)
168 stream.setsockopt(zmq.IDENTITY, identity)
169 shell_streams = [stream]
169 shell_streams = [stream]
170 for addr in shell_addrs:
170 for addr in shell_addrs:
171 connect(stream, addr)
171 connect(stream, addr)
172
172
173 # control stream:
173 # control stream:
174 control_addr = url('control')
174 control_addr = url('control')
175 control_stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop)
175 control_stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop)
176 control_stream.setsockopt(zmq.IDENTITY, identity)
176 control_stream.setsockopt(zmq.IDENTITY, identity)
177 connect(control_stream, control_addr)
177 connect(control_stream, control_addr)
178
178
179 # create iopub stream:
179 # create iopub stream:
180 iopub_addr = url('iopub')
180 iopub_addr = url('iopub')
181 iopub_socket = ctx.socket(zmq.PUB)
181 iopub_socket = ctx.socket(zmq.PUB)
182 iopub_socket.setsockopt(zmq.IDENTITY, identity)
182 iopub_socket.setsockopt(zmq.IDENTITY, identity)
183 connect(iopub_socket, iopub_addr)
183 connect(iopub_socket, iopub_addr)
184
184
185 # disable history:
185 # disable history:
186 self.config.HistoryManager.hist_file = ':memory:'
186 self.config.HistoryManager.hist_file = ':memory:'
187
187
188 # Redirect input streams and set a display hook.
188 # Redirect input streams and set a display hook.
189 if self.out_stream_factory:
189 if self.out_stream_factory:
190 sys.stdout = self.out_stream_factory(self.session, iopub_socket, u'stdout')
190 sys.stdout = self.out_stream_factory(self.session, iopub_socket, u'stdout')
191 sys.stdout.topic = cast_bytes('engine.%i.stdout' % self.id)
191 sys.stdout.topic = cast_bytes('engine.%i.stdout' % self.id)
192 sys.stderr = self.out_stream_factory(self.session, iopub_socket, u'stderr')
192 sys.stderr = self.out_stream_factory(self.session, iopub_socket, u'stderr')
193 sys.stderr.topic = cast_bytes('engine.%i.stderr' % self.id)
193 sys.stderr.topic = cast_bytes('engine.%i.stderr' % self.id)
194 if self.display_hook_factory:
194 if self.display_hook_factory:
195 sys.displayhook = self.display_hook_factory(self.session, iopub_socket)
195 sys.displayhook = self.display_hook_factory(self.session, iopub_socket)
196 sys.displayhook.topic = cast_bytes('engine.%i.pyout' % self.id)
196 sys.displayhook.topic = cast_bytes('engine.%i.pyout' % self.id)
197
197
198 self.kernel = Kernel(config=self.config, int_id=self.id, ident=self.ident, session=self.session,
198 self.kernel = Kernel(config=self.config, int_id=self.id, ident=self.ident, session=self.session,
199 control_stream=control_stream, shell_streams=shell_streams, iopub_socket=iopub_socket,
199 control_stream=control_stream, shell_streams=shell_streams, iopub_socket=iopub_socket,
200 loop=loop, user_ns=self.user_ns, log=self.log)
200 loop=loop, user_ns=self.user_ns, log=self.log)
201 self.kernel.shell.display_pub.topic = cast_bytes('engine.%i.displaypub' % self.id)
201 self.kernel.shell.display_pub.topic = cast_bytes('engine.%i.displaypub' % self.id)
202 self.kernel.start()
202 self.kernel.start()
203
203
204
204
205 else:
205 else:
206 self.log.fatal("Registration Failed: %s"%msg)
206 self.log.fatal("Registration Failed: %s"%msg)
207 raise Exception("Registration Failed: %s"%msg)
207 raise Exception("Registration Failed: %s"%msg)
208
208
209 self.log.info("Completed registration with id %i"%self.id)
209 self.log.info("Completed registration with id %i"%self.id)
210
210
211
211
212 def abort(self):
212 def abort(self):
213 self.log.fatal("Registration timed out after %.1f seconds"%self.timeout)
213 self.log.fatal("Registration timed out after %.1f seconds"%self.timeout)
214 if self.url.startswith('127.'):
214 if self.url.startswith('127.'):
215 self.log.fatal("""
215 self.log.fatal("""
216 If the controller and engines are not on the same machine,
216 If the controller and engines are not on the same machine,
217 you will have to instruct the controller to listen on an external IP (in ipcontroller_config.py):
217 you will have to instruct the controller to listen on an external IP (in ipcontroller_config.py):
218 c.HubFactory.ip='*' # for all interfaces, internal and external
218 c.HubFactory.ip='*' # for all interfaces, internal and external
219 c.HubFactory.ip='192.168.1.101' # or any interface that the engines can see
219 c.HubFactory.ip='192.168.1.101' # or any interface that the engines can see
220 or tunnel connections via ssh.
220 or tunnel connections via ssh.
221 """)
221 """)
222 self.session.send(self.registrar, "unregistration_request", content=dict(id=self.id))
222 self.session.send(self.registrar, "unregistration_request", content=dict(id=self.id))
223 time.sleep(1)
223 time.sleep(1)
224 sys.exit(255)
224 sys.exit(255)
225
225
226 def start(self):
226 def start(self):
227 dc = ioloop.DelayedCallback(self.register, 0, self.loop)
227 dc = ioloop.DelayedCallback(self.register, 0, self.loop)
228 dc.start()
228 dc.start()
229 self._abort_dc = ioloop.DelayedCallback(self.abort, self.timeout*1000, self.loop)
229 self._abort_dc = ioloop.DelayedCallback(self.abort, self.timeout*1000, self.loop)
230 self._abort_dc.start()
230 self._abort_dc.start()
231
231
@@ -1,378 +1,368 b''
1 .. _parallel_messages:
1 .. _parallel_messages:
2
2
3 Messaging for Parallel Computing
3 Messaging for Parallel Computing
4 ================================
4 ================================
5
5
6 This is an extension of the :ref:`messaging <messaging>` doc. Diagrams of the connections
6 This is an extension of the :ref:`messaging <messaging>` doc. Diagrams of the connections
7 can be found in the :ref:`parallel connections <parallel_connections>` doc.
7 can be found in the :ref:`parallel connections <parallel_connections>` doc.
8
8
9
9
10 ZMQ messaging is also used in the parallel computing IPython system. All messages to/from
10 ZMQ messaging is also used in the parallel computing IPython system. All messages to/from
11 kernels remain the same as the single kernel model, and are forwarded through a ZMQ Queue
11 kernels remain the same as in the single-kernel model, and are forwarded through a ZMQ Queue
11 kernels remain the same as in the single-kernel model, and are forwarded through a ZMQ Queue
12 device. The controller receives all messages and replies in these channels, and saves
13 results for future use.
13 results for future use.
14
14
15 The Controller
15 The Controller
16 --------------
16 --------------
17
17
18 The controller is the central collection of processes in the IPython parallel computing
18 The controller is the central collection of processes in the IPython parallel computing
19 model. It has two major components:
19 model. It has two major components:
20
20
21 * The Hub
21 * The Hub
22 * A collection of Schedulers
22 * A collection of Schedulers
23
23
24 The Hub
24 The Hub
25 -------
25 -------
26
26
27 The Hub is the central process for monitoring the state of the engines, and all task
27 The Hub is the central process for monitoring the state of the engines, and all task
28 requests and results. It has no role in execution and does not relay messages, so
28 requests and results. It has no role in execution and does not relay messages, so
29 large blocking requests or database actions in the Hub cannot impede
29 large blocking requests or database actions in the Hub cannot impede
30 job submission and results.
30 job submission and results.
31
31
32 Registration (``ROUTER``)
32 Registration (``ROUTER``)
33 *************************
33 *************************
34
34
35 The first function of the Hub is to facilitate and monitor connections of clients
35 The first function of the Hub is to facilitate and monitor connections of clients
36 and engines. Both client and engine registration are handled by the same socket, so only
36 and engines. Both client and engine registration are handled by the same socket, so only
37 one ip/port pair is needed to connect any number of engines and clients.
37 one ip/port pair is needed to connect any number of engines and clients.
38
38
39 Engines register with the ``zmq.IDENTITY`` of their two ``DEALER`` sockets, one for the
39 Engines register with the ``zmq.IDENTITY`` of their two ``DEALER`` sockets, one for the
40 queue, which receives execute requests, and one for the heartbeat, which is used to
40 queue, which receives execute requests, and one for the heartbeat, which is used to
41 monitor the survival of the Engine process.
41 monitor the survival of the Engine process.
42
42
43 Message type: ``registration_request``::
43 Message type: ``registration_request``::
44
44
45 content = {
45 content = {
46 'queue' : 'abcd-1234-...', # the MUX queue zmq.IDENTITY
46 'uuid' : 'abcd-1234-...', # the zmq.IDENTITY of the engine's sockets
47 'control' : 'abcd-1234-...', # the control queue zmq.IDENTITY
48 'heartbeat' : 'abcd-1234-...' # the heartbeat zmq.IDENTITY
49 }
47 }
50
48
51 .. note::
49 .. note::
52
50
53 These are always the same, at least for now.
51 These are always the same, at least for now.
54
52
55 The Controller replies to an Engine's registration request with the engine's integer ID,
53 The Controller replies to an Engine's registration request with the engine's integer ID,
56 along with the remaining connection information for the heartbeat process and the
54 along with the remaining connection information for the heartbeat process and the
57 kernel queue socket(s). The message status will be an error if the Engine requests IDs that
55 kernel queue socket(s). The message status will be an error if the Engine requests IDs that
58 are already in use.
56 are already in use.
59
57
60 Message type: ``registration_reply``::
58 Message type: ``registration_reply``::
61
59
62 content = {
60 content = {
63 'status' : 'ok', # or 'error'
61 'status' : 'ok', # or 'error'
64 # if ok:
62 # if ok:
65 'id' : 0, # int, the engine id
63 'id' : 0, # int, the engine id
66 'queue' : 'tcp://127.0.0.1:12345', # connection for engine side of the queue
67 'control' : 'tcp://...', # addr for control queue
68 'heartbeat' : ('tcp://...','tcp://...'), # tuple containing two interfaces needed for heartbeat
69 'task' : 'tcp://...', # addr for task queue, or None if no task queue running
70 }
64 }
71
65
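For illustration, a minimal engine-side sketch of this request (the address, uuid, and key
below are placeholders; in practice the ``Session`` must be constructed with the same
signing key as the Hub)::

    import zmq
    from IPython.zmq.session import Session

    ctx = zmq.Context.instance()
    session = Session(key=b'hub-signing-key')           # placeholder key

    # the registration socket is a DEALER whose IDENTITY is the engine's uuid
    reg = ctx.socket(zmq.DEALER)
    reg.setsockopt(zmq.IDENTITY, b'abcd-1234-engine')   # placeholder uuid
    reg.connect('tcp://127.0.0.1:12345')                # placeholder Hub address

    session.send(reg, 'registration_request',
                 content={'uuid': 'abcd-1234-engine'})
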
72 Clients use the same socket as engines to start their connections. Connection requests
66 Clients use the same socket as engines to start their connections. Connection requests
73 from clients need no information:
67 from clients need no information:
74
68
75 Message type: ``connection_request``::
69 Message type: ``connection_request``::
76
70
77 content = {}
71 content = {}
78
72
79 The reply to a Client connection request contains the connection information for the
73 The reply to a Client connection request contains the connection information for the
80 multiplexer and load balanced queues, as well as the address for direct hub
74 multiplexer and load balanced queues, as well as the address for direct hub
81 queries. If any of these addresses is `None`, that functionality is not available.
75 queries. If any of these addresses is `None`, that functionality is not available.
82
76
83 Message type: ``connection_reply``::
77 Message type: ``connection_reply``::
84
78
85 content = {
79 content = {
86 'status' : 'ok', # or 'error'
80 'status' : 'ok', # or 'error'
87 # if ok:
88 'queue' : 'tcp://127.0.0.1:12345', # connection for client side of the MUX queue
89 'task' : ('lru','tcp...'), # routing scheme and addr for task queue (len 2 tuple)
90 'query' : 'tcp...', # addr for methods to query the hub, like queue_request, etc.
91 'control' : 'tcp...', # addr for control methods, like abort, etc.
92 }
81 }
93
82
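A matching client-side sketch, reusing the ``ctx`` and ``session`` above and parsing the
reply with the ``Session`` helpers used throughout the codebase::

    client = ctx.socket(zmq.DEALER)
    client.setsockopt(zmq.IDENTITY, b'client-5678')  # placeholder client uuid
    client.connect('tcp://127.0.0.1:12345')          # same socket as registration

    session.send(client, 'connection_request', content={})
    idents, raw = session.feed_identities(client.recv_multipart())
    reply = session.unserialize(raw)
    print(reply['content']['status'])                # 'ok' or 'error'
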
94 Heartbeat
83 Heartbeat
95 *********
84 *********
96
85
97 The hub uses a heartbeat system to monitor engines, and track when they become
86 The hub uses a heartbeat system to monitor engines, and track when they become
98 unresponsive. As described in :ref:`messaging <messaging>`, and shown in :ref:`connections
98 unresponsive. The mechanism is described in :ref:`messaging <messaging>`, and shown in
87 unresponsive. The mechanism is described in :ref:`messaging <messaging>`, and shown in
99 :ref:`connections <parallel_connections>`.
88 :ref:`connections <parallel_connections>`.
89
101 Notification (``PUB``)
90 Notification (``PUB``)
102 **********************
91 **********************
103
92
104 The hub publishes all engine registration/unregistration events on a ``PUB`` socket.
93 The hub publishes all engine registration/unregistration events on a ``PUB`` socket.
105 This allows clients to have up-to-date engine ID sets without polling. Registration
94 This allows clients to have up-to-date engine ID sets without polling. Registration
106 notifications contain both the integer engine ID and the queue ID, which is necessary for
95 notifications contain both the integer engine ID and the queue ID, which is necessary for
107 sending messages via the Multiplexer Queue and Control Queues.
96 sending messages via the Multiplexer Queue and Control Queues.
108
97
109 Message type: ``registration_notification``::
98 Message type: ``registration_notification``::
110
99
111 content = {
100 content = {
112 'id' : 0, # engine ID that has been registered
101 'id' : 0, # engine ID that has been registered
113 'queue' : 'engine_id' # the IDENT for the engine's queue
102 'uuid' : 'engine_id' # the IDENT for the engine's sockets
114 }
103 }
115
104
116 Message type : ``unregistration_notification``::
105 Message type : ``unregistration_notification``::
117
106
118 content = {
107 content = {
119 'id' : 0 # engine ID that has been unregistered
108 'id' : 0 # engine ID that has been unregistered
109 'uuid' : 'engine_id' # the IDENT for the engine's sockets
120 }
110 }
121
111
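A sketch of a client following these events, reusing the ``ctx`` and ``session`` above
(the notification address is a placeholder)::

    sub = ctx.socket(zmq.SUB)
    sub.setsockopt(zmq.SUBSCRIBE, b'')       # subscribe to all notifications
    sub.connect('tcp://127.0.0.1:12346')     # placeholder notification address

    while True:
        idents, raw = session.feed_identities(sub.recv_multipart())
        msg = session.unserialize(raw)
        if msg['msg_type'] == 'registration_notification':
            print(msg['content'])            # e.g. {'id': 0, 'uuid': 'engine_id'}
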
122
112
123 Client Queries (``ROUTER``)
113 Client Queries (``ROUTER``)
124 ***************************
114 ***************************
125
115
126 The hub monitors and logs all queue traffic, so that clients can retrieve past
116 The hub monitors and logs all queue traffic, so that clients can retrieve past
127 results or monitor pending tasks. This information may reside in-memory on the Hub, or
117 results or monitor pending tasks. This information may reside in-memory on the Hub, or
128 on disk in a database (SQLite and MongoDB are currently supported). These requests are
118 on disk in a database (SQLite and MongoDB are currently supported). These requests are
129 handled by the same socket as registration.
119 handled by the same socket as registration.
130
120
131
121
132 :func:`queue_request` requests can specify multiple engines to query via the `targets`
122 :func:`queue_request` requests can specify multiple engines to query via the `targets`
133 element. A verbose flag can be passed, to determine whether the result should be the list
123 element. A verbose flag can be passed, to determine whether the result should be the list
134 of `msg_ids` in the queue or simply the length of each list.
124 of `msg_ids` in the queue or simply the length of each list.
135
125
136 Message type: ``queue_request``::
126 Message type: ``queue_request``::
137
127
138 content = {
128 content = {
139 'verbose' : True, # whether return should be lists themselves or just lens
129 'verbose' : True, # whether return should be lists themselves or just lens
140 'targets' : [0,3,1] # list of ints
130 'targets' : [0,3,1] # list of ints
141 }
131 }
142
132
143 The content of a reply to a :func:`queue_request` request is a dict, keyed by the engine
133 The content of a reply to a :func:`queue_request` request is a dict, keyed by the engine
144 IDs. Note that the keys will be the string representations of the integer IDs, since JSON
134 IDs. Note that the keys will be the string representations of the integer IDs, since JSON
145 object keys must be strings. The three keys of each dict are::
135 object keys must be strings. The three keys of each dict are::
146
136
147 'completed' : messages submitted via any queue that ran on the engine
137 'completed' : messages submitted via any queue that ran on the engine
148 'queue' : jobs submitted via MUX queue, whose results have not been received
138 'queue' : jobs submitted via MUX queue, whose results have not been received
149 'tasks' : tasks that are known to have been submitted to the engine, but
139 'tasks' : tasks that are known to have been submitted to the engine, but
150 have not completed. Note that with the pure zmq scheduler, this will
140 have not completed. Note that with the pure zmq scheduler, this will
151 always be 0/[].
141 always be 0/[].
152
142
153 Message type: ``queue_reply``::
143 Message type: ``queue_reply``::
154
144
155 content = {
145 content = {
156 'status' : 'ok', # or 'error'
146 'status' : 'ok', # or 'error'
157 # if verbose=False:
147 # if verbose=False:
158 '0' : {'completed' : 1, 'queue' : 7, 'tasks' : 0},
148 '0' : {'completed' : 1, 'queue' : 7, 'tasks' : 0},
159 # if verbose=True:
149 # if verbose=True:
160 '1' : {'completed' : ['abcd-...','1234-...'], 'queue' : ['58008-'], 'tasks' : []},
150 '1' : {'completed' : ['abcd-...','1234-...'], 'queue' : ['58008-'], 'tasks' : []},
161 }
151 }
162
152
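A sketch of such a query, with the ``client`` socket assumed to be connected to the Hub's
registration/query ``ROUTER`` as above::

    session.send(client, 'queue_request',
                 content={'verbose': False, 'targets': [0, 3, 1]})
    idents, raw = session.feed_identities(client.recv_multipart())
    status = session.unserialize(raw)['content']
    print(status.get('0'))   # e.g. {'completed': 1, 'queue': 7, 'tasks': 0}
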
163 Clients can request individual results directly from the hub. This is primarily for
153 Clients can request individual results directly from the hub. This is primarily for
164 gathering results of executions not submitted by the requesting client, as the client
154 gathering results of executions not submitted by the requesting client, as the client
165 will have all its own results already. Requests are made by msg_id, and can contain one or
155 will have all its own results already. Requests are made by msg_id, and can contain one or
166 more msg_ids. An additional boolean key 'statusonly' can be used to request not the
156 more msg_ids. An additional boolean key 'statusonly' can be used to request not the
167 results themselves, but simply the status of the jobs.
157 results themselves, but simply the status of the jobs.
168
158
169 Message type: ``result_request``::
159 Message type: ``result_request``::
170
160
171 content = {
161 content = {
172 'msg_ids' : ['uuid','...'], # list of strs
162 'msg_ids' : ['uuid','...'], # list of strs
173 'targets' : [1,2,3], # list of int ids or uuids
163 'targets' : [1,2,3], # list of int ids or uuids
174 'statusonly' : False, # bool
164 'statusonly' : False, # bool
175 }
165 }
176
166
177 The :func:`result_request` reply contains the content objects of the actual execution
167 The :func:`result_request` reply contains the content objects of the actual execution
178 reply messages. If `statusonly=True`, then there will be only the 'pending' and
168 reply messages. If `statusonly=True`, then there will be only the 'pending' and
179 'completed' lists.
169 'completed' lists.
180
170
181
171
182 Message type: ``result_reply``::
172 Message type: ``result_reply``::
183
173
184 content = {
174 content = {
185 'status' : 'ok', # else error
175 'status' : 'ok', # else error
186 # if ok:
176 # if ok:
187 'acbd-...' : msg, # the content dict is keyed by msg_ids,
177 'acbd-...' : msg, # the content dict is keyed by msg_ids,
188 # values are the result messages
178 # values are the result messages
189 # there will be none of these if `statusonly=True`
179 # there will be none of these if `statusonly=True`
190 'pending' : ['msg_id','...'], # msg_ids still pending
180 'pending' : ['msg_id','...'], # msg_ids still pending
191 'completed' : ['msg_id','...'], # list of completed msg_ids
181 'completed' : ['msg_id','...'], # list of completed msg_ids
192 }
182 }
193 buffers = ['bufs','...'] # the buffers that contained the results of the objects.
183 buffers = ['bufs','...'] # the buffers that contained the results of the objects.
194 # this will be empty if no messages are complete, or if
184 # this will be empty if no messages are complete, or if
195 # statusonly is True.
185 # statusonly is True.
196
186
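A status-only poll might look like this sketch (the msg_id is a placeholder)::

    session.send(client, 'result_request',
                 content={'msg_ids': ['abcd-...'], 'targets': [], 'statusonly': True})
    idents, raw = session.feed_identities(client.recv_multipart())
    reply = session.unserialize(raw)['content']
    print(reply['pending'], reply['completed'])  # no result buffers when statusonly=True
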
197 For memory management purposes, Clients can also instruct the hub to forget the
187 For memory management purposes, Clients can also instruct the hub to forget the
198 results of messages. This can be done by message ID or engine ID. Individual messages are
188 results of messages. This can be done by message ID or engine ID. Individual messages are
199 dropped by msg_id, and all messages completed on an engine are dropped by engine ID. This
189 dropped by msg_id, and all messages completed on an engine are dropped by engine ID. This
200 may no longer be necessary with the mongodb-based message logging backend.
190 may no longer be necessary with the mongodb-based message logging backend.
201
191
202 If the msg_ids element is the string ``'all'`` instead of a list, then all completed
192 If the msg_ids element is the string ``'all'`` instead of a list, then all completed
203 results are forgotten.
193 results are forgotten.
204
194
205 Message type: ``purge_request``::
195 Message type: ``purge_request``::
206
196
207 content = {
197 content = {
208 'msg_ids' : ['id1', 'id2',...], # list of msg_ids or 'all'
198 'msg_ids' : ['id1', 'id2',...], # list of msg_ids or 'all'
209 'engine_ids' : [0,2,4] # list of engine IDs
199 'engine_ids' : [0,2,4] # list of engine IDs
210 }
200 }
211
201
212 The reply to a purge request is simply the status 'ok' if the request succeeded, or an
202 The reply to a purge request is simply the status 'ok' if the request succeeded, or an
213 explanation of why it failed, such as requesting the purge of a nonexistent or pending
203 explanation of why it failed, such as requesting the purge of a nonexistent or pending
214 message.
204 message.
215
205
216 Message type: ``purge_reply``::
206 Message type: ``purge_reply``::
217
207
218 content = {
208 content = {
219 'status' : 'ok', # or 'error'
209 'status' : 'ok', # or 'error'
220 }
210 }
221
211
222
212
223 Schedulers
213 Schedulers
224 ----------
214 ----------
225
215
226 There are three basic schedulers:
216 There are three basic schedulers:
227
217
228 * Task Scheduler
218 * Task Scheduler
229 * MUX Scheduler
219 * MUX Scheduler
230 * Control Scheduler
220 * Control Scheduler
231
221
232 The MUX and Control schedulers are simple MonitoredQueue ØMQ devices, with ``ROUTER``
222 The MUX and Control schedulers are simple MonitoredQueue ØMQ devices, with ``ROUTER``
233 sockets on either side. This allows the queue to relay individual messages to particular
223 sockets on either side. This allows the queue to relay individual messages to particular
234 targets via ``zmq.IDENTITY`` routing. The Task scheduler may be a MonitoredQueue ØMQ
224 targets via ``zmq.IDENTITY`` routing. The Task scheduler may be a MonitoredQueue ØMQ
235 device, in which case the client-facing socket is ``ROUTER``, and the engine-facing socket
225 device, in which case the client-facing socket is ``ROUTER``, and the engine-facing socket
236 is ``DEALER``. The result of this is that client-submitted messages are load-balanced via
226 is ``DEALER``. The result of this is that client-submitted messages are load-balanced via
237 the ``DEALER`` socket, but the engine's replies to each message go to the requesting client.
227 the ``DEALER`` socket, but the engine's replies to each message go to the requesting client.
238
228
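A sketch of constructing one of these devices with pyzmq, using placeholder ports
(mirroring how the controller application wires up its queues)::

    from zmq.devices import ProcessMonitoredQueue

    q = ProcessMonitoredQueue(zmq.ROUTER, zmq.ROUTER, zmq.PUB, b'in', b'out')
    q.bind_in('tcp://127.0.0.1:5570')        # placeholder client-facing port
    q.setsockopt_in(zmq.IDENTITY, b'mux')
    q.bind_out('tcp://127.0.0.1:5571')       # placeholder engine-facing port
    q.connect_mon('tcp://127.0.0.1:5572')    # placeholder monitor address
    q.daemon = True
    q.start()
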
239 Raw ``DEALER`` scheduling is quite primitive, and doesn't allow message introspection, so
229 Raw ``DEALER`` scheduling is quite primitive, and doesn't allow message introspection, so
240 there are also Python Schedulers that can be used. These Schedulers behave in much the
230 there are also Python Schedulers that can be used. These Schedulers behave in much the
241 same way as a MonitoredQueue does from the outside, but have rich internal logic to
231 same way as a MonitoredQueue does from the outside, but have rich internal logic to
242 determine destinations, as well as handle dependency graphs Their sockets are always
232 determine destinations, as well as handle dependency graphs Their sockets are always
243 ``ROUTER`` on both sides.
233 ``ROUTER`` on both sides.
244
234
245 The Python task schedulers have an additional message type, which informs the Hub of
235 The Python task schedulers have an additional message type, which informs the Hub of
246 the destination of a task as soon as that destination is known.
236 the destination of a task as soon as that destination is known.
247
237
248 Message type: ``task_destination``::
238 Message type: ``task_destination``::
249
239
250 content = {
240 content = {
251 'msg_id' : 'abcd-1234-...', # the msg's uuid
241 'msg_id' : 'abcd-1234-...', # the msg's uuid
252 'engine_id' : '1234-abcd-...', # the destination engine's zmq.IDENTITY
242 'engine_id' : '1234-abcd-...', # the destination engine's zmq.IDENTITY
253 }
243 }
254
244
255 :func:`apply` and :func:`apply_bound`
245 :func:`apply` and :func:`apply_bound`
256 *************************************
246 *************************************
257
247
258 In terms of message classes, the MUX scheduler and Task scheduler relay the exact same
248 In terms of message classes, the MUX scheduler and Task scheduler relay the exact same
259 message types. Their only difference lies in how the destination is selected.
249 message types. Their only difference lies in how the destination is selected.
260
250
261 The `Namespace <http://gist.github.com/483294>`_ model suggests that execution be able to
251 The `Namespace <http://gist.github.com/483294>`_ model suggests that execution be able to
262 use the model::
252 use the model::
263
253
264 ns.apply(f, *args, **kwargs)
254 ns.apply(f, *args, **kwargs)
265
255
266 which takes `f`, a function in the user's namespace, and executes ``f(*args, **kwargs)``
256 which takes `f`, a function in the user's namespace, and executes ``f(*args, **kwargs)``
267 on a remote engine, returning the result (or, for non-blocking, information facilitating
257 on a remote engine, returning the result (or, for non-blocking, information facilitating
268 later retrieval of the result). This model, unlike the execute message which just uses a
258 later retrieval of the result). This model, unlike the execute message which just uses a
269 code string, must be able to send arbitrary (pickleable) Python objects, and should ideally
259 code string, must be able to send arbitrary (pickleable) Python objects, and should ideally
270 copy as little data as possible. The `buffers` property of a Message was introduced for this
260 copy as little data as possible. The `buffers` property of a Message was introduced for this
271 purpose.
261 purpose.
272
262
273 Utility method :func:`build_apply_message` in :mod:`IPython.zmq.streamsession` wraps a
263 Utility method :func:`build_apply_message` in :mod:`IPython.zmq.streamsession` wraps a
274 function signature and builds a sendable buffer format for minimal data copying (exactly
264 function signature and builds a sendable buffer format for minimal data copying (exactly
275 zero copies of numpy array data or buffers or large strings).
265 zero copies of numpy array data or buffers or large strings).
276
266
277 Message type: ``apply_request``::
267 Message type: ``apply_request``::
278
268
279 content = {
269 content = {
280 'bound' : True, # whether to execute in the engine's namespace or unbound
270 'bound' : True, # whether to execute in the engine's namespace or unbound
281 'after' : ['msg_id',...], # list of msg_ids or output of Dependency.as_dict()
271 'after' : ['msg_id',...], # list of msg_ids or output of Dependency.as_dict()
282 'follow' : ['msg_id',...], # list of msg_ids or output of Dependency.as_dict()
272 'follow' : ['msg_id',...], # list of msg_ids or output of Dependency.as_dict()
283
273
284 }
274 }
285 buffers = ['...'] # at least 3 in length
275 buffers = ['...'] # at least 3 in length
286 # as built by build_apply_message(f,args,kwargs)
276 # as built by build_apply_message(f,args,kwargs)
287
277
288 after/follow represent task dependencies. 'after' corresponds to a time dependency. The
278 after/follow represent task dependencies. 'after' corresponds to a time dependency. The
289 request will not arrive at an engine until the 'after' dependency tasks have completed.
279 request will not arrive at an engine until the 'after' dependency tasks have completed.
290 'follow' corresponds to a location dependency. The task will be submitted to the same
280 'follow' corresponds to a location dependency. The task will be submitted to the same
291 engine as these msg_ids (see :class:`Dependency` docs for details).
281 engine as these msg_ids (see :class:`Dependency` docs for details).
292
282
293 Message type: ``apply_reply``::
283 Message type: ``apply_reply``::
294
284
295 content = {
285 content = {
296 'status' : 'ok' # 'ok' or 'error'
286 'status' : 'ok' # 'ok' or 'error'
297 # other error info here, as in other messages
287 # other error info here, as in other messages
298 }
288 }
299 buffers = ['...'] # either 1 or 2 in length
289 buffers = ['...'] # either 1 or 2 in length
300 # a serialization of the return value of f(*args,**kwargs)
290 # a serialization of the return value of f(*args,**kwargs)
301 # only populated if status is 'ok'
291 # only populated if status is 'ok'
302
292
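A hedged sketch of submitting work this way, assuming ``build_apply_message`` has been
imported from the module named above and ``mux`` is a socket connected to the MUX
scheduler::

    def double(x):
        return 2 * x

    # build_apply_message(f, args, kwargs) produces the buffer list shown above
    bufs = build_apply_message(double, (21,), {})

    session.send(mux, 'apply_request',
                 content={'bound': True, 'after': [], 'follow': []},
                 buffers=bufs)
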
303 All engine execution and data movement is performed via apply messages.
293 All engine execution and data movement is performed via apply messages.
304
294
305 Control Messages
295 Control Messages
306 ----------------
296 ----------------
307
297
308 Messages that interact with the engines, but are not meant to execute code, are submitted
298 Messages that interact with the engines, but are not meant to execute code, are submitted
309 via the Control queue. These messages have high priority, and are thus received and
299 via the Control queue. These messages have high priority, and are thus received and
310 handled before any execution requests.
300 handled before any execution requests.
311
301
312 Clients may want to clear the namespace on the engine. There are neither arguments nor
302 Clients may want to clear the namespace on the engine. There are neither arguments nor
313 information involved in this request, so the content is empty.
303 information involved in this request, so the content is empty.
314
304
315 Message type: ``clear_request``::
305 Message type: ``clear_request``::
316
306
317 content = {}
307 content = {}
318
308
319 Message type: ``clear_reply``::
309 Message type: ``clear_reply``::
320
310
321 content = {
311 content = {
322 'status' : 'ok' # 'ok' or 'error'
312 'status' : 'ok' # 'ok' or 'error'
323 # other error info here, as in other messages
313 # other error info here, as in other messages
324 }
314 }
325
315
326 Clients may want to abort tasks that have not yet run. This can be done by message id, or
316 Clients may want to abort tasks that have not yet run. This can be done by message id, or
327 all enqueued messages can be aborted if None is specified.
317 all enqueued messages can be aborted if None is specified.
328
318
329 Message type: ``abort_request``::
319 Message type: ``abort_request``::
330
320
331 content = {
321 content = {
332 'msg_ids' : ['1234-...', '...'] # list of msg_ids or None
322 'msg_ids' : ['1234-...', '...'] # list of msg_ids or None
333 }
323 }
334
324
335 Message type: ``abort_reply``::
325 Message type: ``abort_reply``::
336
326
337 content = {
327 content = {
338 'status' : 'ok' # 'ok' or 'error'
328 'status' : 'ok' # 'ok' or 'error'
339 # other error info here, as in other messages
329 # other error info here, as in other messages
340 }
330 }
341
331
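A sketch of aborting all queued work on a single engine over the control channel
(addresses and identities are placeholders; the target engine is selected by prefixing
its ``zmq.IDENTITY`` via the ``ident`` argument)::

    control = ctx.socket(zmq.DEALER)
    control.setsockopt(zmq.IDENTITY, b'client-5678')  # placeholder identity
    control.connect('tcp://127.0.0.1:12347')          # placeholder control address

    # msg_ids=None aborts everything still enqueued on that engine
    session.send(control, 'abort_request',
                 content={'msg_ids': None},
                 ident=b'abcd-1234-engine')           # target engine's IDENTITY
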
342 The last action a client may want to do is shut down the kernel. If a kernel receives a
332 The last action a client may want to do is shut down the kernel. If a kernel receives a
343 shutdown request, then it aborts all queued messages, replies to the request, and exits.
333 shutdown request, then it aborts all queued messages, replies to the request, and exits.
344
334
345 Message type: ``shutdown_request``::
335 Message type: ``shutdown_request``::
346
336
347 content = {}
337 content = {}
348
338
349 Message type: ``shutdown_reply``::
339 Message type: ``shutdown_reply``::
350
340
351 content = {
341 content = {
352 'status' : 'ok' # 'ok' or 'error'
342 'status' : 'ok' # 'ok' or 'error'
353 # other error info here, as in other messages
343 # other error info here, as in other messages
354 }
344 }
355
345
356
346
357 Implementation
347 Implementation
358 --------------
348 --------------
359
349
360 There are a few differences in implementation between the `StreamSession` object used in
350 There are a few differences in implementation between the `StreamSession` object used in
361 the newparallel branch and the `Session` object, the main one being that messages are
351 the newparallel branch and the `Session` object, the main one being that messages are
362 sent in parts, rather than as a single serialized object. `StreamSession` objects also
352 sent in parts, rather than as a single serialized object. `StreamSession` objects also
363 take pack/unpack functions, which are to be used when serializing/deserializing objects.
353 take pack/unpack functions, which are to be used when serializing/deserializing objects.
364 These can be any functions that translate to/from formats that ZMQ sockets can send
354 These can be any functions that translate to/from formats that ZMQ sockets can send
365 (buffers, bytes, etc.).
355 (buffers, bytes, etc.).
366
356
367 Split Sends
357 Split Sends
368 ***********
358 ***********
369
359
370 Previously, messages were bundled as a single json object and one call to
360 Previously, messages were bundled as a single json object and one call to
371 :func:`socket.send_json`. Since the hub inspects all messages, and doesn't need to
361 :func:`socket.send_json`. Since the hub inspects all messages, and doesn't need to
372 see the content of the messages, which can be large, messages are now serialized and sent in
362 see the content of the messages, which can be large, messages are now serialized and sent in
373 pieces. All messages are sent in at least 3 parts: the header, the parent header, and the
363 pieces. All messages are sent in at least 3 parts: the header, the parent header, and the
374 content. This allows the controller to unpack and inspect the (always small) header,
364 content. This allows the controller to unpack and inspect the (always small) header,
375 without spending time unpacking the content unless the message is bound for the
365 without spending time unpacking the content unless the message is bound for the
376 controller. Buffers are added on to the end of the message, and can be any objects that
366 controller. Buffers are added on to the end of the message, and can be any objects that
377 present the buffer interface.
367 present the buffer interface.
378
368
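Conceptually, a split send amounts to the following sketch, where ``header``,
``parent_header``, ``content``, ``buffers``, and ``socket`` are assumed to be in scope
(the real ``Session`` also inserts a delimiter and an HMAC signature between the routing
prefix and the header)::

    import json

    frames = [b'engine-uuid']                      # zero or more routing identities
    for part in (header, parent_header, content):  # each dict is packed separately
        frames.append(json.dumps(part))
    frames.extend(buffers)                         # raw buffers ride along unpacked
    socket.send_multipart(frames, copy=False)
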