stop using deprecated DelayedCallback...
MinRK
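The commit replaces pyzmq's deprecated zmq.eventloop.ioloop.DelayedCallback with tornado's native IOLoop scheduling: add_callback for "run on the next loop iteration" and add_timeout with a deadline measured against the loop's own clock for delayed work. A minimal standalone sketch of the new pattern (the function name and demo callbacks are illustrative, not taken from the diff):

    from __future__ import print_function
    from tornado.ioloop import IOLoop

    def start_cluster(loop, start_controller, start_engines, delay=1.0):
        """Start the controller now and the engines `delay` seconds later."""
        def start():
            start_controller()
            # deadline is absolute, measured against the loop's clock
            loop.add_timeout(loop.time() + delay, start_engines)
        # run `start` on the next loop iteration (replaces DelayedCallback(..., 0))
        loop.add_callback(start)

    if __name__ == '__main__':
        loop = IOLoop.instance()
        start_cluster(loop,
                      lambda: print('controller started'),
                      lambda: print('engines started'))
        loop.add_timeout(loop.time() + 2, loop.stop)  # end the demo after 2s
        loop.start()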
@@ -1,173 +1,174 @@
1 """Manage IPython.parallel clusters in the notebook.
1 """Manage IPython.parallel clusters in the notebook.
2
2
3 Authors:
3 Authors:
4
4
5 * Brian Granger
5 * Brian Granger
6 """
6 """
7
7
8 #-----------------------------------------------------------------------------
8 #-----------------------------------------------------------------------------
9 # Copyright (C) 2008-2011 The IPython Development Team
9 # Copyright (C) 2008-2011 The IPython Development Team
10 #
10 #
11 # Distributed under the terms of the BSD License. The full license is in
11 # Distributed under the terms of the BSD License. The full license is in
12 # the file COPYING, distributed as part of this software.
12 # the file COPYING, distributed as part of this software.
13 #-----------------------------------------------------------------------------
13 #-----------------------------------------------------------------------------
14
14
15 #-----------------------------------------------------------------------------
15 #-----------------------------------------------------------------------------
16 # Imports
16 # Imports
17 #-----------------------------------------------------------------------------
17 #-----------------------------------------------------------------------------
18
18
19 from tornado import web
19 from tornado import web
20 from zmq.eventloop import ioloop
21
20
22 from IPython.config.configurable import LoggingConfigurable
21 from IPython.config.configurable import LoggingConfigurable
23 from IPython.utils.traitlets import Dict, Instance, CFloat
22 from IPython.utils.traitlets import Dict, Instance, Float
24 from IPython.core.profileapp import list_profiles_in
23 from IPython.core.profileapp import list_profiles_in
25 from IPython.core.profiledir import ProfileDir
24 from IPython.core.profiledir import ProfileDir
26 from IPython.utils import py3compat
25 from IPython.utils import py3compat
27 from IPython.utils.path import get_ipython_dir
26 from IPython.utils.path import get_ipython_dir
28
27
29
28
30 #-----------------------------------------------------------------------------
29 #-----------------------------------------------------------------------------
31 # Classes
30 # Classes
32 #-----------------------------------------------------------------------------
31 #-----------------------------------------------------------------------------
33
32
34
33
35
34
36
35
37 class ClusterManager(LoggingConfigurable):
36 class ClusterManager(LoggingConfigurable):
38
37
39 profiles = Dict()
38 profiles = Dict()
40
39
41 delay = CFloat(1., config=True,
40 delay = Float(1., config=True,
42 help="delay (in s) between starting the controller and the engines")
41 help="delay (in s) between starting the controller and the engines")
43
42
44 loop = Instance('zmq.eventloop.ioloop.IOLoop')
43 loop = Instance('zmq.eventloop.ioloop.IOLoop')
45 def _loop_default(self):
44 def _loop_default(self):
46 from zmq.eventloop.ioloop import IOLoop
45 from zmq.eventloop.ioloop import IOLoop
47 return IOLoop.instance()
46 return IOLoop.instance()
48
47
49 def build_launchers(self, profile_dir):
48 def build_launchers(self, profile_dir):
50 from IPython.parallel.apps.ipclusterapp import IPClusterStart
49 from IPython.parallel.apps.ipclusterapp import IPClusterStart
51
50
52 class DummyIPClusterStart(IPClusterStart):
51 class DummyIPClusterStart(IPClusterStart):
53 """Dummy subclass to skip init steps that conflict with global app.
52 """Dummy subclass to skip init steps that conflict with global app.
54
53
55 Instantiating and initializing this class should result in fully configured
54 Instantiating and initializing this class should result in fully configured
56 launchers, but no other side effects or state.
55 launchers, but no other side effects or state.
57 """
56 """
58
57
59 def init_signal(self):
58 def init_signal(self):
60 pass
59 pass
61 def reinit_logging(self):
60 def reinit_logging(self):
62 pass
61 pass
63
62
64 starter = DummyIPClusterStart(log=self.log)
63 starter = DummyIPClusterStart(log=self.log)
65 starter.initialize(['--profile-dir', profile_dir])
64 starter.initialize(['--profile-dir', profile_dir])
66 cl = starter.controller_launcher
65 cl = starter.controller_launcher
67 esl = starter.engine_launcher
66 esl = starter.engine_launcher
68 n = starter.n
67 n = starter.n
69 return cl, esl, n
68 return cl, esl, n
70
69
71 def get_profile_dir(self, name, path):
70 def get_profile_dir(self, name, path):
72 p = ProfileDir.find_profile_dir_by_name(path,name=name)
71 p = ProfileDir.find_profile_dir_by_name(path,name=name)
73 return p.location
72 return p.location
74
73
75 def update_profiles(self):
74 def update_profiles(self):
76 """List all profiles in the ipython_dir and cwd.
75 """List all profiles in the ipython_dir and cwd.
77 """
76 """
78 for path in [get_ipython_dir(), py3compat.getcwd()]:
77 for path in [get_ipython_dir(), py3compat.getcwd()]:
79 for profile in list_profiles_in(path):
78 for profile in list_profiles_in(path):
80 pd = self.get_profile_dir(profile, path)
79 pd = self.get_profile_dir(profile, path)
81 if profile not in self.profiles:
80 if profile not in self.profiles:
82 self.log.debug("Adding cluster profile '%s'" % profile)
81 self.log.debug("Adding cluster profile '%s'" % profile)
83 self.profiles[profile] = {
82 self.profiles[profile] = {
84 'profile': profile,
83 'profile': profile,
85 'profile_dir': pd,
84 'profile_dir': pd,
86 'status': 'stopped'
85 'status': 'stopped'
87 }
86 }
88
87
89 def list_profiles(self):
88 def list_profiles(self):
90 self.update_profiles()
89 self.update_profiles()
91 # sorted list, but ensure that 'default' always comes first
90 # sorted list, but ensure that 'default' always comes first
92 default_first = lambda name: name if name != 'default' else ''
91 default_first = lambda name: name if name != 'default' else ''
93 result = [self.profile_info(p) for p in sorted(self.profiles, key=default_first)]
92 result = [self.profile_info(p) for p in sorted(self.profiles, key=default_first)]
94 return result
93 return result
95
94
96 def check_profile(self, profile):
95 def check_profile(self, profile):
97 if profile not in self.profiles:
96 if profile not in self.profiles:
98 raise web.HTTPError(404, u'profile not found')
97 raise web.HTTPError(404, u'profile not found')
99
98
100 def profile_info(self, profile):
99 def profile_info(self, profile):
101 self.check_profile(profile)
100 self.check_profile(profile)
102 result = {}
101 result = {}
103 data = self.profiles.get(profile)
102 data = self.profiles.get(profile)
104 result['profile'] = profile
103 result['profile'] = profile
105 result['profile_dir'] = data['profile_dir']
104 result['profile_dir'] = data['profile_dir']
106 result['status'] = data['status']
105 result['status'] = data['status']
107 if 'n' in data:
106 if 'n' in data:
108 result['n'] = data['n']
107 result['n'] = data['n']
109 return result
108 return result
110
109
111 def start_cluster(self, profile, n=None):
110 def start_cluster(self, profile, n=None):
112 """Start a cluster for a given profile."""
111 """Start a cluster for a given profile."""
113 self.check_profile(profile)
112 self.check_profile(profile)
114 data = self.profiles[profile]
113 data = self.profiles[profile]
115 if data['status'] == 'running':
114 if data['status'] == 'running':
116 raise web.HTTPError(409, u'cluster already running')
115 raise web.HTTPError(409, u'cluster already running')
117 cl, esl, default_n = self.build_launchers(data['profile_dir'])
116 cl, esl, default_n = self.build_launchers(data['profile_dir'])
118 n = n if n is not None else default_n
117 n = n if n is not None else default_n
119 def clean_data():
118 def clean_data():
120 data.pop('controller_launcher',None)
119 data.pop('controller_launcher',None)
121 data.pop('engine_set_launcher',None)
120 data.pop('engine_set_launcher',None)
122 data.pop('n',None)
121 data.pop('n',None)
123 data['status'] = 'stopped'
122 data['status'] = 'stopped'
124 def engines_stopped(r):
123 def engines_stopped(r):
125 self.log.debug('Engines stopped')
124 self.log.debug('Engines stopped')
126 if cl.running:
125 if cl.running:
127 cl.stop()
126 cl.stop()
128 clean_data()
127 clean_data()
129 esl.on_stop(engines_stopped)
128 esl.on_stop(engines_stopped)
130 def controller_stopped(r):
129 def controller_stopped(r):
131 self.log.debug('Controller stopped')
130 self.log.debug('Controller stopped')
132 if esl.running:
131 if esl.running:
133 esl.stop()
132 esl.stop()
134 clean_data()
133 clean_data()
135 cl.on_stop(controller_stopped)
134 cl.on_stop(controller_stopped)
136
135 loop = self.loop
137 dc = ioloop.DelayedCallback(lambda: cl.start(), 0, self.loop)
136
138 dc.start()
137 def start():
139 dc = ioloop.DelayedCallback(lambda: esl.start(n), 1000*self.delay, self.loop)
138 """start the controller, then the engines after a delay"""
140 dc.start()
139 cl.start()
140 loop.add_timeout(self.loop.time() + self.delay, lambda : esl.start(n))
141 self.loop.add_callback(start)
141
142
142 self.log.debug('Cluster started')
143 self.log.debug('Cluster started')
143 data['controller_launcher'] = cl
144 data['controller_launcher'] = cl
144 data['engine_set_launcher'] = esl
145 data['engine_set_launcher'] = esl
145 data['n'] = n
146 data['n'] = n
146 data['status'] = 'running'
147 data['status'] = 'running'
147 return self.profile_info(profile)
148 return self.profile_info(profile)
148
149
149 def stop_cluster(self, profile):
150 def stop_cluster(self, profile):
150 """Stop a cluster for a given profile."""
151 """Stop a cluster for a given profile."""
151 self.check_profile(profile)
152 self.check_profile(profile)
152 data = self.profiles[profile]
153 data = self.profiles[profile]
153 if data['status'] == 'stopped':
154 if data['status'] == 'stopped':
154 raise web.HTTPError(409, u'cluster not running')
155 raise web.HTTPError(409, u'cluster not running')
155 data = self.profiles[profile]
156 data = self.profiles[profile]
156 cl = data['controller_launcher']
157 cl = data['controller_launcher']
157 esl = data['engine_set_launcher']
158 esl = data['engine_set_launcher']
158 if cl.running:
159 if cl.running:
159 cl.stop()
160 cl.stop()
160 if esl.running:
161 if esl.running:
161 esl.stop()
162 esl.stop()
162 # Return a temp info dict, the real one is updated in the on_stop
163 # Return a temp info dict, the real one is updated in the on_stop
163 # logic above.
164 # logic above.
164 result = {
165 result = {
165 'profile': data['profile'],
166 'profile': data['profile'],
166 'profile_dir': data['profile_dir'],
167 'profile_dir': data['profile_dir'],
167 'status': 'stopped'
168 'status': 'stopped'
168 }
169 }
169 return result
170 return result
170
171
171 def stop_all_clusters(self):
172 def stop_all_clusters(self):
172 for p in self.profiles.keys():
173 for p in self.profiles.keys():
173 self.stop_cluster(p)
174 self.stop_cluster(p)
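For orientation, a hypothetical driver for the ClusterManager above. The import path is an assumption based on where the notebook's cluster manager lived in this era of IPython, and note that start_cluster only schedules the launch on the IOLoop, which is already running inside the notebook server:

    # Hypothetical usage sketch; the import path is an assumption.
    from IPython.html.services.clusters.clustermanager import ClusterManager

    cm = ClusterManager()
    for info in cm.list_profiles():
        print(info)        # e.g. {'profile': 'default', 'status': 'stopped', ...}

    info = cm.start_cluster('default', n=4)   # controller now, engines after cm.delay
    print(info['status'])                     # 'running'
    cm.stop_cluster('default')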
@@ -1,260 +1,242 @@
1 # encoding: utf-8
1 # encoding: utf-8
2 """
2 """
3 The Base Application class for IPython.parallel apps
3 The Base Application class for IPython.parallel apps
4
5 Authors:
6
7 * Brian Granger
8 * Min RK
9
10 """
4 """
11
5
12 #-----------------------------------------------------------------------------
13 # Copyright (C) 2008-2011 The IPython Development Team
14 #
15 # Distributed under the terms of the BSD License. The full license is in
16 # the file COPYING, distributed as part of this software.
17 #-----------------------------------------------------------------------------
18
19 #-----------------------------------------------------------------------------
20 # Imports
21 #-----------------------------------------------------------------------------
22
6
23 import os
7 import os
24 import logging
8 import logging
25 import re
9 import re
26 import sys
10 import sys
27
11
28 from subprocess import Popen, PIPE
29
30 from IPython.config.application import catch_config_error, LevelFormatter
12 from IPython.config.application import catch_config_error, LevelFormatter
31 from IPython.core import release
13 from IPython.core import release
32 from IPython.core.crashhandler import CrashHandler
14 from IPython.core.crashhandler import CrashHandler
33 from IPython.core.application import (
15 from IPython.core.application import (
34 BaseIPythonApplication,
16 BaseIPythonApplication,
35 base_aliases as base_ip_aliases,
17 base_aliases as base_ip_aliases,
36 base_flags as base_ip_flags
18 base_flags as base_ip_flags
37 )
19 )
38 from IPython.utils.path import expand_path
20 from IPython.utils.path import expand_path
39 from IPython.utils.process import check_pid
21 from IPython.utils.process import check_pid
40 from IPython.utils import py3compat
22 from IPython.utils import py3compat
41 from IPython.utils.py3compat import unicode_type
23 from IPython.utils.py3compat import unicode_type
42
24
43 from IPython.utils.traitlets import Unicode, Bool, Instance, Dict
25 from IPython.utils.traitlets import Unicode, Bool, Instance, Dict
44
26
45 #-----------------------------------------------------------------------------
27 #-----------------------------------------------------------------------------
46 # Module errors
28 # Module errors
47 #-----------------------------------------------------------------------------
29 #-----------------------------------------------------------------------------
48
30
49 class PIDFileError(Exception):
31 class PIDFileError(Exception):
50 pass
32 pass
51
33
52
34
53 #-----------------------------------------------------------------------------
35 #-----------------------------------------------------------------------------
54 # Crash handler for this application
36 # Crash handler for this application
55 #-----------------------------------------------------------------------------
37 #-----------------------------------------------------------------------------
56
38
57 class ParallelCrashHandler(CrashHandler):
39 class ParallelCrashHandler(CrashHandler):
58 """sys.excepthook for IPython itself, leaves a detailed report on disk."""
40 """sys.excepthook for IPython itself, leaves a detailed report on disk."""
59
41
60 def __init__(self, app):
42 def __init__(self, app):
61 contact_name = release.authors['Min'][0]
43 contact_name = release.authors['Min'][0]
62 contact_email = release.author_email
44 contact_email = release.author_email
63 bug_tracker = 'https://github.com/ipython/ipython/issues'
45 bug_tracker = 'https://github.com/ipython/ipython/issues'
64 super(ParallelCrashHandler,self).__init__(
46 super(ParallelCrashHandler,self).__init__(
65 app, contact_name, contact_email, bug_tracker
47 app, contact_name, contact_email, bug_tracker
66 )
48 )
67
49
68
50
69 #-----------------------------------------------------------------------------
51 #-----------------------------------------------------------------------------
70 # Main application
52 # Main application
71 #-----------------------------------------------------------------------------
53 #-----------------------------------------------------------------------------
72 base_aliases = {}
54 base_aliases = {}
73 base_aliases.update(base_ip_aliases)
55 base_aliases.update(base_ip_aliases)
74 base_aliases.update({
56 base_aliases.update({
75 'work-dir' : 'BaseParallelApplication.work_dir',
57 'work-dir' : 'BaseParallelApplication.work_dir',
76 'log-to-file' : 'BaseParallelApplication.log_to_file',
58 'log-to-file' : 'BaseParallelApplication.log_to_file',
77 'clean-logs' : 'BaseParallelApplication.clean_logs',
59 'clean-logs' : 'BaseParallelApplication.clean_logs',
78 'log-url' : 'BaseParallelApplication.log_url',
60 'log-url' : 'BaseParallelApplication.log_url',
79 'cluster-id' : 'BaseParallelApplication.cluster_id',
61 'cluster-id' : 'BaseParallelApplication.cluster_id',
80 })
62 })
81
63
82 base_flags = {
64 base_flags = {
83 'log-to-file' : (
65 'log-to-file' : (
84 {'BaseParallelApplication' : {'log_to_file' : True}},
66 {'BaseParallelApplication' : {'log_to_file' : True}},
85 "send log output to a file"
67 "send log output to a file"
86 )
68 )
87 }
69 }
88 base_flags.update(base_ip_flags)
70 base_flags.update(base_ip_flags)
89
71
90 class BaseParallelApplication(BaseIPythonApplication):
72 class BaseParallelApplication(BaseIPythonApplication):
91 """The base Application for IPython.parallel apps
73 """The base Application for IPython.parallel apps
92
74
93 Principal extensions to BaseIPythonApplication:
75 Principal extensions to BaseIPythonApplication:
94
76
95 * work_dir
77 * work_dir
96 * remote logging via pyzmq
78 * remote logging via pyzmq
97 * IOLoop instance
79 * IOLoop instance
98 """
80 """
99
81
100 crash_handler_class = ParallelCrashHandler
82 crash_handler_class = ParallelCrashHandler
101
83
102 def _log_level_default(self):
84 def _log_level_default(self):
103 # temporarily override default_log_level to INFO
85 # temporarily override default_log_level to INFO
104 return logging.INFO
86 return logging.INFO
105
87
106 def _log_format_default(self):
88 def _log_format_default(self):
107 """override default log format to include time"""
89 """override default log format to include time"""
108 return u"%(asctime)s.%(msecs).03d [%(name)s]%(highlevel)s %(message)s"
90 return u"%(asctime)s.%(msecs).03d [%(name)s]%(highlevel)s %(message)s"
109
91
110 work_dir = Unicode(py3compat.getcwd(), config=True,
92 work_dir = Unicode(py3compat.getcwd(), config=True,
111 help='Set the working dir for the process.'
93 help='Set the working dir for the process.'
112 )
94 )
113 def _work_dir_changed(self, name, old, new):
95 def _work_dir_changed(self, name, old, new):
114 self.work_dir = unicode_type(expand_path(new))
96 self.work_dir = unicode_type(expand_path(new))
115
97
116 log_to_file = Bool(config=True,
98 log_to_file = Bool(config=True,
117 help="whether to log to a file")
99 help="whether to log to a file")
118
100
119 clean_logs = Bool(False, config=True,
101 clean_logs = Bool(False, config=True,
120 help="whether to cleanup old logfiles before starting")
102 help="whether to cleanup old logfiles before starting")
121
103
122 log_url = Unicode('', config=True,
104 log_url = Unicode('', config=True,
123 help="The ZMQ URL of the iplogger to aggregate logging.")
105 help="The ZMQ URL of the iplogger to aggregate logging.")
124
106
125 cluster_id = Unicode('', config=True,
107 cluster_id = Unicode('', config=True,
126 help="""String id to add to runtime files, to prevent name collisions when
108 help="""String id to add to runtime files, to prevent name collisions when
127 using multiple clusters with a single profile simultaneously.
109 using multiple clusters with a single profile simultaneously.
128
110
129 When set, files will be named like: 'ipcontroller-<cluster_id>-engine.json'
111 When set, files will be named like: 'ipcontroller-<cluster_id>-engine.json'
130
112
131 Since this is text inserted into filenames, typical recommendations apply:
113 Since this is text inserted into filenames, typical recommendations apply:
132 Simple character strings are ideal, and spaces are not recommended (but should
114 Simple character strings are ideal, and spaces are not recommended (but should
133 generally work).
115 generally work).
134 """
116 """
135 )
117 )
136 def _cluster_id_changed(self, name, old, new):
118 def _cluster_id_changed(self, name, old, new):
137 self.name = self.__class__.name
119 self.name = self.__class__.name
138 if new:
120 if new:
139 self.name += '-%s'%new
121 self.name += '-%s'%new
140
122
141 def _config_files_default(self):
123 def _config_files_default(self):
142 return ['ipcontroller_config.py', 'ipengine_config.py', 'ipcluster_config.py']
124 return ['ipcontroller_config.py', 'ipengine_config.py', 'ipcluster_config.py']
143
125
144 loop = Instance('zmq.eventloop.ioloop.IOLoop')
126 loop = Instance('zmq.eventloop.ioloop.IOLoop')
145 def _loop_default(self):
127 def _loop_default(self):
146 from zmq.eventloop.ioloop import IOLoop
128 from zmq.eventloop.ioloop import IOLoop
147 return IOLoop.instance()
129 return IOLoop.instance()
148
130
149 aliases = Dict(base_aliases)
131 aliases = Dict(base_aliases)
150 flags = Dict(base_flags)
132 flags = Dict(base_flags)
151
133
152 @catch_config_error
134 @catch_config_error
153 def initialize(self, argv=None):
135 def initialize(self, argv=None):
154 """initialize the app"""
136 """initialize the app"""
155 super(BaseParallelApplication, self).initialize(argv)
137 super(BaseParallelApplication, self).initialize(argv)
156 self.to_work_dir()
138 self.to_work_dir()
157 self.reinit_logging()
139 self.reinit_logging()
158
140
159 def to_work_dir(self):
141 def to_work_dir(self):
160 wd = self.work_dir
142 wd = self.work_dir
161 if unicode_type(wd) != py3compat.getcwd():
143 if unicode_type(wd) != py3compat.getcwd():
162 os.chdir(wd)
144 os.chdir(wd)
163 self.log.info("Changing to working dir: %s" % wd)
145 self.log.info("Changing to working dir: %s" % wd)
164 # This is the working dir by now.
146 # This is the working dir by now.
165 sys.path.insert(0, '')
147 sys.path.insert(0, '')
166
148
167 def reinit_logging(self):
149 def reinit_logging(self):
168 # Remove old log files
150 # Remove old log files
169 log_dir = self.profile_dir.log_dir
151 log_dir = self.profile_dir.log_dir
170 if self.clean_logs:
152 if self.clean_logs:
171 for f in os.listdir(log_dir):
153 for f in os.listdir(log_dir):
172 if re.match(r'%s-\d+\.(log|err|out)' % self.name, f):
154 if re.match(r'%s-\d+\.(log|err|out)' % self.name, f):
173 try:
155 try:
174 os.remove(os.path.join(log_dir, f))
156 os.remove(os.path.join(log_dir, f))
175 except (OSError, IOError):
157 except (OSError, IOError):
176 # probably just conflict from sibling process
158 # probably just conflict from sibling process
177 # already removing it
159 # already removing it
178 pass
160 pass
179 if self.log_to_file:
161 if self.log_to_file:
180 # Start logging to the new log file
162 # Start logging to the new log file
181 log_filename = self.name + u'-' + str(os.getpid()) + u'.log'
163 log_filename = self.name + u'-' + str(os.getpid()) + u'.log'
182 logfile = os.path.join(log_dir, log_filename)
164 logfile = os.path.join(log_dir, log_filename)
183 open_log_file = open(logfile, 'w')
165 open_log_file = open(logfile, 'w')
184 else:
166 else:
185 open_log_file = None
167 open_log_file = None
186 if open_log_file is not None:
168 if open_log_file is not None:
187 while self.log.handlers:
169 while self.log.handlers:
188 self.log.removeHandler(self.log.handlers[0])
170 self.log.removeHandler(self.log.handlers[0])
189 self._log_handler = logging.StreamHandler(open_log_file)
171 self._log_handler = logging.StreamHandler(open_log_file)
190 self.log.addHandler(self._log_handler)
172 self.log.addHandler(self._log_handler)
191 else:
173 else:
192 self._log_handler = self.log.handlers[0]
174 self._log_handler = self.log.handlers[0]
193 # Add timestamps to log format:
175 # Add timestamps to log format:
194 self._log_formatter = LevelFormatter(self.log_format,
176 self._log_formatter = LevelFormatter(self.log_format,
195 datefmt=self.log_datefmt)
177 datefmt=self.log_datefmt)
196 self._log_handler.setFormatter(self._log_formatter)
178 self._log_handler.setFormatter(self._log_formatter)
197 # do not propagate log messages to root logger
179 # do not propagate log messages to root logger
198 # ipcluster app will sometimes print duplicate messages during shutdown
180 # ipcluster app will sometimes print duplicate messages during shutdown
199 # if this is 1 (default):
181 # if this is 1 (default):
200 self.log.propagate = False
182 self.log.propagate = False
201
183
202 def write_pid_file(self, overwrite=False):
184 def write_pid_file(self, overwrite=False):
203 """Create a .pid file in the pid_dir with my pid.
185 """Create a .pid file in the pid_dir with my pid.
204
186
205 This must be called after pre_construct, which sets `self.pid_dir`.
187 This must be called after pre_construct, which sets `self.pid_dir`.
206 This raises :exc:`PIDFileError` if the pid file exists already.
188 This raises :exc:`PIDFileError` if the pid file exists already.
207 """
189 """
208 pid_file = os.path.join(self.profile_dir.pid_dir, self.name + u'.pid')
190 pid_file = os.path.join(self.profile_dir.pid_dir, self.name + u'.pid')
209 if os.path.isfile(pid_file):
191 if os.path.isfile(pid_file):
210 pid = self.get_pid_from_file()
192 pid = self.get_pid_from_file()
211 if not overwrite:
193 if not overwrite:
212 raise PIDFileError(
194 raise PIDFileError(
213 'The pid file [%s] already exists. \nThis could mean that this '
195 'The pid file [%s] already exists. \nThis could mean that this '
214 'server is already running with [pid=%s].' % (pid_file, pid)
196 'server is already running with [pid=%s].' % (pid_file, pid)
215 )
197 )
216 with open(pid_file, 'w') as f:
198 with open(pid_file, 'w') as f:
217 self.log.info("Creating pid file: %s" % pid_file)
199 self.log.info("Creating pid file: %s" % pid_file)
218 f.write(repr(os.getpid())+'\n')
200 f.write(repr(os.getpid())+'\n')
219
201
220 def remove_pid_file(self):
202 def remove_pid_file(self):
221 """Remove the pid file.
203 """Remove the pid file.
222
204
223 This should be called at shutdown by registering a callback with
205 This should be called at shutdown by registering a callback with
224 :func:`reactor.addSystemEventTrigger`. This needs to return
206 :func:`reactor.addSystemEventTrigger`. This needs to return
225 ``None``.
207 ``None``.
226 """
208 """
227 pid_file = os.path.join(self.profile_dir.pid_dir, self.name + u'.pid')
209 pid_file = os.path.join(self.profile_dir.pid_dir, self.name + u'.pid')
228 if os.path.isfile(pid_file):
210 if os.path.isfile(pid_file):
229 try:
211 try:
230 self.log.info("Removing pid file: %s" % pid_file)
212 self.log.info("Removing pid file: %s" % pid_file)
231 os.remove(pid_file)
213 os.remove(pid_file)
232 except:
214 except:
233 self.log.warn("Error removing the pid file: %s" % pid_file)
215 self.log.warn("Error removing the pid file: %s" % pid_file)
234
216
235 def get_pid_from_file(self):
217 def get_pid_from_file(self):
236 """Get the pid from the pid file.
218 """Get the pid from the pid file.
237
219
238 If the pid file doesn't exist a :exc:`PIDFileError` is raised.
220 If the pid file doesn't exist a :exc:`PIDFileError` is raised.
239 """
221 """
240 pid_file = os.path.join(self.profile_dir.pid_dir, self.name + u'.pid')
222 pid_file = os.path.join(self.profile_dir.pid_dir, self.name + u'.pid')
241 if os.path.isfile(pid_file):
223 if os.path.isfile(pid_file):
242 with open(pid_file, 'r') as f:
224 with open(pid_file, 'r') as f:
243 s = f.read().strip()
225 s = f.read().strip()
244 try:
226 try:
245 pid = int(s)
227 pid = int(s)
246 except:
228 except:
247 raise PIDFileError("invalid pid file: %s (contents: %r)"%(pid_file, s))
229 raise PIDFileError("invalid pid file: %s (contents: %r)"%(pid_file, s))
248 return pid
230 return pid
249 else:
231 else:
250 raise PIDFileError('pid file not found: %s' % pid_file)
232 raise PIDFileError('pid file not found: %s' % pid_file)
251
233
252 def check_pid(self, pid):
234 def check_pid(self, pid):
253 try:
235 try:
254 return check_pid(pid)
236 return check_pid(pid)
255 except Exception:
237 except Exception:
256 self.log.warn(
238 self.log.warn(
257 "Could not determine whether pid %i is running. "
239 "Could not determine whether pid %i is running. "
258 " Making the likely assumption that it is."%pid
240 " Making the likely assumption that it is."%pid
259 )
241 )
260 return True
242 return True
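The pid-file helpers above follow a simple write/read/verify convention; a condensed, standalone sketch of the same round trip (simplified from write_pid_file and get_pid_from_file, with the logging trimmed):

    import os

    class PIDFileError(Exception):
        pass

    def write_pid_file(pid_file, overwrite=False):
        # refuse to clobber an existing pid file unless asked to
        if os.path.isfile(pid_file) and not overwrite:
            raise PIDFileError('The pid file [%s] already exists.' % pid_file)
        with open(pid_file, 'w') as f:
            f.write(repr(os.getpid()) + '\n')   # repr(int) == str(int)

    def get_pid_from_file(pid_file):
        if not os.path.isfile(pid_file):
            raise PIDFileError('pid file not found: %s' % pid_file)
        with open(pid_file, 'r') as f:
            s = f.read().strip()
        try:
            return int(s)
        except ValueError:
            raise PIDFileError('invalid pid file: %s (contents: %r)' % (pid_file, s))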
@@ -1,618 +1,596 @@
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 # encoding: utf-8
2 # encoding: utf-8
3 """
3 """The ipcluster application."""
4 The ipcluster application.
5
6 Authors:
7
8 * Brian Granger
9 * MinRK
10
11 """
12 from __future__ import print_function
4 from __future__ import print_function
13
5
14 #-----------------------------------------------------------------------------
15 # Copyright (C) 2008-2011 The IPython Development Team
16 #
17 # Distributed under the terms of the BSD License. The full license is in
18 # the file COPYING, distributed as part of this software.
19 #-----------------------------------------------------------------------------
20
21 #-----------------------------------------------------------------------------
22 # Imports
23 #-----------------------------------------------------------------------------
24
25 import errno
6 import errno
26 import logging
7 import logging
27 import os
8 import os
28 import re
9 import re
29 import signal
10 import signal
30
11
31 from subprocess import check_call, CalledProcessError, PIPE
12 from subprocess import check_call, CalledProcessError, PIPE
32 import zmq
13 import zmq
33 from zmq.eventloop import ioloop
34
14
35 from IPython.config.application import Application, boolean_flag, catch_config_error
15 from IPython.config.application import catch_config_error
36 from IPython.config.loader import Config
16 from IPython.config.loader import Config
37 from IPython.core.application import BaseIPythonApplication
17 from IPython.core.application import BaseIPythonApplication
38 from IPython.core.profiledir import ProfileDir
18 from IPython.core.profiledir import ProfileDir
39 from IPython.utils.daemonize import daemonize
19 from IPython.utils.daemonize import daemonize
40 from IPython.utils.importstring import import_item
20 from IPython.utils.importstring import import_item
41 from IPython.utils.py3compat import string_types
21 from IPython.utils.py3compat import string_types
42 from IPython.utils.sysinfo import num_cpus
22 from IPython.utils.sysinfo import num_cpus
43 from IPython.utils.traitlets import (Integer, Unicode, Bool, CFloat, Dict, List, Any,
23 from IPython.utils.traitlets import (Integer, Unicode, Bool, CFloat, Dict, List, Any,
44 DottedObjectName)
24 DottedObjectName)
45
25
46 from IPython.parallel.apps.baseapp import (
26 from IPython.parallel.apps.baseapp import (
47 BaseParallelApplication,
27 BaseParallelApplication,
48 PIDFileError,
28 PIDFileError,
49 base_flags, base_aliases
29 base_flags, base_aliases
50 )
30 )
51
31
52
32
53 #-----------------------------------------------------------------------------
33 #-----------------------------------------------------------------------------
54 # Module level variables
34 # Module level variables
55 #-----------------------------------------------------------------------------
35 #-----------------------------------------------------------------------------
56
36
57
37
58 _description = """Start an IPython cluster for parallel computing.
38 _description = """Start an IPython cluster for parallel computing.
59
39
60 An IPython cluster consists of 1 controller and 1 or more engines.
40 An IPython cluster consists of 1 controller and 1 or more engines.
61 This command automates the startup of these processes using a wide range of
41 This command automates the startup of these processes using a wide range of
62 startup methods (SSH, local processes, PBS, mpiexec, SGE, LSF, HTCondor,
42 startup methods (SSH, local processes, PBS, mpiexec, SGE, LSF, HTCondor,
63 Windows HPC Server 2008). To start a cluster with 4 engines on your
43 Windows HPC Server 2008). To start a cluster with 4 engines on your
64 local host simply do 'ipcluster start --n=4'. For more complex usage
44 local host simply do 'ipcluster start --n=4'. For more complex usage
65 you will typically do 'ipython profile create mycluster --parallel', then edit
45 you will typically do 'ipython profile create mycluster --parallel', then edit
66 configuration files, followed by 'ipcluster start --profile=mycluster --n=4'.
46 configuration files, followed by 'ipcluster start --profile=mycluster --n=4'.
67 """
47 """
68
48
69 _main_examples = """
49 _main_examples = """
70 ipcluster start --n=4 # start a 4 node cluster on localhost
50 ipcluster start --n=4 # start a 4 node cluster on localhost
71 ipcluster start -h # show the help string for the start subcmd
51 ipcluster start -h # show the help string for the start subcmd
72
52
73 ipcluster stop -h # show the help string for the stop subcmd
53 ipcluster stop -h # show the help string for the stop subcmd
74 ipcluster engines -h # show the help string for the engines subcmd
54 ipcluster engines -h # show the help string for the engines subcmd
75 """
55 """
76
56
77 _start_examples = """
57 _start_examples = """
78 ipython profile create mycluster --parallel # create mycluster profile
58 ipython profile create mycluster --parallel # create mycluster profile
79 ipcluster start --profile=mycluster --n=4 # start mycluster with 4 nodes
59 ipcluster start --profile=mycluster --n=4 # start mycluster with 4 nodes
80 """
60 """
81
61
82 _stop_examples = """
62 _stop_examples = """
83 ipcluster stop --profile=mycluster # stop a running cluster by profile name
63 ipcluster stop --profile=mycluster # stop a running cluster by profile name
84 """
64 """
85
65
86 _engines_examples = """
66 _engines_examples = """
87 ipcluster engines --profile=mycluster --n=4 # start 4 engines only
67 ipcluster engines --profile=mycluster --n=4 # start 4 engines only
88 """
68 """
89
69
90
70
91 # Exit codes for ipcluster
71 # Exit codes for ipcluster
92
72
93 # This will be the exit code if the ipcluster appears to be running because
73 # This will be the exit code if the ipcluster appears to be running because
94 # a .pid file exists
74 # a .pid file exists
95 ALREADY_STARTED = 10
75 ALREADY_STARTED = 10
96
76
97
77
98 # This will be the exit code if ipcluster stop is run, but there is no .pid
78 # This will be the exit code if ipcluster stop is run, but there is no .pid
99 # file to be found.
79 # file to be found.
100 ALREADY_STOPPED = 11
80 ALREADY_STOPPED = 11
101
81
102 # This will be the exit code if ipcluster engines is run, but there is no .pid
82 # This will be the exit code if ipcluster engines is run, but there is no .pid
103 # file to be found.
83 # file to be found.
104 NO_CLUSTER = 12
84 NO_CLUSTER = 12
105
85
106
86
107 #-----------------------------------------------------------------------------
87 #-----------------------------------------------------------------------------
108 # Utilities
88 # Utilities
109 #-----------------------------------------------------------------------------
89 #-----------------------------------------------------------------------------
110
90
111 def find_launcher_class(clsname, kind):
91 def find_launcher_class(clsname, kind):
112 """Return a launcher for a given clsname and kind.
92 """Return a launcher for a given clsname and kind.
113
93
114 Parameters
94 Parameters
115 ==========
95 ==========
116 clsname : str
96 clsname : str
117 The full name of the launcher class, either with or without the
97 The full name of the launcher class, either with or without the
118 module path, or an abbreviation (MPI, SSH, SGE, PBS, LSF, HTCondor
98 module path, or an abbreviation (MPI, SSH, SGE, PBS, LSF, HTCondor
119 WindowsHPC).
99 WindowsHPC).
120 kind : str
100 kind : str
121 Either 'EngineSet' or 'Controller'.
101 Either 'EngineSet' or 'Controller'.
122 """
102 """
123 if '.' not in clsname:
103 if '.' not in clsname:
124 # not a module, presume it's the raw name in apps.launcher
104 # not a module, presume it's the raw name in apps.launcher
125 if kind and kind not in clsname:
105 if kind and kind not in clsname:
126 # doesn't match necessary full class name, assume it's
106 # doesn't match necessary full class name, assume it's
127 # just 'PBS' or 'MPI' etc prefix:
107 # just 'PBS' or 'MPI' etc prefix:
128 clsname = clsname + kind + 'Launcher'
108 clsname = clsname + kind + 'Launcher'
129 clsname = 'IPython.parallel.apps.launcher.'+clsname
109 clsname = 'IPython.parallel.apps.launcher.'+clsname
130 klass = import_item(clsname)
110 klass = import_item(clsname)
131 return klass
111 return klass
132
112
133 #-----------------------------------------------------------------------------
113 #-----------------------------------------------------------------------------
134 # Main application
114 # Main application
135 #-----------------------------------------------------------------------------
115 #-----------------------------------------------------------------------------
136
116
137 start_help = """Start an IPython cluster for parallel computing
117 start_help = """Start an IPython cluster for parallel computing
138
118
139 Start an ipython cluster by its profile name or cluster
119 Start an ipython cluster by its profile name or cluster
140 directory. Cluster directories contain configuration, log and
120 directory. Cluster directories contain configuration, log and
141 security related files and are named using the convention
121 security related files and are named using the convention
142 'profile_<name>' and should be created using the 'start'
122 'profile_<name>' and should be created using the 'start'
143 subcommand of 'ipcluster'. If your cluster directory is in
123 subcommand of 'ipcluster'. If your cluster directory is in
144 the cwd or the ipython directory, you can simply refer to it
124 the cwd or the ipython directory, you can simply refer to it
145 using its profile name, 'ipcluster start --n=4 --profile=<profile>',
125 using its profile name, 'ipcluster start --n=4 --profile=<profile>',
146 otherwise use the 'profile-dir' option.
126 otherwise use the 'profile-dir' option.
147 """
127 """
148 stop_help = """Stop a running IPython cluster
128 stop_help = """Stop a running IPython cluster
149
129
150 Stop a running ipython cluster by its profile name or cluster
130 Stop a running ipython cluster by its profile name or cluster
151 directory. Cluster directories are named using the convention
131 directory. Cluster directories are named using the convention
152 'profile_<name>'. If your cluster directory is in
132 'profile_<name>'. If your cluster directory is in
153 the cwd or the ipython directory, you can simply refer to it
133 the cwd or the ipython directory, you can simply refer to it
154 using its profile name, 'ipcluster stop --profile=<profile>', otherwise
134 using its profile name, 'ipcluster stop --profile=<profile>', otherwise
155 use the '--profile-dir' option.
135 use the '--profile-dir' option.
156 """
136 """
157 engines_help = """Start engines connected to an existing IPython cluster
137 engines_help = """Start engines connected to an existing IPython cluster
158
138
159 Start one or more engines to connect to an existing Cluster
139 Start one or more engines to connect to an existing Cluster
160 by profile name or cluster directory.
140 by profile name or cluster directory.
161 Cluster directories contain configuration, log and
141 Cluster directories contain configuration, log and
162 security related files and are named using the convention
142 security related files and are named using the convention
163 'profile_<name>' and should be created using the 'start'
143 'profile_<name>' and should be created using the 'start'
164 subcommand of 'ipcluster'. If your cluster directory is in
144 subcommand of 'ipcluster'. If your cluster directory is in
165 the cwd or the ipython directory, you can simply refer to it
145 the cwd or the ipython directory, you can simply refer to it
166 using its profile name, 'ipcluster engines --n=4 --profile=<profile>',
146 using its profile name, 'ipcluster engines --n=4 --profile=<profile>',
167 otherwise use the 'profile-dir' option.
147 otherwise use the 'profile-dir' option.
168 """
148 """
169 stop_aliases = dict(
149 stop_aliases = dict(
170 signal='IPClusterStop.signal',
150 signal='IPClusterStop.signal',
171 )
151 )
172 stop_aliases.update(base_aliases)
152 stop_aliases.update(base_aliases)
173
153
174 class IPClusterStop(BaseParallelApplication):
154 class IPClusterStop(BaseParallelApplication):
175 name = u'ipcluster'
155 name = u'ipcluster'
176 description = stop_help
156 description = stop_help
177 examples = _stop_examples
157 examples = _stop_examples
178
158
179 signal = Integer(signal.SIGINT, config=True,
159 signal = Integer(signal.SIGINT, config=True,
180 help="signal to use for stopping processes.")
160 help="signal to use for stopping processes.")
181
161
182 aliases = Dict(stop_aliases)
162 aliases = Dict(stop_aliases)
183
163
184 def start(self):
164 def start(self):
185 """Start the app for the stop subcommand."""
165 """Start the app for the stop subcommand."""
186 try:
166 try:
187 pid = self.get_pid_from_file()
167 pid = self.get_pid_from_file()
188 except PIDFileError:
168 except PIDFileError:
189 self.log.critical(
169 self.log.critical(
190 'Could not read pid file, cluster is probably not running.'
170 'Could not read pid file, cluster is probably not running.'
191 )
171 )
192 # Here I exit with an unusual exit status that other processes
172 # Here I exit with an unusual exit status that other processes
193 # can watch for to learn how I exited.
173 # can watch for to learn how I exited.
194 self.remove_pid_file()
174 self.remove_pid_file()
195 self.exit(ALREADY_STOPPED)
175 self.exit(ALREADY_STOPPED)
196
176
197 if not self.check_pid(pid):
177 if not self.check_pid(pid):
198 self.log.critical(
178 self.log.critical(
199 'Cluster [pid=%r] is not running.' % pid
179 'Cluster [pid=%r] is not running.' % pid
200 )
180 )
201 self.remove_pid_file()
181 self.remove_pid_file()
202 # Here I exit with an unusual exit status that other processes
182 # Here I exit with an unusual exit status that other processes
203 # can watch for to learn how I exited.
183 # can watch for to learn how I exited.
204 self.exit(ALREADY_STOPPED)
184 self.exit(ALREADY_STOPPED)
205
185
206 elif os.name=='posix':
186 elif os.name=='posix':
207 sig = self.signal
187 sig = self.signal
208 self.log.info(
188 self.log.info(
209 "Stopping cluster [pid=%r] with [signal=%r]" % (pid, sig)
189 "Stopping cluster [pid=%r] with [signal=%r]" % (pid, sig)
210 )
190 )
211 try:
191 try:
212 os.kill(pid, sig)
192 os.kill(pid, sig)
213 except OSError:
193 except OSError:
214 self.log.error("Stopping cluster failed, assuming already dead.",
194 self.log.error("Stopping cluster failed, assuming already dead.",
215 exc_info=True)
195 exc_info=True)
216 self.remove_pid_file()
196 self.remove_pid_file()
217 elif os.name=='nt':
197 elif os.name=='nt':
218 try:
198 try:
219 # kill the whole tree
199 # kill the whole tree
220 p = check_call(['taskkill', '-pid', str(pid), '-t', '-f'], stdout=PIPE,stderr=PIPE)
200 p = check_call(['taskkill', '-pid', str(pid), '-t', '-f'], stdout=PIPE,stderr=PIPE)
221 except (CalledProcessError, OSError):
201 except (CalledProcessError, OSError):
222 self.log.error("Stopping cluster failed, assuming already dead.",
202 self.log.error("Stopping cluster failed, assuming already dead.",
223 exc_info=True)
203 exc_info=True)
224 self.remove_pid_file()
204 self.remove_pid_file()
225
205
226 engine_aliases = {}
206 engine_aliases = {}
227 engine_aliases.update(base_aliases)
207 engine_aliases.update(base_aliases)
228 engine_aliases.update(dict(
208 engine_aliases.update(dict(
229 n='IPClusterEngines.n',
209 n='IPClusterEngines.n',
230 engines = 'IPClusterEngines.engine_launcher_class',
210 engines = 'IPClusterEngines.engine_launcher_class',
231 daemonize = 'IPClusterEngines.daemonize',
211 daemonize = 'IPClusterEngines.daemonize',
232 ))
212 ))
233 engine_flags = {}
213 engine_flags = {}
234 engine_flags.update(base_flags)
214 engine_flags.update(base_flags)
235
215
236 engine_flags.update(dict(
216 engine_flags.update(dict(
237 daemonize=(
217 daemonize=(
238 {'IPClusterEngines' : {'daemonize' : True}},
218 {'IPClusterEngines' : {'daemonize' : True}},
239 """run the cluster into the background (not available on Windows)""",
219 """run the cluster into the background (not available on Windows)""",
240 )
220 )
241 ))
221 ))
242 class IPClusterEngines(BaseParallelApplication):
222 class IPClusterEngines(BaseParallelApplication):
243
223
244 name = u'ipcluster'
224 name = u'ipcluster'
245 description = engines_help
225 description = engines_help
246 examples = _engines_examples
226 examples = _engines_examples
247 usage = None
227 usage = None
248 default_log_level = logging.INFO
228 default_log_level = logging.INFO
249 classes = List()
229 classes = List()
250 def _classes_default(self):
230 def _classes_default(self):
251 from IPython.parallel.apps import launcher
231 from IPython.parallel.apps import launcher
252 launchers = launcher.all_launchers
232 launchers = launcher.all_launchers
253 eslaunchers = [ l for l in launchers if 'EngineSet' in l.__name__]
233 eslaunchers = [ l for l in launchers if 'EngineSet' in l.__name__]
254 return [ProfileDir]+eslaunchers
234 return [ProfileDir]+eslaunchers
255
235
256 n = Integer(num_cpus(), config=True,
236 n = Integer(num_cpus(), config=True,
257 help="""The number of engines to start. The default is to use one for each
237 help="""The number of engines to start. The default is to use one for each
258 CPU on your machine""")
238 CPU on your machine""")
259
239
260 engine_launcher = Any(config=True, help="Deprecated, use engine_launcher_class")
240 engine_launcher = Any(config=True, help="Deprecated, use engine_launcher_class")
261 def _engine_launcher_changed(self, name, old, new):
241 def _engine_launcher_changed(self, name, old, new):
262 if isinstance(new, string_types):
242 if isinstance(new, string_types):
263 self.log.warn("WARNING: %s.engine_launcher is deprecated as of 0.12,"
243 self.log.warn("WARNING: %s.engine_launcher is deprecated as of 0.12,"
264 " use engine_launcher_class" % self.__class__.__name__)
244 " use engine_launcher_class" % self.__class__.__name__)
265 self.engine_launcher_class = new
245 self.engine_launcher_class = new
266 engine_launcher_class = DottedObjectName('LocalEngineSetLauncher',
246 engine_launcher_class = DottedObjectName('LocalEngineSetLauncher',
267 config=True,
247 config=True,
268 help="""The class for launching a set of Engines. Change this value
248 help="""The class for launching a set of Engines. Change this value
269 to use various batch systems to launch your engines, such as PBS, SGE, MPI, etc.
249 to use various batch systems to launch your engines, such as PBS, SGE, MPI, etc.
270 Each launcher class has its own set of configuration options, for making sure
250 Each launcher class has its own set of configuration options, for making sure
271 it will work in your environment.
251 it will work in your environment.
272
252
273 You can also write your own launcher, and specify its absolute import path,
253 You can also write your own launcher, and specify its absolute import path,
274 as in 'mymodule.launcher.FTLEnginesLauncher'.
254 as in 'mymodule.launcher.FTLEnginesLauncher'.
275
255
276 IPython's bundled examples include:
256 IPython's bundled examples include:
277
257
278 Local : start engines locally as subprocesses [default]
258 Local : start engines locally as subprocesses [default]
279 MPI : use mpiexec to launch engines in an MPI environment
259 MPI : use mpiexec to launch engines in an MPI environment
280 PBS : use PBS (qsub) to submit engines to a batch queue
260 PBS : use PBS (qsub) to submit engines to a batch queue
281 SGE : use SGE (qsub) to submit engines to a batch queue
261 SGE : use SGE (qsub) to submit engines to a batch queue
282 LSF : use LSF (bsub) to submit engines to a batch queue
262 LSF : use LSF (bsub) to submit engines to a batch queue
283 SSH : use SSH to start the controller
263 SSH : use SSH to start the controller
284 Note that SSH does *not* move the connection files
264 Note that SSH does *not* move the connection files
285 around, so you will likely have to do this manually
265 around, so you will likely have to do this manually
286 unless the machines are on a shared file system.
266 unless the machines are on a shared file system.
287 HTCondor : use HTCondor to submit engines to a batch queue
267 HTCondor : use HTCondor to submit engines to a batch queue
288 WindowsHPC : use Windows HPC
268 WindowsHPC : use Windows HPC
289
269
290 If you are using one of IPython's builtin launchers, you can specify just the
270 If you are using one of IPython's builtin launchers, you can specify just the
291 prefix, e.g:
271 prefix, e.g:
292
272
293 c.IPClusterEngines.engine_launcher_class = 'SSH'
273 c.IPClusterEngines.engine_launcher_class = 'SSH'
294
274
295 or:
275 or:
296
276
297 ipcluster start --engines=MPI
277 ipcluster start --engines=MPI
298
278
299 """
279 """
300 )
280 )
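The launcher classes described in the help text above are normally chosen in a profile's ipcluster_config.py. A minimal sketch using only options that appear in this diff; values are illustrative, and controller_launcher_class and delay are defined on IPClusterStart further below:

    # ipcluster_config.py -- illustrative values only
    c = get_config()   # provided by IPython's configuration machinery

    c.IPClusterEngines.n = 8                          # engine count
    c.IPClusterEngines.engine_launcher_class = 'MPI'  # builtin prefix form
    c.IPClusterStart.controller_launcher_class = 'SSH'
    c.IPClusterStart.delay = 2.0    # seconds between controller and engine start
    c.IPClusterStart.clean_logs = True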
301 daemonize = Bool(False, config=True,
281 daemonize = Bool(False, config=True,
302 help="""Daemonize the ipcluster program. This implies --log-to-file.
282 help="""Daemonize the ipcluster program. This implies --log-to-file.
303 Not available on Windows.
283 Not available on Windows.
304 """)
284 """)
305
285
306 def _daemonize_changed(self, name, old, new):
286 def _daemonize_changed(self, name, old, new):
307 if new:
287 if new:
308 self.log_to_file = True
288 self.log_to_file = True
309
289
310 early_shutdown = Integer(30, config=True, help="The timeout (in seconds)")
290 early_shutdown = Integer(30, config=True, help="The timeout (in seconds)")
311 _stopping = False
291 _stopping = False
312
292
313 aliases = Dict(engine_aliases)
293 aliases = Dict(engine_aliases)
314 flags = Dict(engine_flags)
294 flags = Dict(engine_flags)
315
295
316 @catch_config_error
296 @catch_config_error
317 def initialize(self, argv=None):
297 def initialize(self, argv=None):
318 super(IPClusterEngines, self).initialize(argv)
298 super(IPClusterEngines, self).initialize(argv)
319 self.init_signal()
299 self.init_signal()
320 self.init_launchers()
300 self.init_launchers()
321
301
322 def init_launchers(self):
302 def init_launchers(self):
323 self.engine_launcher = self.build_launcher(self.engine_launcher_class, 'EngineSet')
303 self.engine_launcher = self.build_launcher(self.engine_launcher_class, 'EngineSet')
324
304
325 def init_signal(self):
305 def init_signal(self):
326 # Setup signals
306 # Setup signals
327 signal.signal(signal.SIGINT, self.sigint_handler)
307 signal.signal(signal.SIGINT, self.sigint_handler)
328
308
329 def build_launcher(self, clsname, kind=None):
309 def build_launcher(self, clsname, kind=None):
330 """import and instantiate a Launcher based on importstring"""
310 """import and instantiate a Launcher based on importstring"""
331 try:
311 try:
332 klass = find_launcher_class(clsname, kind)
312 klass = find_launcher_class(clsname, kind)
333 except (ImportError, KeyError):
313 except (ImportError, KeyError):
334 self.log.fatal("Could not import launcher class: %r"%clsname)
314 self.log.fatal("Could not import launcher class: %r"%clsname)
335 self.exit(1)
315 self.exit(1)
336
316
337 launcher = klass(
317 launcher = klass(
338 work_dir=u'.', parent=self, log=self.log,
318 work_dir=u'.', parent=self, log=self.log,
339 profile_dir=self.profile_dir.location, cluster_id=self.cluster_id,
319 profile_dir=self.profile_dir.location, cluster_id=self.cluster_id,
340 )
320 )
341 return launcher
321 return launcher
342
322
343 def engines_started_ok(self):
323 def engines_started_ok(self):
344 self.log.info("Engines appear to have started successfully")
324 self.log.info("Engines appear to have started successfully")
345 self.early_shutdown = 0
325 self.early_shutdown = 0
346
326
347 def start_engines(self):
327 def start_engines(self):
348 # Some EngineSetLaunchers ignore `n` and use their own engine count, such as SSH:
328 # Some EngineSetLaunchers ignore `n` and use their own engine count, such as SSH:
349 n = getattr(self.engine_launcher, 'engine_count', self.n)
329 n = getattr(self.engine_launcher, 'engine_count', self.n)
350 self.log.info("Starting %s Engines with %s", n, self.engine_launcher_class)
330 self.log.info("Starting %s Engines with %s", n, self.engine_launcher_class)
351 try:
331 try:
352 self.engine_launcher.start(self.n)
332 self.engine_launcher.start(self.n)
353 except:
333 except:
354 self.log.exception("Engine start failed")
334 self.log.exception("Engine start failed")
355 raise
335 raise
356 self.engine_launcher.on_stop(self.engines_stopped_early)
336 self.engine_launcher.on_stop(self.engines_stopped_early)
357 if self.early_shutdown:
337 if self.early_shutdown:
358 ioloop.DelayedCallback(self.engines_started_ok, self.early_shutdown*1000, self.loop).start()
338 self.loop.add_timeout(self.loop.time() + self.early_shutdown, self.engines_started_ok)
359
339
360 def engines_stopped_early(self, r):
340 def engines_stopped_early(self, r):
361 if self.early_shutdown and not self._stopping:
341 if self.early_shutdown and not self._stopping:
362 self.log.error("""
342 self.log.error("""
363 Engines shutdown early, they probably failed to connect.
343 Engines shutdown early, they probably failed to connect.
364
344
365 Check the engine log files for output.
345 Check the engine log files for output.
366
346
367 If your controller and engines are not on the same machine, you probably
347 If your controller and engines are not on the same machine, you probably
368 have to instruct the controller to listen on an interface other than localhost.
348 have to instruct the controller to listen on an interface other than localhost.
369
349
370 You can set this by adding "--ip='*'" to your ControllerLauncher.controller_args.
350 You can set this by adding "--ip='*'" to your ControllerLauncher.controller_args.
371
351
372 Be sure to read our security docs before instructing your controller to listen on
352 Be sure to read our security docs before instructing your controller to listen on
373 a public interface.
353 a public interface.
374 """)
354 """)
375 self.stop_launchers()
355 self.stop_launchers()
376
356
377 return self.engines_stopped(r)
357 return self.engines_stopped(r)
378
358
379 def engines_stopped(self, r):
359 def engines_stopped(self, r):
380 return self.loop.stop()
360 return self.loop.stop()
381
361
382 def stop_engines(self):
362 def stop_engines(self):
383 if self.engine_launcher.running:
363 if self.engine_launcher.running:
384 self.log.info("Stopping Engines...")
364 self.log.info("Stopping Engines...")
385 d = self.engine_launcher.stop()
365 d = self.engine_launcher.stop()
386 return d
366 return d
387 else:
367 else:
388 return None
368 return None
389
369
390 def stop_launchers(self, r=None):
370 def stop_launchers(self, r=None):
391 if not self._stopping:
371 if not self._stopping:
392 self._stopping = True
372 self._stopping = True
393 self.log.error("IPython cluster: stopping")
373 self.log.error("IPython cluster: stopping")
394 self.stop_engines()
374 self.stop_engines()
395 # Wait a few seconds to let things shut down.
375 # Wait a few seconds to let things shut down.
396 dc = ioloop.DelayedCallback(self.loop.stop, 3000, self.loop)
376 self.loop.add_timeout(self.loop.time() + 3, self.loop.stop)
397 dc.start()
398
377
399 def sigint_handler(self, signum, frame):
378 def sigint_handler(self, signum, frame):
400 self.log.debug("SIGINT received, stopping launchers...")
379 self.log.debug("SIGINT received, stopping launchers...")
401 self.stop_launchers()
380 self.stop_launchers()
402
381
403 def start_logging(self):
382 def start_logging(self):
404 # Remove old log files of the controller and engine
383 # Remove old log files of the controller and engine
405 if self.clean_logs:
384 if self.clean_logs:
406 log_dir = self.profile_dir.log_dir
385 log_dir = self.profile_dir.log_dir
407 for f in os.listdir(log_dir):
386 for f in os.listdir(log_dir):
408 if re.match(r'ip(engine|controller)-.+\.(log|err|out)',f):
387 if re.match(r'ip(engine|controller)-.+\.(log|err|out)',f):
409 os.remove(os.path.join(log_dir, f))
388 os.remove(os.path.join(log_dir, f))
410
389
411 def start(self):
390 def start(self):
412 """Start the app for the engines subcommand."""
391 """Start the app for the engines subcommand."""
413 self.log.info("IPython cluster: started")
392 self.log.info("IPython cluster: started")
414 # First see if the cluster is already running
393 # First see if the cluster is already running
415
394
416 # Now log and daemonize
395 # Now log and daemonize
417 self.log.info(
396 self.log.info(
418 'Starting engines with [daemon=%r]' % self.daemonize
397 'Starting engines with [daemon=%r]' % self.daemonize
419 )
398 )
420 # TODO: Get daemonize working on Windows or as a Windows Server.
399 # TODO: Get daemonize working on Windows or as a Windows Server.
421 if self.daemonize:
400 if self.daemonize:
422 if os.name=='posix':
401 if os.name=='posix':
423 daemonize()
402 daemonize()
424
403
425 dc = ioloop.DelayedCallback(self.start_engines, 0, self.loop)
404 self.loop.add_callback(self.start_engines)
426 dc.start()
427 # Now write the new pid file AFTER our new forked pid is active.
405 # Now write the new pid file AFTER our new forked pid is active.
428 # self.write_pid_file()
406 # self.write_pid_file()
429 try:
407 try:
430 self.loop.start()
408 self.loop.start()
431 except KeyboardInterrupt:
409 except KeyboardInterrupt:
432 pass
410 pass
433 except zmq.ZMQError as e:
411 except zmq.ZMQError as e:
434 if e.errno == errno.EINTR:
412 if e.errno == errno.EINTR:
435 pass
413 pass
436 else:
414 else:
437 raise
415 raise
438
416
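For the zero-delay case in the hunk above, the replacement is `IOLoop.add_callback` rather than `add_timeout`: the callable simply runs on the next iteration of the loop, with no timer object to construct and start. Side by side (a sketch mirroring the diff):

    # deprecated: a timer with no delay
    #   dc = ioloop.DelayedCallback(self.start_engines, 0, self.loop)
    #   dc.start()

    # replacement: run on the next loop iteration
    self.loop.add_callback(self.start_engines)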
439 start_aliases = {}
417 start_aliases = {}
440 start_aliases.update(engine_aliases)
418 start_aliases.update(engine_aliases)
441 start_aliases.update(dict(
419 start_aliases.update(dict(
442 delay='IPClusterStart.delay',
420 delay='IPClusterStart.delay',
443 controller = 'IPClusterStart.controller_launcher_class',
421 controller = 'IPClusterStart.controller_launcher_class',
444 ))
422 ))
445 start_aliases['clean-logs'] = 'IPClusterStart.clean_logs'
423 start_aliases['clean-logs'] = 'IPClusterStart.clean_logs'
446
424
447 class IPClusterStart(IPClusterEngines):
425 class IPClusterStart(IPClusterEngines):
448
426
449 name = u'ipcluster'
427 name = u'ipcluster'
450 description = start_help
428 description = start_help
451 examples = _start_examples
429 examples = _start_examples
452 default_log_level = logging.INFO
430 default_log_level = logging.INFO
453 auto_create = Bool(True, config=True,
431 auto_create = Bool(True, config=True,
454 help="whether to create the profile_dir if it doesn't exist")
432 help="whether to create the profile_dir if it doesn't exist")
455 classes = List()
433 classes = List()
456 def _classes_default(self):
434 def _classes_default(self):
457 from IPython.parallel.apps import launcher
435 from IPython.parallel.apps import launcher
458 return [ProfileDir] + [IPClusterEngines] + launcher.all_launchers
436 return [ProfileDir] + [IPClusterEngines] + launcher.all_launchers
459
437
460 clean_logs = Bool(True, config=True,
438 clean_logs = Bool(True, config=True,
461 help="whether to cleanup old logs before starting")
439 help="whether to cleanup old logs before starting")
462
440
463 delay = CFloat(1., config=True,
441 delay = CFloat(1., config=True,
464 help="delay (in s) between starting the controller and the engines")
442 help="delay (in s) between starting the controller and the engines")
465
443
466 controller_launcher = Any(config=True, help="Deprecated, use controller_launcher_class")
444 controller_launcher = Any(config=True, help="Deprecated, use controller_launcher_class")
467 def _controller_launcher_changed(self, name, old, new):
445 def _controller_launcher_changed(self, name, old, new):
468 if isinstance(new, string_types):
446 if isinstance(new, string_types):
469 # old 0.11-style config
447 # old 0.11-style config
470 self.log.warn("WARNING: %s.controller_launcher is deprecated as of 0.12,"
448 self.log.warn("WARNING: %s.controller_launcher is deprecated as of 0.12,"
471 " use controller_launcher_class" % self.__class__.__name__)
449 " use controller_launcher_class" % self.__class__.__name__)
472 self.controller_launcher_class = new
450 self.controller_launcher_class = new
473 controller_launcher_class = DottedObjectName('LocalControllerLauncher',
451 controller_launcher_class = DottedObjectName('LocalControllerLauncher',
474 config=True,
452 config=True,
475 help="""The class for launching a Controller. Change this value if you want
453 help="""The class for launching a Controller. Change this value if you want
476 your controller to also be launched by a batch system, such as PBS, SGE, MPI, etc.
454 your controller to also be launched by a batch system, such as PBS, SGE, MPI, etc.
477
455
478 Each launcher class has its own set of configuration options, for making sure
456 Each launcher class has its own set of configuration options, for making sure
479 it will work in your environment.
457 it will work in your environment.
480
458
481 Note that using a batch launcher for the controller *does not* put it
459 Note that using a batch launcher for the controller *does not* put it
482 in the same batch job as the engines, so they will still start separately.
460 in the same batch job as the engines, so they will still start separately.
483
461
484 IPython's bundled examples include:
462 IPython's bundled examples include:
485
463
486 Local : start the controller locally as a subprocess
464 Local : start the controller locally as a subprocess
487 MPI : use mpiexec to launch the controller in an MPI universe
465 MPI : use mpiexec to launch the controller in an MPI universe
488 PBS : use PBS (qsub) to submit the controller to a batch queue
466 PBS : use PBS (qsub) to submit the controller to a batch queue
489 SGE : use SGE (qsub) to submit the controller to a batch queue
467 SGE : use SGE (qsub) to submit the controller to a batch queue
490 LSF : use LSF (bsub) to submit the controller to a batch queue
468 LSF : use LSF (bsub) to submit the controller to a batch queue
491 HTCondor : use HTCondor to submit the controller to a batch queue
469 HTCondor : use HTCondor to submit the controller to a batch queue
492 SSH : use SSH to start the controller
470 SSH : use SSH to start the controller
493 WindowsHPC : use Windows HPC
471 WindowsHPC : use Windows HPC
494
472
495 If you are using one of IPython's builtin launchers, you can specify just the
473 If you are using one of IPython's builtin launchers, you can specify just the
496 prefix, e.g:
474 prefix, e.g:
497
475
498 c.IPClusterStart.controller_launcher_class = 'SSH'
476 c.IPClusterStart.controller_launcher_class = 'SSH'
499
477
500 or:
478 or:
501
479
502 ipcluster start --controller=MPI
480 ipcluster start --controller=MPI
503
481
504 """
482 """
505 )
483 )
506 reset = Bool(False, config=True,
484 reset = Bool(False, config=True,
507 help="Whether to reset config files as part of '--create'."
485 help="Whether to reset config files as part of '--create'."
508 )
486 )
509
487
510 # flags = Dict(flags)
488 # flags = Dict(flags)
511 aliases = Dict(start_aliases)
489 aliases = Dict(start_aliases)
512
490
513 def init_launchers(self):
491 def init_launchers(self):
514 self.controller_launcher = self.build_launcher(self.controller_launcher_class, 'Controller')
492 self.controller_launcher = self.build_launcher(self.controller_launcher_class, 'Controller')
515 self.engine_launcher = self.build_launcher(self.engine_launcher_class, 'EngineSet')
493 self.engine_launcher = self.build_launcher(self.engine_launcher_class, 'EngineSet')
516
494
517 def engines_stopped(self, r):
495 def engines_stopped(self, r):
518 """prevent parent.engines_stopped from stopping everything on engine shutdown"""
496 """prevent parent.engines_stopped from stopping everything on engine shutdown"""
519 pass
497 pass
520
498
521 def start_controller(self):
499 def start_controller(self):
522 self.log.info("Starting Controller with %s", self.controller_launcher_class)
500 self.log.info("Starting Controller with %s", self.controller_launcher_class)
523 self.controller_launcher.on_stop(self.stop_launchers)
501 self.controller_launcher.on_stop(self.stop_launchers)
524 try:
502 try:
525 self.controller_launcher.start()
503 self.controller_launcher.start()
526 except:
504 except:
527 self.log.exception("Controller start failed")
505 self.log.exception("Controller start failed")
528 raise
506 raise
529
507
530 def stop_controller(self):
508 def stop_controller(self):
531 # self.log.info("In stop_controller")
509 # self.log.info("In stop_controller")
532 if self.controller_launcher and self.controller_launcher.running:
510 if self.controller_launcher and self.controller_launcher.running:
533 return self.controller_launcher.stop()
511 return self.controller_launcher.stop()
534
512
535 def stop_launchers(self, r=None):
513 def stop_launchers(self, r=None):
536 if not self._stopping:
514 if not self._stopping:
537 self.stop_controller()
515 self.stop_controller()
538 super(IPClusterStart, self).stop_launchers()
516 super(IPClusterStart, self).stop_launchers()
539
517
540 def start(self):
518 def start(self):
541 """Start the app for the start subcommand."""
519 """Start the app for the start subcommand."""
542 # First see if the cluster is already running
520 # First see if the cluster is already running
543 try:
521 try:
544 pid = self.get_pid_from_file()
522 pid = self.get_pid_from_file()
545 except PIDFileError:
523 except PIDFileError:
546 pass
524 pass
547 else:
525 else:
548 if self.check_pid(pid):
526 if self.check_pid(pid):
549 self.log.critical(
527 self.log.critical(
550 'Cluster is already running with [pid=%s]. '
528 'Cluster is already running with [pid=%s]. '
551 'Use "ipcluster stop" to stop the cluster.' % pid
529 'Use "ipcluster stop" to stop the cluster.' % pid
552 )
530 )
553 # Here I exit with an unusual exit status that other processes
531 # Here I exit with an unusual exit status that other processes
554 # can watch for to learn how I exited.
532 # can watch for to learn how I exited.
555 self.exit(ALREADY_STARTED)
533 self.exit(ALREADY_STARTED)
556 else:
534 else:
557 self.remove_pid_file()
535 self.remove_pid_file()
558
536
559
537
560 # Now log and daemonize
538 # Now log and daemonize
561 self.log.info(
539 self.log.info(
562 'Starting ipcluster with [daemon=%r]' % self.daemonize
540 'Starting ipcluster with [daemon=%r]' % self.daemonize
563 )
541 )
564 # TODO: Get daemonize working on Windows or as a Windows Server.
542 # TODO: Get daemonize working on Windows or as a Windows Server.
565 if self.daemonize:
543 if self.daemonize:
566 if os.name=='posix':
544 if os.name=='posix':
567 daemonize()
545 daemonize()
568
546
569 dc = ioloop.DelayedCallback(self.start_controller, 0, self.loop)
547 def start():
570 dc.start()
548 self.start_controller()
571 dc = ioloop.DelayedCallback(self.start_engines, 1000*self.delay, self.loop)
549 self.loop.add_timeout(self.loop.time() + self.delay, self.start_engines)
572 dc.start()
550 self.loop.add_callback(start)
573 # Now write the new pid file AFTER our new forked pid is active.
551 # Now write the new pid file AFTER our new forked pid is active.
574 self.write_pid_file()
552 self.write_pid_file()
575 try:
553 try:
576 self.loop.start()
554 self.loop.start()
577 except KeyboardInterrupt:
555 except KeyboardInterrupt:
578 pass
556 pass
579 except zmq.ZMQError as e:
557 except zmq.ZMQError as e:
580 if e.errno == errno.EINTR:
558 if e.errno == errno.EINTR:
581 pass
559 pass
582 else:
560 else:
583 raise
561 raise
584 finally:
562 finally:
585 self.remove_pid_file()
563 self.remove_pid_file()
586
564
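Note how the rewritten `IPClusterStart.start` schedules its two steps: `add_callback(start)` runs the local `start()` closure on the first loop iteration, and the closure launches the controller and only then arms the engine timer. The `delay` countdown therefore begins after `start_controller()` has returned, instead of running concurrently with controller startup as the two old `DelayedCallback`s did. Consolidated, without the diff interleaving:

    def start():
        self.start_controller()
        self.loop.add_timeout(self.loop.time() + self.delay, self.start_engines)
    self.loop.add_callback(start)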
587 base='IPython.parallel.apps.ipclusterapp.IPCluster'
565 base='IPython.parallel.apps.ipclusterapp.IPCluster'
588
566
589 class IPClusterApp(BaseIPythonApplication):
567 class IPClusterApp(BaseIPythonApplication):
590 name = u'ipcluster'
568 name = u'ipcluster'
591 description = _description
569 description = _description
592 examples = _main_examples
570 examples = _main_examples
593
571
594 subcommands = {
572 subcommands = {
595 'start' : (base+'Start', start_help),
573 'start' : (base+'Start', start_help),
596 'stop' : (base+'Stop', stop_help),
574 'stop' : (base+'Stop', stop_help),
597 'engines' : (base+'Engines', engines_help),
575 'engines' : (base+'Engines', engines_help),
598 }
576 }
599
577
600 # no aliases or flags for parent App
578 # no aliases or flags for parent App
601 aliases = Dict()
579 aliases = Dict()
602 flags = Dict()
580 flags = Dict()
603
581
604 def start(self):
582 def start(self):
605 if self.subapp is None:
583 if self.subapp is None:
606 print("No subcommand specified. Must specify one of: %s" % (self.subcommands.keys()))
584 print("No subcommand specified. Must specify one of: %s" % (self.subcommands.keys()))
607 print()
585 print()
608 self.print_description()
586 self.print_description()
609 self.print_subcommands()
587 self.print_subcommands()
610 self.exit(1)
588 self.exit(1)
611 else:
589 else:
612 return self.subapp.start()
590 return self.subapp.start()
613
591
614 launch_new_instance = IPClusterApp.launch_instance
592 launch_new_instance = IPClusterApp.launch_instance
615
593
616 if __name__ == '__main__':
594 if __name__ == '__main__':
617 launch_new_instance()
595 launch_new_instance()
618
596
@@ -1,1447 +1,1446 b''
1 # encoding: utf-8
1 # encoding: utf-8
2 """Facilities for launching IPython processes asynchronously."""
2 """Facilities for launching IPython processes asynchronously."""
3
3
4 # Copyright (c) IPython Development Team.
4 # Copyright (c) IPython Development Team.
5 # Distributed under the terms of the Modified BSD License.
5 # Distributed under the terms of the Modified BSD License.
6
6
7 import copy
7 import copy
8 import logging
8 import logging
9 import os
9 import os
10 import pipes
10 import pipes
11 import stat
11 import stat
12 import sys
12 import sys
13 import time
13 import time
14
14
15 # signal imports, handling various platforms, versions
15 # signal imports, handling various platforms, versions
16
16
17 from signal import SIGINT, SIGTERM
17 from signal import SIGINT, SIGTERM
18 try:
18 try:
19 from signal import SIGKILL
19 from signal import SIGKILL
20 except ImportError:
20 except ImportError:
21 # Windows
21 # Windows
22 SIGKILL=SIGTERM
22 SIGKILL=SIGTERM
23
23
24 try:
24 try:
25 # Windows: CTRL_C_EVENT is available on Python >= 2.7, 3.2
25 # Windows: CTRL_C_EVENT is available on Python >= 2.7, 3.2
26 from signal import CTRL_C_EVENT as SIGINT
26 from signal import CTRL_C_EVENT as SIGINT
27 except ImportError:
27 except ImportError:
28 pass
28 pass
29
29
30 from subprocess import Popen, PIPE, STDOUT
30 from subprocess import Popen, PIPE, STDOUT
31 try:
31 try:
32 from subprocess import check_output
32 from subprocess import check_output
33 except ImportError:
33 except ImportError:
34 # pre-2.7, define check_output with Popen
34 # pre-2.7, define check_output with Popen
35 def check_output(*args, **kwargs):
35 def check_output(*args, **kwargs):
36 kwargs.update(dict(stdout=PIPE))
36 kwargs.update(dict(stdout=PIPE))
37 p = Popen(*args, **kwargs)
37 p = Popen(*args, **kwargs)
38 out,err = p.communicate()
38 out,err = p.communicate()
39 return out
39 return out
40
40
41 from zmq.eventloop import ioloop
41 from zmq.eventloop import ioloop
42
42
43 from IPython.config.application import Application
43 from IPython.config.application import Application
44 from IPython.config.configurable import LoggingConfigurable
44 from IPython.config.configurable import LoggingConfigurable
45 from IPython.utils.text import EvalFormatter
45 from IPython.utils.text import EvalFormatter
46 from IPython.utils.traitlets import (
46 from IPython.utils.traitlets import (
47 Any, Integer, CFloat, List, Unicode, Dict, Instance, HasTraits, CRegExp
47 Any, Integer, CFloat, List, Unicode, Dict, Instance, HasTraits, CRegExp
48 )
48 )
49 from IPython.utils.encoding import DEFAULT_ENCODING
49 from IPython.utils.encoding import DEFAULT_ENCODING
50 from IPython.utils.path import get_home_dir, ensure_dir_exists
50 from IPython.utils.path import get_home_dir, ensure_dir_exists
51 from IPython.utils.process import find_cmd, FindCmdError
51 from IPython.utils.process import find_cmd, FindCmdError
52 from IPython.utils.py3compat import iteritems, itervalues
52 from IPython.utils.py3compat import iteritems, itervalues
53
53
54 from .win32support import forward_read_events
54 from .win32support import forward_read_events
55
55
56 from .winhpcjob import IPControllerTask, IPEngineTask, IPControllerJob, IPEngineSetJob
56 from .winhpcjob import IPControllerTask, IPEngineTask, IPControllerJob, IPEngineSetJob
57
57
58 WINDOWS = os.name == 'nt'
58 WINDOWS = os.name == 'nt'
59
59
60 #-----------------------------------------------------------------------------
60 #-----------------------------------------------------------------------------
61 # Paths to the kernel apps
61 # Paths to the kernel apps
62 #-----------------------------------------------------------------------------
62 #-----------------------------------------------------------------------------
63
63
64 ipcluster_cmd_argv = [sys.executable, "-m", "IPython.parallel.cluster"]
64 ipcluster_cmd_argv = [sys.executable, "-m", "IPython.parallel.cluster"]
65
65
66 ipengine_cmd_argv = [sys.executable, "-m", "IPython.parallel.engine"]
66 ipengine_cmd_argv = [sys.executable, "-m", "IPython.parallel.engine"]
67
67
68 ipcontroller_cmd_argv = [sys.executable, "-m", "IPython.parallel.controller"]
68 ipcontroller_cmd_argv = [sys.executable, "-m", "IPython.parallel.controller"]
69
69
70 if WINDOWS and sys.version_info < (3,):
70 if WINDOWS and sys.version_info < (3,):
71 # `python -m package` doesn't work on Windows Python 2,
71 # `python -m package` doesn't work on Windows Python 2,
72 # but `python -m module` does.
72 # but `python -m module` does.
73 ipengine_cmd_argv = [sys.executable, "-m", "IPython.parallel.apps.ipengineapp"]
73 ipengine_cmd_argv = [sys.executable, "-m", "IPython.parallel.apps.ipengineapp"]
74 ipcontroller_cmd_argv = [sys.executable, "-m", "IPython.parallel.apps.ipcontrollerapp"]
74 ipcontroller_cmd_argv = [sys.executable, "-m", "IPython.parallel.apps.ipcontrollerapp"]
75
75
76 #-----------------------------------------------------------------------------
76 #-----------------------------------------------------------------------------
77 # Base launchers and errors
77 # Base launchers and errors
78 #-----------------------------------------------------------------------------
78 #-----------------------------------------------------------------------------
79
79
80 class LauncherError(Exception):
80 class LauncherError(Exception):
81 pass
81 pass
82
82
83
83
84 class ProcessStateError(LauncherError):
84 class ProcessStateError(LauncherError):
85 pass
85 pass
86
86
87
87
88 class UnknownStatus(LauncherError):
88 class UnknownStatus(LauncherError):
89 pass
89 pass
90
90
91
91
92 class BaseLauncher(LoggingConfigurable):
92 class BaseLauncher(LoggingConfigurable):
93 """An asbtraction for starting, stopping and signaling a process."""
93 """An asbtraction for starting, stopping and signaling a process."""
94
94
95 # In all of the launchers, the work_dir is where child processes will be
95 # In all of the launchers, the work_dir is where child processes will be
96 # run. This will usually be the profile_dir, but may not be. Any work_dir
96 # run. This will usually be the profile_dir, but may not be. Any work_dir
97 # passed into the __init__ method will override the config value.
97 # passed into the __init__ method will override the config value.
98 # This should not be used to set the work_dir for the actual engine
98 # This should not be used to set the work_dir for the actual engine
99 # and controller. Instead, use their own config files or the
99 # and controller. Instead, use their own config files or the
100 # controller_args, engine_args attributes of the launchers to add
100 # controller_args, engine_args attributes of the launchers to add
101 # the work_dir option.
101 # the work_dir option.
102 work_dir = Unicode(u'.')
102 work_dir = Unicode(u'.')
103 loop = Instance('zmq.eventloop.ioloop.IOLoop')
103 loop = Instance('zmq.eventloop.ioloop.IOLoop')
104
104
105 start_data = Any()
105 start_data = Any()
106 stop_data = Any()
106 stop_data = Any()
107
107
108 def _loop_default(self):
108 def _loop_default(self):
109 return ioloop.IOLoop.instance()
109 return ioloop.IOLoop.instance()
110
110
111 def __init__(self, work_dir=u'.', config=None, **kwargs):
111 def __init__(self, work_dir=u'.', config=None, **kwargs):
112 super(BaseLauncher, self).__init__(work_dir=work_dir, config=config, **kwargs)
112 super(BaseLauncher, self).__init__(work_dir=work_dir, config=config, **kwargs)
113 self.state = 'before' # can be before, running, after
113 self.state = 'before' # can be before, running, after
114 self.stop_callbacks = []
114 self.stop_callbacks = []
115 self.start_data = None
115 self.start_data = None
116 self.stop_data = None
116 self.stop_data = None
117
117
118 @property
118 @property
119 def args(self):
119 def args(self):
120 """A list of cmd and args that will be used to start the process.
120 """A list of cmd and args that will be used to start the process.
121
121
122 This is what is passed to :func:`spawnProcess` and the first element
122 This is what is passed to :func:`spawnProcess` and the first element
123 will be the process name.
123 will be the process name.
124 """
124 """
125 return self.find_args()
125 return self.find_args()
126
126
127 def find_args(self):
127 def find_args(self):
128 """The ``.args`` property calls this to find the args list.
128 """The ``.args`` property calls this to find the args list.
129
129
130 Subclasses should implement this to construct the cmd and args.
130 Subclasses should implement this to construct the cmd and args.
131 """
131 """
132 raise NotImplementedError('find_args must be implemented in a subclass')
132 raise NotImplementedError('find_args must be implemented in a subclass')
133
133
134 @property
134 @property
135 def arg_str(self):
135 def arg_str(self):
136 """The string form of the program arguments."""
136 """The string form of the program arguments."""
137 return ' '.join(self.args)
137 return ' '.join(self.args)
138
138
139 @property
139 @property
140 def running(self):
140 def running(self):
141 """Am I running."""
141 """Am I running."""
142 if self.state == 'running':
142 if self.state == 'running':
143 return True
143 return True
144 else:
144 else:
145 return False
145 return False
146
146
147 def start(self):
147 def start(self):
148 """Start the process."""
148 """Start the process."""
149 raise NotImplementedError('start must be implemented in a subclass')
149 raise NotImplementedError('start must be implemented in a subclass')
150
150
151 def stop(self):
151 def stop(self):
152 """Stop the process and notify observers of stopping.
152 """Stop the process and notify observers of stopping.
153
153
154 This method will return None immediately.
154 This method will return None immediately.
155 To observe the actual process stopping, see :meth:`on_stop`.
155 To observe the actual process stopping, see :meth:`on_stop`.
156 """
156 """
157 raise NotImplementedError('stop must be implemented in a subclass')
157 raise NotImplementedError('stop must be implemented in a subclass')
158
158
159 def on_stop(self, f):
159 def on_stop(self, f):
160 """Register a callback to be called with this Launcher's stop_data
160 """Register a callback to be called with this Launcher's stop_data
161 when the process actually finishes.
161 when the process actually finishes.
162 """
162 """
163 if self.state=='after':
163 if self.state=='after':
164 return f(self.stop_data)
164 return f(self.stop_data)
165 else:
165 else:
166 self.stop_callbacks.append(f)
166 self.stop_callbacks.append(f)
167
167
168 def notify_start(self, data):
168 def notify_start(self, data):
169 """Call this to trigger startup actions.
169 """Call this to trigger startup actions.
170
170
171 This logs the process startup and sets the state to 'running'. It is
171 This logs the process startup and sets the state to 'running'. It is
172 a pass-through so it can be used as a callback.
172 a pass-through so it can be used as a callback.
173 """
173 """
174
174
175 self.log.debug('Process %r started: %r', self.args[0], data)
175 self.log.debug('Process %r started: %r', self.args[0], data)
176 self.start_data = data
176 self.start_data = data
177 self.state = 'running'
177 self.state = 'running'
178 return data
178 return data
179
179
180 def notify_stop(self, data):
180 def notify_stop(self, data):
181 """Call this to trigger process stop actions.
181 """Call this to trigger process stop actions.
182
182
183 This logs the process stopping and sets the state to 'after'. Call
183 This logs the process stopping and sets the state to 'after'. Call
184 this to trigger callbacks registered via :meth:`on_stop`."""
184 this to trigger callbacks registered via :meth:`on_stop`."""
185
185
186 self.log.debug('Process %r stopped: %r', self.args[0], data)
186 self.log.debug('Process %r stopped: %r', self.args[0], data)
187 self.stop_data = data
187 self.stop_data = data
188 self.state = 'after'
188 self.state = 'after'
189 for i in range(len(self.stop_callbacks)):
189 for i in range(len(self.stop_callbacks)):
190 d = self.stop_callbacks.pop()
190 d = self.stop_callbacks.pop()
191 d(data)
191 d(data)
192 return data
192 return data
193
193
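`on_stop` and `notify_stop` form a small observer pattern: callbacks registered while the process is running are queued, a callback registered after the process has already stopped fires immediately with the recorded `stop_data`, and `notify_stop` drains the queue exactly once. A hypothetical usage sketch (the launcher instance and data values are placeholders):

    launcher.on_stop(lambda data: print("stopped:", data))
    # when the child exits, the launcher itself calls something like
    #   self.notify_stop(dict(exit_code=0, pid=12345))
    # which flips state to 'after' and invokes each queued callback with
    # the stop_data; callbacks registered later get stop_data synchronously.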
194 def signal(self, sig):
194 def signal(self, sig):
195 """Signal the process.
195 """Signal the process.
196
196
197 Parameters
197 Parameters
198 ----------
198 ----------
199 sig : str or int
199 sig : str or int
200 'KILL', 'INT', etc., or any signal number
200 'KILL', 'INT', etc., or any signal number
201 """
201 """
202 raise NotImplementedError('signal must be implemented in a subclass')
202 raise NotImplementedError('signal must be implemented in a subclass')
203
203
204 class ClusterAppMixin(HasTraits):
204 class ClusterAppMixin(HasTraits):
205 """MixIn for cluster args as traits"""
205 """MixIn for cluster args as traits"""
206 profile_dir=Unicode('')
206 profile_dir=Unicode('')
207 cluster_id=Unicode('')
207 cluster_id=Unicode('')
208
208
209 @property
209 @property
210 def cluster_args(self):
210 def cluster_args(self):
211 return ['--profile-dir', self.profile_dir, '--cluster-id', self.cluster_id]
211 return ['--profile-dir', self.profile_dir, '--cluster-id', self.cluster_id]
212
212
213 class ControllerMixin(ClusterAppMixin):
213 class ControllerMixin(ClusterAppMixin):
214 controller_cmd = List(ipcontroller_cmd_argv, config=True,
214 controller_cmd = List(ipcontroller_cmd_argv, config=True,
215 help="""Popen command to launch ipcontroller.""")
215 help="""Popen command to launch ipcontroller.""")
216 # Command line arguments to ipcontroller.
216 # Command line arguments to ipcontroller.
217 controller_args = List(['--log-to-file','--log-level=%i' % logging.INFO], config=True,
217 controller_args = List(['--log-to-file','--log-level=%i' % logging.INFO], config=True,
218 help="""command-line args to pass to ipcontroller""")
218 help="""command-line args to pass to ipcontroller""")
219
219
220 class EngineMixin(ClusterAppMixin):
220 class EngineMixin(ClusterAppMixin):
221 engine_cmd = List(ipengine_cmd_argv, config=True,
221 engine_cmd = List(ipengine_cmd_argv, config=True,
222 help="""command to launch the Engine.""")
222 help="""command to launch the Engine.""")
223 # Command line arguments for ipengine.
223 # Command line arguments for ipengine.
224 engine_args = List(['--log-to-file','--log-level=%i' % logging.INFO], config=True,
224 engine_args = List(['--log-to-file','--log-level=%i' % logging.INFO], config=True,
225 help="command-line arguments to pass to ipengine"
225 help="command-line arguments to pass to ipengine"
226 )
226 )
227
227
228
228
229 #-----------------------------------------------------------------------------
229 #-----------------------------------------------------------------------------
230 # Local process launchers
230 # Local process launchers
231 #-----------------------------------------------------------------------------
231 #-----------------------------------------------------------------------------
232
232
233
233
234 class LocalProcessLauncher(BaseLauncher):
234 class LocalProcessLauncher(BaseLauncher):
235 """Start and stop an external process in an asynchronous manner.
235 """Start and stop an external process in an asynchronous manner.
236
236
237 This will launch the external process with a working directory of
237 This will launch the external process with a working directory of
238 ``self.work_dir``.
238 ``self.work_dir``.
239 """
239 """
240
240
241 # This is used to construct self.args, which is passed to
241 # This is used to construct self.args, which is passed to
242 # spawnProcess.
242 # spawnProcess.
243 cmd_and_args = List([])
243 cmd_and_args = List([])
244 poll_frequency = Integer(100) # in ms
244 poll_frequency = Integer(100) # in ms
245
245
246 def __init__(self, work_dir=u'.', config=None, **kwargs):
246 def __init__(self, work_dir=u'.', config=None, **kwargs):
247 super(LocalProcessLauncher, self).__init__(
247 super(LocalProcessLauncher, self).__init__(
248 work_dir=work_dir, config=config, **kwargs
248 work_dir=work_dir, config=config, **kwargs
249 )
249 )
250 self.process = None
250 self.process = None
251 self.poller = None
251 self.poller = None
252
252
253 def find_args(self):
253 def find_args(self):
254 return self.cmd_and_args
254 return self.cmd_and_args
255
255
256 def start(self):
256 def start(self):
257 self.log.debug("Starting %s: %r", self.__class__.__name__, self.args)
257 self.log.debug("Starting %s: %r", self.__class__.__name__, self.args)
258 if self.state == 'before':
258 if self.state == 'before':
259 self.process = Popen(self.args,
259 self.process = Popen(self.args,
260 stdout=PIPE,stderr=PIPE,stdin=PIPE,
260 stdout=PIPE,stderr=PIPE,stdin=PIPE,
261 env=os.environ,
261 env=os.environ,
262 cwd=self.work_dir
262 cwd=self.work_dir
263 )
263 )
264 if WINDOWS:
264 if WINDOWS:
265 self.stdout = forward_read_events(self.process.stdout)
265 self.stdout = forward_read_events(self.process.stdout)
266 self.stderr = forward_read_events(self.process.stderr)
266 self.stderr = forward_read_events(self.process.stderr)
267 else:
267 else:
268 self.stdout = self.process.stdout.fileno()
268 self.stdout = self.process.stdout.fileno()
269 self.stderr = self.process.stderr.fileno()
269 self.stderr = self.process.stderr.fileno()
270 self.loop.add_handler(self.stdout, self.handle_stdout, self.loop.READ)
270 self.loop.add_handler(self.stdout, self.handle_stdout, self.loop.READ)
271 self.loop.add_handler(self.stderr, self.handle_stderr, self.loop.READ)
271 self.loop.add_handler(self.stderr, self.handle_stderr, self.loop.READ)
272 self.poller = ioloop.PeriodicCallback(self.poll, self.poll_frequency, self.loop)
272 self.poller = ioloop.PeriodicCallback(self.poll, self.poll_frequency, self.loop)
273 self.poller.start()
273 self.poller.start()
274 self.notify_start(self.process.pid)
274 self.notify_start(self.process.pid)
275 else:
275 else:
276 s = 'The process was already started and has state: %r' % self.state
276 s = 'The process was already started and has state: %r' % self.state
277 raise ProcessStateError(s)
277 raise ProcessStateError(s)
278
278
279 def stop(self):
279 def stop(self):
280 return self.interrupt_then_kill()
280 return self.interrupt_then_kill()
281
281
282 def signal(self, sig):
282 def signal(self, sig):
283 if self.state == 'running':
283 if self.state == 'running':
284 if WINDOWS and sig != SIGINT:
284 if WINDOWS and sig != SIGINT:
285 # use Windows tree-kill for better child cleanup
285 # use Windows tree-kill for better child cleanup
286 check_output(['taskkill', '-pid', str(self.process.pid), '-t', '-f'])
286 check_output(['taskkill', '-pid', str(self.process.pid), '-t', '-f'])
287 else:
287 else:
288 self.process.send_signal(sig)
288 self.process.send_signal(sig)
289
289
290 def interrupt_then_kill(self, delay=2.0):
290 def interrupt_then_kill(self, delay=2.0):
291 """Send INT, wait a delay and then send KILL."""
291 """Send INT, wait a delay and then send KILL."""
292 try:
292 try:
293 self.signal(SIGINT)
293 self.signal(SIGINT)
294 except Exception:
294 except Exception:
295 self.log.debug("interrupt failed")
295 self.log.debug("interrupt failed")
296 pass
296 pass
297 self.killer = ioloop.DelayedCallback(lambda : self.signal(SIGKILL), delay*1000, self.loop)
297 self.killer = self.loop.add_timeout(self.loop.time() + delay, lambda : self.signal(SIGKILL))
298 self.killer.start()
299
298
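Unlike a `DelayedCallback`, the handle returned by `add_timeout` has no `start()`/`stop()` methods, which is why the old `self.killer.start()` line is dropped. A caller that wanted to abort the pending SIGKILL (say, once the process exits cleanly after the SIGINT) would use tornado's `remove_timeout`; this diff does not do so, so the last line below is only a sketch:

    self.killer = self.loop.add_timeout(self.loop.time() + delay,
                                        lambda: self.signal(SIGKILL))
    # hypothetical cancellation, e.g. from a process-exit callback:
    #   self.loop.remove_timeout(self.killer)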
300 # callbacks, etc:
299 # callbacks, etc:
301
300
302 def handle_stdout(self, fd, events):
301 def handle_stdout(self, fd, events):
303 if WINDOWS:
302 if WINDOWS:
304 line = self.stdout.recv()
303 line = self.stdout.recv()
305 else:
304 else:
306 line = self.process.stdout.readline()
305 line = self.process.stdout.readline()
307 # a stopped process will be readable but return empty strings
306 # a stopped process will be readable but return empty strings
308 if line:
307 if line:
309 self.log.debug(line[:-1])
308 self.log.debug(line[:-1])
310 else:
309 else:
311 self.poll()
310 self.poll()
312
311
313 def handle_stderr(self, fd, events):
312 def handle_stderr(self, fd, events):
314 if WINDOWS:
313 if WINDOWS:
315 line = self.stderr.recv()
314 line = self.stderr.recv()
316 else:
315 else:
317 line = self.process.stderr.readline()
316 line = self.process.stderr.readline()
318 # a stopped process will be readable but return empty strings
317 # a stopped process will be readable but return empty strings
319 if line:
318 if line:
320 self.log.debug(line[:-1])
319 self.log.debug(line[:-1])
321 else:
320 else:
322 self.poll()
321 self.poll()
323
322
324 def poll(self):
323 def poll(self):
325 status = self.process.poll()
324 status = self.process.poll()
326 if status is not None:
325 if status is not None:
327 self.poller.stop()
326 self.poller.stop()
328 self.loop.remove_handler(self.stdout)
327 self.loop.remove_handler(self.stdout)
329 self.loop.remove_handler(self.stderr)
328 self.loop.remove_handler(self.stderr)
330 self.notify_stop(dict(exit_code=status, pid=self.process.pid))
329 self.notify_stop(dict(exit_code=status, pid=self.process.pid))
331 return status
330 return status
332
331
333 class LocalControllerLauncher(LocalProcessLauncher, ControllerMixin):
332 class LocalControllerLauncher(LocalProcessLauncher, ControllerMixin):
334 """Launch a controller as a regular external process."""
333 """Launch a controller as a regular external process."""
335
334
336 def find_args(self):
335 def find_args(self):
337 return self.controller_cmd + self.cluster_args + self.controller_args
336 return self.controller_cmd + self.cluster_args + self.controller_args
338
337
339 def start(self):
338 def start(self):
340 """Start the controller by profile_dir."""
339 """Start the controller by profile_dir."""
341 return super(LocalControllerLauncher, self).start()
340 return super(LocalControllerLauncher, self).start()
342
341
343
342
344 class LocalEngineLauncher(LocalProcessLauncher, EngineMixin):
343 class LocalEngineLauncher(LocalProcessLauncher, EngineMixin):
345 """Launch a single engine as a regular externall process."""
344 """Launch a single engine as a regular externall process."""
346
345
347 def find_args(self):
346 def find_args(self):
348 return self.engine_cmd + self.cluster_args + self.engine_args
347 return self.engine_cmd + self.cluster_args + self.engine_args
349
348
350
349
351 class LocalEngineSetLauncher(LocalEngineLauncher):
350 class LocalEngineSetLauncher(LocalEngineLauncher):
352 """Launch a set of engines as regular external processes."""
351 """Launch a set of engines as regular external processes."""
353
352
354 delay = CFloat(0.1, config=True,
353 delay = CFloat(0.1, config=True,
355 help="""delay (in seconds) between starting each engine after the first.
354 help="""delay (in seconds) between starting each engine after the first.
356 This can help force the engines to get their ids in order, or limit
355 This can help force the engines to get their ids in order, or limit
357 process flood when starting many engines."""
356 process flood when starting many engines."""
358 )
357 )
359
358
360 # launcher class
359 # launcher class
361 launcher_class = LocalEngineLauncher
360 launcher_class = LocalEngineLauncher
362
361
363 launchers = Dict()
362 launchers = Dict()
364 stop_data = Dict()
363 stop_data = Dict()
365
364
366 def __init__(self, work_dir=u'.', config=None, **kwargs):
365 def __init__(self, work_dir=u'.', config=None, **kwargs):
367 super(LocalEngineSetLauncher, self).__init__(
366 super(LocalEngineSetLauncher, self).__init__(
368 work_dir=work_dir, config=config, **kwargs
367 work_dir=work_dir, config=config, **kwargs
369 )
368 )
370 self.stop_data = {}
369 self.stop_data = {}
371
370
372 def start(self, n):
371 def start(self, n):
373 """Start n engines by profile or profile_dir."""
372 """Start n engines by profile or profile_dir."""
374 dlist = []
373 dlist = []
375 for i in range(n):
374 for i in range(n):
376 if i > 0:
375 if i > 0:
377 time.sleep(self.delay)
376 time.sleep(self.delay)
378 el = self.launcher_class(work_dir=self.work_dir, parent=self, log=self.log,
377 el = self.launcher_class(work_dir=self.work_dir, parent=self, log=self.log,
379 profile_dir=self.profile_dir, cluster_id=self.cluster_id,
378 profile_dir=self.profile_dir, cluster_id=self.cluster_id,
380 )
379 )
381
380
382 # Copy the engine args over to each engine launcher.
381 # Copy the engine args over to each engine launcher.
383 el.engine_cmd = copy.deepcopy(self.engine_cmd)
382 el.engine_cmd = copy.deepcopy(self.engine_cmd)
384 el.engine_args = copy.deepcopy(self.engine_args)
383 el.engine_args = copy.deepcopy(self.engine_args)
385 el.on_stop(self._notice_engine_stopped)
384 el.on_stop(self._notice_engine_stopped)
386 d = el.start()
385 d = el.start()
387 self.launchers[i] = el
386 self.launchers[i] = el
388 dlist.append(d)
387 dlist.append(d)
389 self.notify_start(dlist)
388 self.notify_start(dlist)
390 return dlist
389 return dlist
391
390
392 def find_args(self):
391 def find_args(self):
393 return ['engine set']
392 return ['engine set']
394
393
395 def signal(self, sig):
394 def signal(self, sig):
396 dlist = []
395 dlist = []
397 for el in itervalues(self.launchers):
396 for el in itervalues(self.launchers):
398 d = el.signal(sig)
397 d = el.signal(sig)
399 dlist.append(d)
398 dlist.append(d)
400 return dlist
399 return dlist
401
400
402 def interrupt_then_kill(self, delay=1.0):
401 def interrupt_then_kill(self, delay=1.0):
403 dlist = []
402 dlist = []
404 for el in itervalues(self.launchers):
403 for el in itervalues(self.launchers):
405 d = el.interrupt_then_kill(delay)
404 d = el.interrupt_then_kill(delay)
406 dlist.append(d)
405 dlist.append(d)
407 return dlist
406 return dlist
408
407
409 def stop(self):
408 def stop(self):
410 return self.interrupt_then_kill()
409 return self.interrupt_then_kill()
411
410
412 def _notice_engine_stopped(self, data):
411 def _notice_engine_stopped(self, data):
413 pid = data['pid']
412 pid = data['pid']
414 for idx,el in iteritems(self.launchers):
413 for idx,el in iteritems(self.launchers):
415 if el.process.pid == pid:
414 if el.process.pid == pid:
416 break
415 break
417 self.launchers.pop(idx)
416 self.launchers.pop(idx)
418 self.stop_data[idx] = data
417 self.stop_data[idx] = data
419 if not self.launchers:
418 if not self.launchers:
420 self.notify_stop(self.stop_data)
419 self.notify_stop(self.stop_data)
421
420
422
421
423 #-----------------------------------------------------------------------------
422 #-----------------------------------------------------------------------------
424 # MPI launchers
423 # MPI launchers
425 #-----------------------------------------------------------------------------
424 #-----------------------------------------------------------------------------
426
425
427
426
428 class MPILauncher(LocalProcessLauncher):
427 class MPILauncher(LocalProcessLauncher):
429 """Launch an external process using mpiexec."""
428 """Launch an external process using mpiexec."""
430
429
431 mpi_cmd = List(['mpiexec'], config=True,
430 mpi_cmd = List(['mpiexec'], config=True,
432 help="The mpiexec command to use in starting the process."
431 help="The mpiexec command to use in starting the process."
433 )
432 )
434 mpi_args = List([], config=True,
433 mpi_args = List([], config=True,
435 help="The command line arguments to pass to mpiexec."
434 help="The command line arguments to pass to mpiexec."
436 )
435 )
437 program = List(['date'],
436 program = List(['date'],
438 help="The program to start via mpiexec.")
437 help="The program to start via mpiexec.")
439 program_args = List([],
438 program_args = List([],
440 help="The command line argument to the program."
439 help="The command line argument to the program."
441 )
440 )
442 n = Integer(1)
441 n = Integer(1)
443
442
444 def __init__(self, *args, **kwargs):
443 def __init__(self, *args, **kwargs):
445 # deprecation for old MPIExec names:
444 # deprecation for old MPIExec names:
446 config = kwargs.get('config', {})
445 config = kwargs.get('config', {})
447 for oldname in ('MPIExecLauncher', 'MPIExecControllerLauncher', 'MPIExecEngineSetLauncher'):
446 for oldname in ('MPIExecLauncher', 'MPIExecControllerLauncher', 'MPIExecEngineSetLauncher'):
448 deprecated = config.get(oldname)
447 deprecated = config.get(oldname)
449 if deprecated:
448 if deprecated:
450 newname = oldname.replace('MPIExec', 'MPI')
449 newname = oldname.replace('MPIExec', 'MPI')
451 config[newname].update(deprecated)
450 config[newname].update(deprecated)
452 self.log.warn("WARNING: %s name has been deprecated, use %s", oldname, newname)
451 self.log.warn("WARNING: %s name has been deprecated, use %s", oldname, newname)
453
452
454 super(MPILauncher, self).__init__(*args, **kwargs)
453 super(MPILauncher, self).__init__(*args, **kwargs)
455
454
456 def find_args(self):
455 def find_args(self):
457 """Build self.args using all the fields."""
456 """Build self.args using all the fields."""
458 return self.mpi_cmd + ['-n', str(self.n)] + self.mpi_args + \
457 return self.mpi_cmd + ['-n', str(self.n)] + self.mpi_args + \
459 self.program + self.program_args
458 self.program + self.program_args
460
459
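For concreteness, with the defaults above and `n = 4`, `find_args()` composes an argv along these lines (values are illustrative):

    # mpi_cmd + ['-n', str(n)] + mpi_args + program + program_args
    # for an MPIEngineSetLauncher this works out to roughly:
    #   ['mpiexec', '-n', '4', sys.executable, '-m', 'IPython.parallel.engine',
    #    '--profile-dir', '<profile_dir>', '--cluster-id', '',
    #    '--log-to-file', '--log-level=20']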
461 def start(self, n):
460 def start(self, n):
462 """Start n instances of the program using mpiexec."""
461 """Start n instances of the program using mpiexec."""
463 self.n = n
462 self.n = n
464 return super(MPILauncher, self).start()
463 return super(MPILauncher, self).start()
465
464
466
465
467 class MPIControllerLauncher(MPILauncher, ControllerMixin):
466 class MPIControllerLauncher(MPILauncher, ControllerMixin):
468 """Launch a controller using mpiexec."""
467 """Launch a controller using mpiexec."""
469
468
470 # alias back to *non-configurable* program[_args] for use in find_args()
469 # alias back to *non-configurable* program[_args] for use in find_args()
471 # this way all Controller/EngineSetLaunchers have the same form, rather
470 # this way all Controller/EngineSetLaunchers have the same form, rather
472 # than *some* having `program_args` and others `controller_args`
471 # than *some* having `program_args` and others `controller_args`
473 @property
472 @property
474 def program(self):
473 def program(self):
475 return self.controller_cmd
474 return self.controller_cmd
476
475
477 @property
476 @property
478 def program_args(self):
477 def program_args(self):
479 return self.cluster_args + self.controller_args
478 return self.cluster_args + self.controller_args
480
479
481 def start(self):
480 def start(self):
482 """Start the controller by profile_dir."""
481 """Start the controller by profile_dir."""
483 return super(MPIControllerLauncher, self).start(1)
482 return super(MPIControllerLauncher, self).start(1)
484
483
485
484
486 class MPIEngineSetLauncher(MPILauncher, EngineMixin):
485 class MPIEngineSetLauncher(MPILauncher, EngineMixin):
487 """Launch engines using mpiexec"""
486 """Launch engines using mpiexec"""
488
487
489 # alias back to *non-configurable* program[_args] for use in find_args()
488 # alias back to *non-configurable* program[_args] for use in find_args()
490 # this way all Controller/EngineSetLaunchers have the same form, rather
489 # this way all Controller/EngineSetLaunchers have the same form, rather
491 # than *some* having `program_args` and others `controller_args`
490 # than *some* having `program_args` and others `controller_args`
492 @property
491 @property
493 def program(self):
492 def program(self):
494 return self.engine_cmd
493 return self.engine_cmd
495
494
496 @property
495 @property
497 def program_args(self):
496 def program_args(self):
498 return self.cluster_args + self.engine_args
497 return self.cluster_args + self.engine_args
499
498
500 def start(self, n):
499 def start(self, n):
501 """Start n engines by profile or profile_dir."""
500 """Start n engines by profile or profile_dir."""
502 self.n = n
501 self.n = n
503 return super(MPIEngineSetLauncher, self).start(n)
502 return super(MPIEngineSetLauncher, self).start(n)
504
503
505 # deprecated MPIExec names
504 # deprecated MPIExec names
506 class DeprecatedMPILauncher(object):
505 class DeprecatedMPILauncher(object):
507 def warn(self):
506 def warn(self):
508 oldname = self.__class__.__name__
507 oldname = self.__class__.__name__
509 newname = oldname.replace('MPIExec', 'MPI')
508 newname = oldname.replace('MPIExec', 'MPI')
510 self.log.warn("WARNING: %s name is deprecated, use %s", oldname, newname)
509 self.log.warn("WARNING: %s name is deprecated, use %s", oldname, newname)
511
510
512 class MPIExecLauncher(MPILauncher, DeprecatedMPILauncher):
511 class MPIExecLauncher(MPILauncher, DeprecatedMPILauncher):
513 """Deprecated, use MPILauncher"""
512 """Deprecated, use MPILauncher"""
514 def __init__(self, *args, **kwargs):
513 def __init__(self, *args, **kwargs):
515 super(MPIExecLauncher, self).__init__(*args, **kwargs)
514 super(MPIExecLauncher, self).__init__(*args, **kwargs)
516 self.warn()
515 self.warn()
517
516
518 class MPIExecControllerLauncher(MPIControllerLauncher, DeprecatedMPILauncher):
517 class MPIExecControllerLauncher(MPIControllerLauncher, DeprecatedMPILauncher):
519 """Deprecated, use MPIControllerLauncher"""
518 """Deprecated, use MPIControllerLauncher"""
520 def __init__(self, *args, **kwargs):
519 def __init__(self, *args, **kwargs):
521 super(MPIExecControllerLauncher, self).__init__(*args, **kwargs)
520 super(MPIExecControllerLauncher, self).__init__(*args, **kwargs)
522 self.warn()
521 self.warn()
523
522
524 class MPIExecEngineSetLauncher(MPIEngineSetLauncher, DeprecatedMPILauncher):
523 class MPIExecEngineSetLauncher(MPIEngineSetLauncher, DeprecatedMPILauncher):
525 """Deprecated, use MPIEngineSetLauncher"""
524 """Deprecated, use MPIEngineSetLauncher"""
526 def __init__(self, *args, **kwargs):
525 def __init__(self, *args, **kwargs):
527 super(MPIExecEngineSetLauncher, self).__init__(*args, **kwargs)
526 super(MPIExecEngineSetLauncher, self).__init__(*args, **kwargs)
528 self.warn()
527 self.warn()
529
528
530
529
531 #-----------------------------------------------------------------------------
530 #-----------------------------------------------------------------------------
532 # SSH launchers
531 # SSH launchers
533 #-----------------------------------------------------------------------------
532 #-----------------------------------------------------------------------------
534
533
535 # TODO: Get SSH Launcher back to level of sshx in 0.10.2
534 # TODO: Get SSH Launcher back to level of sshx in 0.10.2
536
535
537 class SSHLauncher(LocalProcessLauncher):
536 class SSHLauncher(LocalProcessLauncher):
538 """A minimal launcher for ssh.
537 """A minimal launcher for ssh.
539
538
540 To be useful this will probably have to be extended to use the ``sshx``
539 To be useful this will probably have to be extended to use the ``sshx``
541 idea for environment variables. There could be other things this needs
540 idea for environment variables. There could be other things this needs
542 as well.
541 as well.
543 """
542 """
544
543
545 ssh_cmd = List(['ssh'], config=True,
544 ssh_cmd = List(['ssh'], config=True,
546 help="command for starting ssh")
545 help="command for starting ssh")
547 ssh_args = List(['-tt'], config=True,
546 ssh_args = List(['-tt'], config=True,
548 help="args to pass to ssh")
547 help="args to pass to ssh")
549 scp_cmd = List(['scp'], config=True,
548 scp_cmd = List(['scp'], config=True,
550 help="command for sending files")
549 help="command for sending files")
551 program = List(['date'],
550 program = List(['date'],
552 help="Program to launch via ssh")
551 help="Program to launch via ssh")
553 program_args = List([],
552 program_args = List([],
554 help="args to pass to remote program")
553 help="args to pass to remote program")
555 hostname = Unicode('', config=True,
554 hostname = Unicode('', config=True,
556 help="hostname on which to launch the program")
555 help="hostname on which to launch the program")
557 user = Unicode('', config=True,
556 user = Unicode('', config=True,
558 help="username for ssh")
557 help="username for ssh")
559 location = Unicode('', config=True,
558 location = Unicode('', config=True,
560 help="user@hostname location for ssh in one setting")
559 help="user@hostname location for ssh in one setting")
561 to_fetch = List([], config=True,
560 to_fetch = List([], config=True,
562 help="List of (remote, local) files to fetch after starting")
561 help="List of (remote, local) files to fetch after starting")
563 to_send = List([], config=True,
562 to_send = List([], config=True,
564 help="List of (local, remote) files to send before starting")
563 help="List of (local, remote) files to send before starting")
565
564
566 def _hostname_changed(self, name, old, new):
565 def _hostname_changed(self, name, old, new):
567 if self.user:
566 if self.user:
568 self.location = u'%s@%s' % (self.user, new)
567 self.location = u'%s@%s' % (self.user, new)
569 else:
568 else:
570 self.location = new
569 self.location = new
571
570
572 def _user_changed(self, name, old, new):
571 def _user_changed(self, name, old, new):
573 self.location = u'%s@%s' % (new, self.hostname)
572 self.location = u'%s@%s' % (new, self.hostname)
574
573
575 def find_args(self):
574 def find_args(self):
576 return self.ssh_cmd + self.ssh_args + [self.location] + \
575 return self.ssh_cmd + self.ssh_args + [self.location] + \
577 list(map(pipes.quote, self.program + self.program_args))
576 list(map(pipes.quote, self.program + self.program_args))
578
577
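`find_args` shell-quotes only the remote command: the local `ssh` argv is passed as a list, but `program + program_args` must survive the remote shell, hence `pipes.quote`. Roughly, with illustrative values:

    # program = ['ipengine'], program_args = ['--profile-dir', 'x y/profile']
    # find_args() -> ['ssh', '-tt', 'user@host', 'ipengine',
    #                 '--profile-dir', "'x y/profile'"]
    # (only arguments that need it, like the space-containing path, get quoted)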
579 def _send_file(self, local, remote):
578 def _send_file(self, local, remote):
580 """send a single file"""
579 """send a single file"""
581 full_remote = "%s:%s" % (self.location, remote)
580 full_remote = "%s:%s" % (self.location, remote)
582 for i in range(10):
581 for i in range(10):
583 if not os.path.exists(local):
582 if not os.path.exists(local):
584 self.log.debug("waiting for %s" % local)
583 self.log.debug("waiting for %s" % local)
585 time.sleep(1)
584 time.sleep(1)
586 else:
585 else:
587 break
586 break
588 remote_dir = os.path.dirname(remote)
587 remote_dir = os.path.dirname(remote)
589 self.log.info("ensuring remote %s:%s/ exists", self.location, remote_dir)
588 self.log.info("ensuring remote %s:%s/ exists", self.location, remote_dir)
590 check_output(self.ssh_cmd + self.ssh_args + \
589 check_output(self.ssh_cmd + self.ssh_args + \
591 [self.location, 'mkdir', '-p', '--', remote_dir]
590 [self.location, 'mkdir', '-p', '--', remote_dir]
592 )
591 )
593 self.log.info("sending %s to %s", local, full_remote)
592 self.log.info("sending %s to %s", local, full_remote)
594 check_output(self.scp_cmd + [local, full_remote])
593 check_output(self.scp_cmd + [local, full_remote])
595
594
596 def send_files(self):
595 def send_files(self):
597 """send our files (called before start)"""
596 """send our files (called before start)"""
598 if not self.to_send:
597 if not self.to_send:
599 return
598 return
600 for local_file, remote_file in self.to_send:
599 for local_file, remote_file in self.to_send:
601 self._send_file(local_file, remote_file)
600 self._send_file(local_file, remote_file)
602
601
603 def _fetch_file(self, remote, local):
602 def _fetch_file(self, remote, local):
604 """fetch a single file"""
603 """fetch a single file"""
605 full_remote = "%s:%s" % (self.location, remote)
604 full_remote = "%s:%s" % (self.location, remote)
606 self.log.info("fetching %s from %s", local, full_remote)
605 self.log.info("fetching %s from %s", local, full_remote)
607 for i in range(10):
606 for i in range(10):
608 # wait up to 10s for remote file to exist
607 # wait up to 10s for remote file to exist
609 check = check_output(self.ssh_cmd + self.ssh_args + \
608 check = check_output(self.ssh_cmd + self.ssh_args + \
610 [self.location, 'test -e', remote, "&& echo 'yes' || echo 'no'"])
609 [self.location, 'test -e', remote, "&& echo 'yes' || echo 'no'"])
611 check = check.decode(DEFAULT_ENCODING, 'replace').strip()
610 check = check.decode(DEFAULT_ENCODING, 'replace').strip()
612 if check == u'no':
611 if check == u'no':
613 time.sleep(1)
612 time.sleep(1)
614 elif check == u'yes':
613 elif check == u'yes':
615 break
614 break
616 local_dir = os.path.dirname(local)
615 local_dir = os.path.dirname(local)
617 ensure_dir_exists(local_dir, 0o775)  # mode must be octal
616 ensure_dir_exists(local_dir, 0o775)  # mode must be octal
618 check_output(self.scp_cmd + [full_remote, local])
617 check_output(self.scp_cmd + [full_remote, local])
619
618
620 def fetch_files(self):
619 def fetch_files(self):
621 """fetch remote files (called after start)"""
620 """fetch remote files (called after start)"""
622 if not self.to_fetch:
621 if not self.to_fetch:
623 return
622 return
624 for remote_file, local_file in self.to_fetch:
623 for remote_file, local_file in self.to_fetch:
625 self._fetch_file(remote_file, local_file)
624 self._fetch_file(remote_file, local_file)
626
625
627 def start(self, hostname=None, user=None):
626 def start(self, hostname=None, user=None):
628 if hostname is not None:
627 if hostname is not None:
629 self.hostname = hostname
628 self.hostname = hostname
630 if user is not None:
629 if user is not None:
631 self.user = user
630 self.user = user
632
631
633 self.send_files()
632 self.send_files()
634 super(SSHLauncher, self).start()
633 super(SSHLauncher, self).start()
635 self.fetch_files()
634 self.fetch_files()
636
635
637 def signal(self, sig):
636 def signal(self, sig):
638 if self.state == 'running':
637 if self.state == 'running':
639 # send escaped ssh connection-closer
638 # send escaped ssh connection-closer
640 self.process.stdin.write('~.')
639 self.process.stdin.write('~.')
641 self.process.stdin.flush()
640 self.process.stdin.flush()
642
641
643 class SSHClusterLauncher(SSHLauncher, ClusterAppMixin):
642 class SSHClusterLauncher(SSHLauncher, ClusterAppMixin):
644
643
645 remote_profile_dir = Unicode('', config=True,
644 remote_profile_dir = Unicode('', config=True,
646 help="""The remote profile_dir to use.
645 help="""The remote profile_dir to use.
647
646
648 If not specified, use calling profile, stripping out possible leading homedir.
647 If not specified, use calling profile, stripping out possible leading homedir.
649 """)
648 """)
650
649
651 def _profile_dir_changed(self, name, old, new):
650 def _profile_dir_changed(self, name, old, new):
652 if not self.remote_profile_dir:
651 if not self.remote_profile_dir:
653 # trigger remote_profile_dir_default logic again,
652 # trigger remote_profile_dir_default logic again,
654 # in case it was already triggered before profile_dir was set
653 # in case it was already triggered before profile_dir was set
655 self.remote_profile_dir = self._strip_home(new)
654 self.remote_profile_dir = self._strip_home(new)
656
655
657 @staticmethod
656 @staticmethod
658 def _strip_home(path):
657 def _strip_home(path):
659 """turns /home/you/.ipython/profile_foo into .ipython/profile_foo"""
658 """turns /home/you/.ipython/profile_foo into .ipython/profile_foo"""
660 home = get_home_dir()
659 home = get_home_dir()
661 if not home.endswith('/'):
660 if not home.endswith('/'):
662 home = home+'/'
661 home = home+'/'
663
662
664 if path.startswith(home):
663 if path.startswith(home):
665 return path[len(home):]
664 return path[len(home):]
666 else:
665 else:
667 return path
666 return path
668
667
669 def _remote_profile_dir_default(self):
668 def _remote_profile_dir_default(self):
670 return self._strip_home(self.profile_dir)
669 return self._strip_home(self.profile_dir)
671
670
672 def _cluster_id_changed(self, name, old, new):
671 def _cluster_id_changed(self, name, old, new):
673 if new:
672 if new:
674 raise ValueError("cluster id not supported by SSH launchers")
673 raise ValueError("cluster id not supported by SSH launchers")
675
674
676 @property
675 @property
677 def cluster_args(self):
676 def cluster_args(self):
678 return ['--profile-dir', self.remote_profile_dir]
677 return ['--profile-dir', self.remote_profile_dir]
679
678
680 class SSHControllerLauncher(SSHClusterLauncher, ControllerMixin):
679 class SSHControllerLauncher(SSHClusterLauncher, ControllerMixin):
681
680
682 # alias back to *non-configurable* program[_args] for use in find_args()
681 # alias back to *non-configurable* program[_args] for use in find_args()
683 # this way all Controller/EngineSetLaunchers have the same form, rather
682 # this way all Controller/EngineSetLaunchers have the same form, rather
684 # than *some* having `program_args` and others `controller_args`
683 # than *some* having `program_args` and others `controller_args`
685
684
686 def _controller_cmd_default(self):
685 def _controller_cmd_default(self):
687 return ['ipcontroller']
686 return ['ipcontroller']
688
687
689 @property
688 @property
690 def program(self):
689 def program(self):
691 return self.controller_cmd
690 return self.controller_cmd
692
691
693 @property
692 @property
694 def program_args(self):
693 def program_args(self):
695 return self.cluster_args + self.controller_args
694 return self.cluster_args + self.controller_args
696
695
697 def _to_fetch_default(self):
696 def _to_fetch_default(self):
698 return [
697 return [
699 (os.path.join(self.remote_profile_dir, 'security', cf),
698 (os.path.join(self.remote_profile_dir, 'security', cf),
700 os.path.join(self.profile_dir, 'security', cf),)
699 os.path.join(self.profile_dir, 'security', cf),)
701 for cf in ('ipcontroller-client.json', 'ipcontroller-engine.json')
700 for cf in ('ipcontroller-client.json', 'ipcontroller-engine.json')
702 ]
701 ]
703
702
704 class SSHEngineLauncher(SSHClusterLauncher, EngineMixin):
703 class SSHEngineLauncher(SSHClusterLauncher, EngineMixin):
705
704
706 # alias back to *non-configurable* program[_args] for use in find_args()
705 # alias back to *non-configurable* program[_args] for use in find_args()
707 # this way all Controller/EngineSetLaunchers have the same form, rather
706 # this way all Controller/EngineSetLaunchers have the same form, rather
708 # than *some* having `program_args` and others `controller_args`
707 # than *some* having `program_args` and others `controller_args`
709
708
710 def _engine_cmd_default(self):
709 def _engine_cmd_default(self):
711 return ['ipengine']
710 return ['ipengine']
712
711
713 @property
712 @property
714 def program(self):
713 def program(self):
715 return self.engine_cmd
714 return self.engine_cmd
716
715
717 @property
716 @property
718 def program_args(self):
717 def program_args(self):
719 return self.cluster_args + self.engine_args
718 return self.cluster_args + self.engine_args
720
719
721 def _to_send_default(self):
720 def _to_send_default(self):
722 return [
721 return [
723 (os.path.join(self.profile_dir, 'security', cf),
722 (os.path.join(self.profile_dir, 'security', cf),
724 os.path.join(self.remote_profile_dir, 'security', cf))
723 os.path.join(self.remote_profile_dir, 'security', cf))
725 for cf in ('ipcontroller-client.json', 'ipcontroller-engine.json')
724 for cf in ('ipcontroller-client.json', 'ipcontroller-engine.json')
726 ]
725 ]
727
726
728
727
729 class SSHEngineSetLauncher(LocalEngineSetLauncher):
728 class SSHEngineSetLauncher(LocalEngineSetLauncher):
730 launcher_class = SSHEngineLauncher
729 launcher_class = SSHEngineLauncher
731 engines = Dict(config=True,
730 engines = Dict(config=True,
732 help="""dict of engines to launch. This is a dict by hostname of ints,
731 help="""dict of engines to launch. This is a dict by hostname of ints,
733 corresponding to the number of engines to start on that host.""")
732 corresponding to the number of engines to start on that host.""")
734
733
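A minimal sketch of this dict in an ipcluster_config.py; hostnames and paths below are placeholders. A value may be a plain int, or a (count, extra-args) tuple, as handled by engine_count and start below:

c = get_config()
c.IPClusterEngines.engine_launcher_class = 'SSH'
c.SSHEngineSetLauncher.engines = {
    'host1.example.com': 2,                                   # 2 engines, default args
    'user@host2.example.com': (4, ['--work-dir=/scratch']),   # count plus extra engine args
}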
735 def _engine_cmd_default(self):
734 def _engine_cmd_default(self):
736 return ['ipengine']
735 return ['ipengine']
737
736
738 @property
737 @property
739 def engine_count(self):
738 def engine_count(self):
740 """determine engine count from `engines` dict"""
739 """determine engine count from `engines` dict"""
741 count = 0
740 count = 0
742 for n in itervalues(self.engines):
741 for n in itervalues(self.engines):
743 if isinstance(n, (tuple,list)):
742 if isinstance(n, (tuple,list)):
744 n,args = n
743 n,args = n
745 count += n
744 count += n
746 return count
745 return count
747
746
748 def start(self, n):
747 def start(self, n):
749 """Start engines by profile or profile_dir.
748 """Start engines by profile or profile_dir.
750 `n` is ignored, and the `engines` config property is used instead.
749 `n` is ignored, and the `engines` config property is used instead.
751 """
750 """
752
751
753 dlist = []
752 dlist = []
754 for host, n in iteritems(self.engines):
753 for host, n in iteritems(self.engines):
755 if isinstance(n, (tuple, list)):
754 if isinstance(n, (tuple, list)):
756 n, args = n
755 n, args = n
757 else:
756 else:
758 args = copy.deepcopy(self.engine_args)
757 args = copy.deepcopy(self.engine_args)
759
758
760 if '@' in host:
759 if '@' in host:
761 user,host = host.split('@',1)
760 user,host = host.split('@',1)
762 else:
761 else:
763 user=None
762 user=None
764 for i in range(n):
763 for i in range(n):
765 if i > 0:
764 if i > 0:
766 time.sleep(self.delay)
765 time.sleep(self.delay)
767 el = self.launcher_class(work_dir=self.work_dir, parent=self, log=self.log,
766 el = self.launcher_class(work_dir=self.work_dir, parent=self, log=self.log,
768 profile_dir=self.profile_dir, cluster_id=self.cluster_id,
767 profile_dir=self.profile_dir, cluster_id=self.cluster_id,
769 )
768 )
770 if i > 0:
769 if i > 0:
771 # only send files for the first engine on each host
770 # only send files for the first engine on each host
772 el.to_send = []
771 el.to_send = []
773
772
774 # Copy the engine args over to each engine launcher.
773 # Copy the engine args over to each engine launcher.
775 el.engine_cmd = self.engine_cmd
774 el.engine_cmd = self.engine_cmd
776 el.engine_args = args
775 el.engine_args = args
777 el.on_stop(self._notice_engine_stopped)
776 el.on_stop(self._notice_engine_stopped)
778 d = el.start(user=user, hostname=host)
777 d = el.start(user=user, hostname=host)
779 self.launchers[ "%s/%i" % (host,i) ] = el
778 self.launchers[ "%s/%i" % (host,i) ] = el
780 dlist.append(d)
779 dlist.append(d)
781 self.notify_start(dlist)
780 self.notify_start(dlist)
782 return dlist
781 return dlist
783
782
784
783
785 class SSHProxyEngineSetLauncher(SSHClusterLauncher):
784 class SSHProxyEngineSetLauncher(SSHClusterLauncher):
786 """Launcher for calling
785 """Launcher for calling
787 `ipcluster engines` on a remote machine.
786 `ipcluster engines` on a remote machine.
788
787
789 Requires that the remote profile is already configured.
788 Requires that the remote profile is already configured.
790 """
789 """
791
790
792 n = Integer()
791 n = Integer()
793 ipcluster_cmd = List(['ipcluster'], config=True)
792 ipcluster_cmd = List(['ipcluster'], config=True)
794
793
795 @property
794 @property
796 def program(self):
795 def program(self):
797 return self.ipcluster_cmd + ['engines']
796 return self.ipcluster_cmd + ['engines']
798
797
799 @property
798 @property
800 def program_args(self):
799 def program_args(self):
801 return ['-n', str(self.n), '--profile-dir', self.remote_profile_dir]
800 return ['-n', str(self.n), '--profile-dir', self.remote_profile_dir]
802
801
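As an illustration with hypothetical values: for start(8) with a remote_profile_dir of '.ipython/profile_default', the program and program_args above combine into a remote command along the lines of

    ipcluster engines -n 8 --profile-dir .ipython/profile_default

which the inherited SSHLauncher machinery runs over ssh on the target host.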
803 def _to_send_default(self):
802 def _to_send_default(self):
804 return [
803 return [
805 (os.path.join(self.profile_dir, 'security', cf),
804 (os.path.join(self.profile_dir, 'security', cf),
806 os.path.join(self.remote_profile_dir, 'security', cf))
805 os.path.join(self.remote_profile_dir, 'security', cf))
807 for cf in ('ipcontroller-client.json', 'ipcontroller-engine.json')
806 for cf in ('ipcontroller-client.json', 'ipcontroller-engine.json')
808 ]
807 ]
809
808
810 def start(self, n):
809 def start(self, n):
811 self.n = n
810 self.n = n
812 super(SSHProxyEngineSetLauncher, self).start()
811 super(SSHProxyEngineSetLauncher, self).start()
813
812
814
813
815 #-----------------------------------------------------------------------------
814 #-----------------------------------------------------------------------------
816 # Windows HPC Server 2008 scheduler launchers
815 # Windows HPC Server 2008 scheduler launchers
817 #-----------------------------------------------------------------------------
816 #-----------------------------------------------------------------------------
818
817
819
818
820 # This is only used on Windows.
819 # This is only used on Windows.
821 def find_job_cmd():
820 def find_job_cmd():
822 if WINDOWS:
821 if WINDOWS:
823 try:
822 try:
824 return find_cmd('job')
823 return find_cmd('job')
825 except (FindCmdError, ImportError):
824 except (FindCmdError, ImportError):
826 # ImportError will be raised if win32api is not installed
825 # ImportError will be raised if win32api is not installed
827 return 'job'
826 return 'job'
828 else:
827 else:
829 return 'job'
828 return 'job'
830
829
831
830
832 class WindowsHPCLauncher(BaseLauncher):
831 class WindowsHPCLauncher(BaseLauncher):
833
832
834 job_id_regexp = CRegExp(r'\d+', config=True,
833 job_id_regexp = CRegExp(r'\d+', config=True,
835 help="""A regular expression used to get the job id from the output of the
834 help="""A regular expression used to get the job id from the output of the
836 submit_command. """
835 submit_command. """
837 )
836 )
838 job_file_name = Unicode(u'ipython_job.xml', config=True,
837 job_file_name = Unicode(u'ipython_job.xml', config=True,
839 help="The filename of the instantiated job script.")
838 help="The filename of the instantiated job script.")
840 # The full path to the instantiated job script. This gets made dynamically
839 # The full path to the instantiated job script. This gets made dynamically
841 # by combining the work_dir with the job_file_name.
840 # by combining the work_dir with the job_file_name.
842 job_file = Unicode(u'')
841 job_file = Unicode(u'')
843 scheduler = Unicode('', config=True,
842 scheduler = Unicode('', config=True,
844 help="The hostname of the scheduler to submit the job to.")
843 help="The hostname of the scheduler to submit the job to.")
845 job_cmd = Unicode(find_job_cmd(), config=True,
844 job_cmd = Unicode(find_job_cmd(), config=True,
846 help="The command for submitting jobs.")
845 help="The command for submitting jobs.")
847
846
848 def __init__(self, work_dir=u'.', config=None, **kwargs):
847 def __init__(self, work_dir=u'.', config=None, **kwargs):
849 super(WindowsHPCLauncher, self).__init__(
848 super(WindowsHPCLauncher, self).__init__(
850 work_dir=work_dir, config=config, **kwargs
849 work_dir=work_dir, config=config, **kwargs
851 )
850 )
852
851
853 @property
852 @property
854 def job_file(self):
853 def job_file(self):
855 return os.path.join(self.work_dir, self.job_file_name)
854 return os.path.join(self.work_dir, self.job_file_name)
856
855
857 def write_job_file(self, n):
856 def write_job_file(self, n):
858 raise NotImplementedError("Implement write_job_file in a subclass.")
857 raise NotImplementedError("Implement write_job_file in a subclass.")
859
858
860 def find_args(self):
859 def find_args(self):
861 return [u'job.exe']
860 return [u'job.exe']
862
861
863 def parse_job_id(self, output):
862 def parse_job_id(self, output):
864 """Take the output of the submit command and return the job id."""
863 """Take the output of the submit command and return the job id."""
865 m = self.job_id_regexp.search(output)
864 m = self.job_id_regexp.search(output)
866 if m is not None:
865 if m is not None:
867 job_id = m.group()
866 job_id = m.group()
868 else:
867 else:
869 raise LauncherError("Job id couldn't be determined: %s" % output)
868 raise LauncherError("Job id couldn't be determined: %s" % output)
870 self.job_id = job_id
869 self.job_id = job_id
871 self.log.info('Job started with id: %r', job_id)
870 self.log.info('Job started with id: %r', job_id)
872 return job_id
871 return job_id
873
872
874 def start(self, n):
873 def start(self, n):
875 """Start n copies of the process using the Win HPC job scheduler."""
874 """Start n copies of the process using the Win HPC job scheduler."""
876 self.write_job_file(n)
875 self.write_job_file(n)
877 args = [
876 args = [
878 'submit',
877 'submit',
879 '/jobfile:%s' % self.job_file,
878 '/jobfile:%s' % self.job_file,
880 '/scheduler:%s' % self.scheduler
879 '/scheduler:%s' % self.scheduler
881 ]
880 ]
882 self.log.debug("Starting Win HPC Job: %s" % (self.job_cmd + ' ' + ' '.join(args),))
881 self.log.debug("Starting Win HPC Job: %s" % (self.job_cmd + ' ' + ' '.join(args),))
883
882
884 output = check_output([self.job_cmd]+args,
883 output = check_output([self.job_cmd]+args,
885 env=os.environ,
884 env=os.environ,
886 cwd=self.work_dir,
885 cwd=self.work_dir,
887 stderr=STDOUT
886 stderr=STDOUT
888 )
887 )
889 output = output.decode(DEFAULT_ENCODING, 'replace')
888 output = output.decode(DEFAULT_ENCODING, 'replace')
890 job_id = self.parse_job_id(output)
889 job_id = self.parse_job_id(output)
891 self.notify_start(job_id)
890 self.notify_start(job_id)
892 return job_id
891 return job_id
893
892
894 def stop(self):
893 def stop(self):
895 args = [
894 args = [
896 'cancel',
895 'cancel',
897 self.job_id,
896 self.job_id,
898 '/scheduler:%s' % self.scheduler
897 '/scheduler:%s' % self.scheduler
899 ]
898 ]
900 self.log.info("Stopping Win HPC Job: %s" % (self.job_cmd + ' ' + ' '.join(args),))
899 self.log.info("Stopping Win HPC Job: %s" % (self.job_cmd + ' ' + ' '.join(args),))
901 try:
900 try:
902 output = check_output([self.job_cmd]+args,
901 output = check_output([self.job_cmd]+args,
903 env=os.environ,
902 env=os.environ,
904 cwd=self.work_dir,
903 cwd=self.work_dir,
905 stderr=STDOUT
904 stderr=STDOUT
906 )
905 )
907 output = output.decode(DEFAULT_ENCODING, 'replace')
906 output = output.decode(DEFAULT_ENCODING, 'replace')
908 except:
907 except:
909 output = u'The job already appears to be stopped: %r' % self.job_id
908 output = u'The job already appears to be stopped: %r' % self.job_id
910 self.notify_stop(dict(job_id=self.job_id, output=output)) # Pass the output of the kill cmd
909 self.notify_stop(dict(job_id=self.job_id, output=output)) # Pass the output of the kill cmd
911 return output
910 return output
912
911
913
912
914 class WindowsHPCControllerLauncher(WindowsHPCLauncher, ClusterAppMixin):
913 class WindowsHPCControllerLauncher(WindowsHPCLauncher, ClusterAppMixin):
915
914
916 job_file_name = Unicode(u'ipcontroller_job.xml', config=True,
915 job_file_name = Unicode(u'ipcontroller_job.xml', config=True,
917 help="WinHPC xml job file.")
916 help="WinHPC xml job file.")
918 controller_args = List([], config=False,
917 controller_args = List([], config=False,
919 help="extra args to pass to ipcontroller")
918 help="extra args to pass to ipcontroller")
920
919
921 def write_job_file(self, n):
920 def write_job_file(self, n):
922 job = IPControllerJob(parent=self)
921 job = IPControllerJob(parent=self)
923
922
924 t = IPControllerTask(parent=self)
923 t = IPControllerTask(parent=self)
925 # The task's work directory is *not* the actual work directory of
924 # The task's work directory is *not* the actual work directory of
926 # the controller. It is used as the base path for the stdout/stderr
925 # the controller. It is used as the base path for the stdout/stderr
927 # files that the scheduler redirects to.
926 # files that the scheduler redirects to.
928 t.work_directory = self.profile_dir
927 t.work_directory = self.profile_dir
929 # Add the profile_dir and cluster_id arguments, plus any extra controller args.
928 # Add the profile_dir and cluster_id arguments, plus any extra controller args.
930 t.controller_args.extend(self.cluster_args)
929 t.controller_args.extend(self.cluster_args)
931 t.controller_args.extend(self.controller_args)
930 t.controller_args.extend(self.controller_args)
932 job.add_task(t)
931 job.add_task(t)
933
932
934 self.log.debug("Writing job description file: %s", self.job_file)
933 self.log.debug("Writing job description file: %s", self.job_file)
935 job.write(self.job_file)
934 job.write(self.job_file)
936
935
937 @property
936 @property
938 def job_file(self):
937 def job_file(self):
939 return os.path.join(self.profile_dir, self.job_file_name)
938 return os.path.join(self.profile_dir, self.job_file_name)
940
939
941 def start(self):
940 def start(self):
942 """Start the controller by profile_dir."""
941 """Start the controller by profile_dir."""
943 return super(WindowsHPCControllerLauncher, self).start(1)
942 return super(WindowsHPCControllerLauncher, self).start(1)
944
943
945
944
946 class WindowsHPCEngineSetLauncher(WindowsHPCLauncher, ClusterAppMixin):
945 class WindowsHPCEngineSetLauncher(WindowsHPCLauncher, ClusterAppMixin):
947
946
948 job_file_name = Unicode(u'ipengineset_job.xml', config=True,
947 job_file_name = Unicode(u'ipengineset_job.xml', config=True,
949 help="jobfile for ipengines job")
948 help="jobfile for ipengines job")
950 engine_args = List([], config=False,
949 engine_args = List([], config=False,
951 help="extra args to pas to ipengine")
950 help="extra args to pas to ipengine")
952
951
953 def write_job_file(self, n):
952 def write_job_file(self, n):
954 job = IPEngineSetJob(parent=self)
953 job = IPEngineSetJob(parent=self)
955
954
956 for i in range(n):
955 for i in range(n):
957 t = IPEngineTask(parent=self)
956 t = IPEngineTask(parent=self)
958 # The task's work directory is *not* the actual work directory of
957 # The task's work directory is *not* the actual work directory of
959 # the engine. It is used as the base path for the stdout/stderr
958 # the engine. It is used as the base path for the stdout/stderr
960 # files that the scheduler redirects to.
959 # files that the scheduler redirects to.
961 t.work_directory = self.profile_dir
960 t.work_directory = self.profile_dir
962 # Add the profile_dir and cluster_id arguments, plus any extra engine args.
961 # Add the profile_dir and cluster_id arguments, plus any extra engine args.
963 t.engine_args.extend(self.cluster_args)
962 t.engine_args.extend(self.cluster_args)
964 t.engine_args.extend(self.engine_args)
963 t.engine_args.extend(self.engine_args)
965 job.add_task(t)
964 job.add_task(t)
966
965
967 self.log.debug("Writing job description file: %s", self.job_file)
966 self.log.debug("Writing job description file: %s", self.job_file)
968 job.write(self.job_file)
967 job.write(self.job_file)
969
968
970 @property
969 @property
971 def job_file(self):
970 def job_file(self):
972 return os.path.join(self.profile_dir, self.job_file_name)
971 return os.path.join(self.profile_dir, self.job_file_name)
973
972
974 def start(self, n):
973 def start(self, n):
975 """Start the controller by profile_dir."""
974 """Start the controller by profile_dir."""
976 return super(WindowsHPCEngineSetLauncher, self).start(n)
975 return super(WindowsHPCEngineSetLauncher, self).start(n)
977
976
978
977
979 #-----------------------------------------------------------------------------
978 #-----------------------------------------------------------------------------
980 # Batch (PBS) system launchers
979 # Batch (PBS) system launchers
981 #-----------------------------------------------------------------------------
980 #-----------------------------------------------------------------------------
982
981
983 class BatchClusterAppMixin(ClusterAppMixin):
982 class BatchClusterAppMixin(ClusterAppMixin):
984 """ClusterApp mixin that updates the self.context dict, rather than cl-args."""
983 """ClusterApp mixin that updates the self.context dict, rather than cl-args."""
985 def _profile_dir_changed(self, name, old, new):
984 def _profile_dir_changed(self, name, old, new):
986 self.context[name] = new
985 self.context[name] = new
987 _cluster_id_changed = _profile_dir_changed
986 _cluster_id_changed = _profile_dir_changed
988
987
989 def _profile_dir_default(self):
988 def _profile_dir_default(self):
990 self.context['profile_dir'] = ''
989 self.context['profile_dir'] = ''
991 return ''
990 return ''
992 def _cluster_id_default(self):
991 def _cluster_id_default(self):
993 self.context['cluster_id'] = ''
992 self.context['cluster_id'] = ''
994 return ''
993 return ''
995
994
996
995
997 class BatchSystemLauncher(BaseLauncher):
996 class BatchSystemLauncher(BaseLauncher):
998 """Launch an external process using a batch system.
997 """Launch an external process using a batch system.
999
998
1000 This class is designed to work with UNIX batch systems like PBS, LSF,
999 This class is designed to work with UNIX batch systems like PBS, LSF,
1001 GridEngine, etc. The overall model is that there are different commands
1000 GridEngine, etc. The overall model is that there are different commands
1002 like qsub, qdel, etc. that handle the starting and stopping of the process.
1001 like qsub, qdel, etc. that handle the starting and stopping of the process.
1003
1002
1004 This class also has the notion of a batch script. The ``batch_template``
1003 This class also has the notion of a batch script. The ``batch_template``
1005 attribute can be set to a string that is a template for the batch script.
1004 attribute can be set to a string that is a template for the batch script.
1006 This template is instantiated using string formatting. Thus the template can
1005 This template is instantiated using string formatting. Thus the template can
1007 use {n} for the number of instances. Subclasses can add additional variables
1006 use {n} for the number of instances. Subclasses can add additional variables
1008 to the template dict.
1007 to the template dict.
1009 """
1008 """
1010
1009
1011 # Subclasses must fill these in. See PBSEngineSet
1010 # Subclasses must fill these in. See PBSEngineSet
1012 submit_command = List([''], config=True,
1011 submit_command = List([''], config=True,
1013 help="The name of the command line program used to submit jobs.")
1012 help="The name of the command line program used to submit jobs.")
1014 delete_command = List([''], config=True,
1013 delete_command = List([''], config=True,
1015 help="The name of the command line program used to delete jobs.")
1014 help="The name of the command line program used to delete jobs.")
1016 job_id_regexp = CRegExp('', config=True,
1015 job_id_regexp = CRegExp('', config=True,
1017 help="""A regular expression used to get the job id from the output of the
1016 help="""A regular expression used to get the job id from the output of the
1018 submit_command.""")
1017 submit_command.""")
1019 job_id_regexp_group = Integer(0, config=True,
1018 job_id_regexp_group = Integer(0, config=True,
1020 help="""The group we wish to match in job_id_regexp (0 to match all)""")
1019 help="""The group we wish to match in job_id_regexp (0 to match all)""")
1021 batch_template = Unicode('', config=True,
1020 batch_template = Unicode('', config=True,
1022 help="The string that is the batch script template itself.")
1021 help="The string that is the batch script template itself.")
1023 batch_template_file = Unicode(u'', config=True,
1022 batch_template_file = Unicode(u'', config=True,
1024 help="The file that contains the batch template.")
1023 help="The file that contains the batch template.")
1025 batch_file_name = Unicode(u'batch_script', config=True,
1024 batch_file_name = Unicode(u'batch_script', config=True,
1026 help="The filename of the instantiated batch script.")
1025 help="The filename of the instantiated batch script.")
1027 queue = Unicode(u'', config=True,
1026 queue = Unicode(u'', config=True,
1028 help="The PBS Queue.")
1027 help="The PBS Queue.")
1029
1028
1030 def _queue_changed(self, name, old, new):
1029 def _queue_changed(self, name, old, new):
1031 self.context[name] = new
1030 self.context[name] = new
1032
1031
1033 n = Integer(1)
1032 n = Integer(1)
1034 _n_changed = _queue_changed
1033 _n_changed = _queue_changed
1035
1034
1036 # not configurable, override in subclasses
1035 # not configurable, override in subclasses
1037 # PBS Job Array regex
1036 # PBS Job Array regex
1038 job_array_regexp = CRegExp('')
1037 job_array_regexp = CRegExp('')
1039 job_array_template = Unicode('')
1038 job_array_template = Unicode('')
1040 # PBS Queue regex
1039 # PBS Queue regex
1041 queue_regexp = CRegExp('')
1040 queue_regexp = CRegExp('')
1042 queue_template = Unicode('')
1041 queue_template = Unicode('')
1043 # The default batch template, override in subclasses
1042 # The default batch template, override in subclasses
1044 default_template = Unicode('')
1043 default_template = Unicode('')
1045 # The full path to the instantiated batch script.
1044 # The full path to the instantiated batch script.
1046 batch_file = Unicode(u'')
1045 batch_file = Unicode(u'')
1047 # the format dict used with batch_template:
1046 # the format dict used with batch_template:
1048 context = Dict()
1047 context = Dict()
1049
1048
1050 def _context_default(self):
1049 def _context_default(self):
1051 """load the default context with the default values for the basic keys
1050 """load the default context with the default values for the basic keys
1052
1051
1053 because the _trait_changed methods only load the context if they
1052 because the _trait_changed methods only load the context if they
1054 are set to something other than the default value.
1053 are set to something other than the default value.
1055 """
1054 """
1056 return dict(n=1, queue=u'', profile_dir=u'', cluster_id=u'')
1055 return dict(n=1, queue=u'', profile_dir=u'', cluster_id=u'')
1057
1056
1058 # the Formatter instance for rendering the templates:
1057 # the Formatter instance for rendering the templates:
1059 formatter = Instance(EvalFormatter, (), {})
1058 formatter = Instance(EvalFormatter, (), {})
1060
1059
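The formatter above is IPython's EvalFormatter, a str.format-style formatter that can also evaluate simple Python expressions inside the braces; a quick standalone sketch (the PBS resource line is a hypothetical example):

from IPython.utils.text import EvalFormatter

f = EvalFormatter()
f.format('{n} engines', n=4)                 # -> '4 engines'
f.format('#PBS -l nodes={n//4}:ppn=4', n=8)  # -> '#PBS -l nodes=2:ppn=4'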
1061 def find_args(self):
1060 def find_args(self):
1062 return self.submit_command + [self.batch_file]
1061 return self.submit_command + [self.batch_file]
1063
1062
1064 def __init__(self, work_dir=u'.', config=None, **kwargs):
1063 def __init__(self, work_dir=u'.', config=None, **kwargs):
1065 super(BatchSystemLauncher, self).__init__(
1064 super(BatchSystemLauncher, self).__init__(
1066 work_dir=work_dir, config=config, **kwargs
1065 work_dir=work_dir, config=config, **kwargs
1067 )
1066 )
1068 self.batch_file = os.path.join(self.work_dir, self.batch_file_name)
1067 self.batch_file = os.path.join(self.work_dir, self.batch_file_name)
1069
1068
1070 def parse_job_id(self, output):
1069 def parse_job_id(self, output):
1071 """Take the output of the submit command and return the job id."""
1070 """Take the output of the submit command and return the job id."""
1072 m = self.job_id_regexp.search(output)
1071 m = self.job_id_regexp.search(output)
1073 if m is not None:
1072 if m is not None:
1074 job_id = m.group(self.job_id_regexp_group)
1073 job_id = m.group(self.job_id_regexp_group)
1075 else:
1074 else:
1076 raise LauncherError("Job id couldn't be determined: %s" % output)
1075 raise LauncherError("Job id couldn't be determined: %s" % output)
1077 self.job_id = job_id
1076 self.job_id = job_id
1078 self.log.info('Job submitted with job id: %r', job_id)
1077 self.log.info('Job submitted with job id: %r', job_id)
1079 return job_id
1078 return job_id
1080
1079
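A sketch of the extraction parse_job_id performs, using the HTCondor defaults defined later in this file (job_id_regexp = r'(\d+)\.$' with job_id_regexp_group = 1); the submit output string is illustrative of typical condor_submit output:

import re

output = "1 job(s) submitted to cluster 42."
m = re.search(r'(\d+)\.$', output)
job_id = m.group(1)  # -> '42'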
1081 def write_batch_script(self, n):
1080 def write_batch_script(self, n):
1082 """Instantiate and write the batch script to the work_dir."""
1081 """Instantiate and write the batch script to the work_dir."""
1083 self.n = n
1082 self.n = n
1084 # first priority is batch_template if set
1083 # first priority is batch_template if set
1085 if self.batch_template_file and not self.batch_template:
1084 if self.batch_template_file and not self.batch_template:
1086 # second priority is batch_template_file
1085 # second priority is batch_template_file
1087 with open(self.batch_template_file) as f:
1086 with open(self.batch_template_file) as f:
1088 self.batch_template = f.read()
1087 self.batch_template = f.read()
1089 if not self.batch_template:
1088 if not self.batch_template:
1090 # third (last) priority is default_template
1089 # third (last) priority is default_template
1091 self.batch_template = self.default_template
1090 self.batch_template = self.default_template
1092 # add jobarray or queue lines to user-specified template
1091 # add jobarray or queue lines to user-specified template
1093 # note that this is *only* when user did not specify a template.
1092 # note that this is *only* when user did not specify a template.
1094 self._insert_queue_in_script()
1093 self._insert_queue_in_script()
1095 self._insert_job_array_in_script()
1094 self._insert_job_array_in_script()
1096 script_as_string = self.formatter.format(self.batch_template, **self.context)
1095 script_as_string = self.formatter.format(self.batch_template, **self.context)
1097 self.log.debug('Writing batch script: %s', self.batch_file)
1096 self.log.debug('Writing batch script: %s', self.batch_file)
1098 with open(self.batch_file, 'w') as f:
1097 with open(self.batch_file, 'w') as f:
1099 f.write(script_as_string)
1098 f.write(script_as_string)
1100 os.chmod(self.batch_file, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
1099 os.chmod(self.batch_file, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
1101
1100
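A minimal sketch of a user-supplied batch_template in an ipcluster_config.py, using the context keys this launcher provides ({n}, {queue}, {profile_dir}, {cluster_id}); the queue name and node layout are hypothetical, and {n//4} relies on the EvalFormatter noted above:

c = get_config()
c.PBSEngineSetLauncher.batch_template = """#!/bin/sh
#PBS -q {queue}
#PBS -l nodes={n//4}:ppn=4
ipengine --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
"""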
1102 def _insert_queue_in_script(self):
1101 def _insert_queue_in_script(self):
1103 """Inserts a queue if required into the batch script.
1102 """Inserts a queue if required into the batch script.
1104 """
1103 """
1105 if self.queue and not self.queue_regexp.search(self.batch_template):
1104 if self.queue and not self.queue_regexp.search(self.batch_template):
1106 self.log.debug("adding PBS queue settings to batch script")
1105 self.log.debug("adding PBS queue settings to batch script")
1107 firstline, rest = self.batch_template.split('\n',1)
1106 firstline, rest = self.batch_template.split('\n',1)
1108 self.batch_template = u'\n'.join([firstline, self.queue_template, rest])
1107 self.batch_template = u'\n'.join([firstline, self.queue_template, rest])
1109
1108
1110 def _insert_job_array_in_script(self):
1109 def _insert_job_array_in_script(self):
1111 """Inserts a job array if required into the batch script.
1110 """Inserts a job array if required into the batch script.
1112 """
1111 """
1113 if not self.job_array_regexp.search(self.batch_template):
1112 if not self.job_array_regexp.search(self.batch_template):
1114 self.log.debug("adding job array settings to batch script")
1113 self.log.debug("adding job array settings to batch script")
1115 firstline, rest = self.batch_template.split('\n',1)
1114 firstline, rest = self.batch_template.split('\n',1)
1116 self.batch_template = u'\n'.join([firstline, self.job_array_template, rest])
1115 self.batch_template = u'\n'.join([firstline, self.job_array_template, rest])
1117
1116
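Both helpers splice their directive in after the first line of the template, so a '#!/bin/sh' shebang stays on top; a standalone illustration of the same split-and-join:

template = '#!/bin/sh\nipengine\n'
firstline, rest = template.split('\n', 1)
patched = '\n'.join([firstline, '#PBS -q {queue}', rest])
# patched == '#!/bin/sh\n#PBS -q {queue}\nipengine\n'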
1118 def start(self, n):
1117 def start(self, n):
1119 """Start n copies of the process using a batch system."""
1118 """Start n copies of the process using a batch system."""
1120 self.log.debug("Starting %s: %r", self.__class__.__name__, self.args)
1119 self.log.debug("Starting %s: %r", self.__class__.__name__, self.args)
1121 # Here we save profile_dir in the context so it
1120 # Here we save profile_dir in the context so it
1122 # can be used in the batch script template as {profile_dir}
1121 # can be used in the batch script template as {profile_dir}
1123 self.write_batch_script(n)
1122 self.write_batch_script(n)
1124 output = check_output(self.args, env=os.environ)
1123 output = check_output(self.args, env=os.environ)
1125 output = output.decode(DEFAULT_ENCODING, 'replace')
1124 output = output.decode(DEFAULT_ENCODING, 'replace')
1126
1125
1127 job_id = self.parse_job_id(output)
1126 job_id = self.parse_job_id(output)
1128 self.notify_start(job_id)
1127 self.notify_start(job_id)
1129 return job_id
1128 return job_id
1130
1129
1131 def stop(self):
1130 def stop(self):
1132 try:
1131 try:
1133 p = Popen(self.delete_command+[self.job_id], env=os.environ,
1132 p = Popen(self.delete_command+[self.job_id], env=os.environ,
1134 stdout=PIPE, stderr=PIPE)
1133 stdout=PIPE, stderr=PIPE)
1135 out, err = p.communicate()
1134 out, err = p.communicate()
1136 output = out + err
1135 output = out + err
1137 except:
1136 except:
1138 self.log.exception("Problem stopping cluster with command: %s" %
1137 self.log.exception("Problem stopping cluster with command: %s" %
1139 (self.delete_command + [self.job_id]))
1138 (self.delete_command + [self.job_id]))
1140 output = ""
1139 output = ""
1141 output = output.decode(DEFAULT_ENCODING, 'replace')
1140 output = output.decode(DEFAULT_ENCODING, 'replace')
1142 self.notify_stop(dict(job_id=self.job_id, output=output)) # Pass the output of the kill cmd
1141 self.notify_stop(dict(job_id=self.job_id, output=output)) # Pass the output of the kill cmd
1143 return output
1142 return output
1144
1143
1145
1144
1146 class PBSLauncher(BatchSystemLauncher):
1145 class PBSLauncher(BatchSystemLauncher):
1147 """A BatchSystemLauncher subclass for PBS."""
1146 """A BatchSystemLauncher subclass for PBS."""
1148
1147
1149 submit_command = List(['qsub'], config=True,
1148 submit_command = List(['qsub'], config=True,
1150 help="The PBS submit command ['qsub']")
1149 help="The PBS submit command ['qsub']")
1151 delete_command = List(['qdel'], config=True,
1150 delete_command = List(['qdel'], config=True,
1152 help="The PBS delete command ['qsub']")
1151 help="The PBS delete command ['qsub']")
1153 job_id_regexp = CRegExp(r'\d+', config=True,
1152 job_id_regexp = CRegExp(r'\d+', config=True,
1154 help="Regular expresion for identifying the job ID [r'\d+']")
1153 help="Regular expresion for identifying the job ID [r'\d+']")
1155
1154
1156 batch_file = Unicode(u'')
1155 batch_file = Unicode(u'')
1157 job_array_regexp = CRegExp('#PBS\W+-t\W+[\w\d\-\$]+')
1156 job_array_regexp = CRegExp('#PBS\W+-t\W+[\w\d\-\$]+')
1158 job_array_template = Unicode('#PBS -t 1-{n}')
1157 job_array_template = Unicode('#PBS -t 1-{n}')
1159 queue_regexp = CRegExp('#PBS\W+-q\W+\$?\w+')
1158 queue_regexp = CRegExp('#PBS\W+-q\W+\$?\w+')
1160 queue_template = Unicode('#PBS -q {queue}')
1159 queue_template = Unicode('#PBS -q {queue}')
1161
1160
1162
1161
1163 class PBSControllerLauncher(PBSLauncher, BatchClusterAppMixin):
1162 class PBSControllerLauncher(PBSLauncher, BatchClusterAppMixin):
1164 """Launch a controller using PBS."""
1163 """Launch a controller using PBS."""
1165
1164
1166 batch_file_name = Unicode(u'pbs_controller', config=True,
1165 batch_file_name = Unicode(u'pbs_controller', config=True,
1167 help="batch file name for the controller job.")
1166 help="batch file name for the controller job.")
1168 default_template= Unicode("""#!/bin/sh
1167 default_template= Unicode("""#!/bin/sh
1169 #PBS -V
1168 #PBS -V
1170 #PBS -N ipcontroller
1169 #PBS -N ipcontroller
1171 %s --log-to-file --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1170 %s --log-to-file --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1172 """%(' '.join(map(pipes.quote, ipcontroller_cmd_argv))))
1171 """%(' '.join(map(pipes.quote, ipcontroller_cmd_argv))))
1173
1172
1174 def start(self):
1173 def start(self):
1175 """Start the controller by profile or profile_dir."""
1174 """Start the controller by profile or profile_dir."""
1176 return super(PBSControllerLauncher, self).start(1)
1175 return super(PBSControllerLauncher, self).start(1)
1177
1176
1178
1177
1179 class PBSEngineSetLauncher(PBSLauncher, BatchClusterAppMixin):
1178 class PBSEngineSetLauncher(PBSLauncher, BatchClusterAppMixin):
1180 """Launch Engines using PBS"""
1179 """Launch Engines using PBS"""
1181 batch_file_name = Unicode(u'pbs_engines', config=True,
1180 batch_file_name = Unicode(u'pbs_engines', config=True,
1182 help="batch file name for the engine(s) job.")
1181 help="batch file name for the engine(s) job.")
1183 default_template= Unicode(u"""#!/bin/sh
1182 default_template= Unicode(u"""#!/bin/sh
1184 #PBS -V
1183 #PBS -V
1185 #PBS -N ipengine
1184 #PBS -N ipengine
1186 %s --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1185 %s --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1187 """%(' '.join(map(pipes.quote,ipengine_cmd_argv))))
1186 """%(' '.join(map(pipes.quote,ipengine_cmd_argv))))
1188
1187
1189
1188
1190 #SGE is very similar to PBS
1189 #SGE is very similar to PBS
1191
1190
1192 class SGELauncher(PBSLauncher):
1191 class SGELauncher(PBSLauncher):
1193 """Sun GridEngine is a PBS clone with slightly different syntax"""
1192 """Sun GridEngine is a PBS clone with slightly different syntax"""
1194 job_array_regexp = CRegExp('#\$\W+\-t')
1193 job_array_regexp = CRegExp('#\$\W+\-t')
1195 job_array_template = Unicode('#$ -t 1-{n}')
1194 job_array_template = Unicode('#$ -t 1-{n}')
1196 queue_regexp = CRegExp('#\$\W+-q\W+\$?\w+')
1195 queue_regexp = CRegExp('#\$\W+-q\W+\$?\w+')
1197 queue_template = Unicode('#$ -q {queue}')
1196 queue_template = Unicode('#$ -q {queue}')
1198
1197
1199
1198
1200 class SGEControllerLauncher(SGELauncher, BatchClusterAppMixin):
1199 class SGEControllerLauncher(SGELauncher, BatchClusterAppMixin):
1201 """Launch a controller using SGE."""
1200 """Launch a controller using SGE."""
1202
1201
1203 batch_file_name = Unicode(u'sge_controller', config=True,
1202 batch_file_name = Unicode(u'sge_controller', config=True,
1204 help="batch file name for the ipontroller job.")
1203 help="batch file name for the ipontroller job.")
1205 default_template= Unicode(u"""#$ -V
1204 default_template= Unicode(u"""#$ -V
1206 #$ -S /bin/sh
1205 #$ -S /bin/sh
1207 #$ -N ipcontroller
1206 #$ -N ipcontroller
1208 %s --log-to-file --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1207 %s --log-to-file --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1209 """%(' '.join(map(pipes.quote, ipcontroller_cmd_argv))))
1208 """%(' '.join(map(pipes.quote, ipcontroller_cmd_argv))))
1210
1209
1211 def start(self):
1210 def start(self):
1212 """Start the controller by profile or profile_dir."""
1211 """Start the controller by profile or profile_dir."""
1213 return super(SGEControllerLauncher, self).start(1)
1212 return super(SGEControllerLauncher, self).start(1)
1214
1213
1215
1214
1216 class SGEEngineSetLauncher(SGELauncher, BatchClusterAppMixin):
1215 class SGEEngineSetLauncher(SGELauncher, BatchClusterAppMixin):
1217 """Launch Engines with SGE"""
1216 """Launch Engines with SGE"""
1218 batch_file_name = Unicode(u'sge_engines', config=True,
1217 batch_file_name = Unicode(u'sge_engines', config=True,
1219 help="batch file name for the engine(s) job.")
1218 help="batch file name for the engine(s) job.")
1220 default_template = Unicode("""#$ -V
1219 default_template = Unicode("""#$ -V
1221 #$ -S /bin/sh
1220 #$ -S /bin/sh
1222 #$ -N ipengine
1221 #$ -N ipengine
1223 %s --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1222 %s --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1224 """%(' '.join(map(pipes.quote, ipengine_cmd_argv))))
1223 """%(' '.join(map(pipes.quote, ipengine_cmd_argv))))
1225
1224
1226
1225
1227 # LSF launchers
1226 # LSF launchers
1228
1227
1229 class LSFLauncher(BatchSystemLauncher):
1228 class LSFLauncher(BatchSystemLauncher):
1230 """A BatchSystemLauncher subclass for LSF."""
1229 """A BatchSystemLauncher subclass for LSF."""
1231
1230
1232 submit_command = List(['bsub'], config=True,
1231 submit_command = List(['bsub'], config=True,
1233 help="The PBS submit command ['bsub']")
1232 help="The PBS submit command ['bsub']")
1234 delete_command = List(['bkill'], config=True,
1233 delete_command = List(['bkill'], config=True,
1235 help="The PBS delete command ['bkill']")
1234 help="The PBS delete command ['bkill']")
1236 job_id_regexp = CRegExp(r'\d+', config=True,
1235 job_id_regexp = CRegExp(r'\d+', config=True,
1237 help="Regular expresion for identifying the job ID [r'\d+']")
1236 help="Regular expresion for identifying the job ID [r'\d+']")
1238
1237
1239 batch_file = Unicode(u'')
1238 batch_file = Unicode(u'')
1240 job_array_regexp = CRegExp('#BSUB[ \t]-J+\w+\[\d+-\d+\]')
1239 job_array_regexp = CRegExp('#BSUB[ \t]-J+\w+\[\d+-\d+\]')
1241 job_array_template = Unicode('#BSUB -J ipengine[1-{n}]')
1240 job_array_template = Unicode('#BSUB -J ipengine[1-{n}]')
1242 queue_regexp = CRegExp('#BSUB[ \t]+-q[ \t]+\w+')
1241 queue_regexp = CRegExp('#BSUB[ \t]+-q[ \t]+\w+')
1243 queue_template = Unicode('#BSUB -q {queue}')
1242 queue_template = Unicode('#BSUB -q {queue}')
1244
1243
1245 def start(self, n):
1244 def start(self, n):
1246 """Start n copies of the process using LSF batch system.
1245 """Start n copies of the process using LSF batch system.
1247 This can't inherit from the base class because bsub expects
1246 This can't inherit from the base class because bsub expects
1248 to be piped a shell script in order to honor the #BSUB directives:
1247 to be piped a shell script in order to honor the #BSUB directives:
1249 bsub < script
1248 bsub < script
1250 """
1249 """
1251 # Here we save profile_dir in the context so it
1250 # Here we save profile_dir in the context so it
1252 # can be used in the batch script template as {profile_dir}
1251 # can be used in the batch script template as {profile_dir}
1253 self.write_batch_script(n)
1252 self.write_batch_script(n)
1254 piped_cmd = self.args[0]+'<\"'+self.args[1]+'\"'
1253 piped_cmd = self.args[0]+'<\"'+self.args[1]+'\"'
1255 self.log.debug("Starting %s: %s", self.__class__.__name__, piped_cmd)
1254 self.log.debug("Starting %s: %s", self.__class__.__name__, piped_cmd)
1256 p = Popen(piped_cmd, shell=True,env=os.environ,stdout=PIPE)
1255 p = Popen(piped_cmd, shell=True,env=os.environ,stdout=PIPE)
1257 output,err = p.communicate()
1256 output,err = p.communicate()
1258 output = output.decode(DEFAULT_ENCODING, 'replace')
1257 output = output.decode(DEFAULT_ENCODING, 'replace')
1259 job_id = self.parse_job_id(output)
1258 job_id = self.parse_job_id(output)
1260 self.notify_start(job_id)
1259 self.notify_start(job_id)
1261 return job_id
1260 return job_id
1262
1261
1263
1262
1264 class LSFControllerLauncher(LSFLauncher, BatchClusterAppMixin):
1263 class LSFControllerLauncher(LSFLauncher, BatchClusterAppMixin):
1265 """Launch a controller using LSF."""
1264 """Launch a controller using LSF."""
1266
1265
1267 batch_file_name = Unicode(u'lsf_controller', config=True,
1266 batch_file_name = Unicode(u'lsf_controller', config=True,
1268 help="batch file name for the controller job.")
1267 help="batch file name for the controller job.")
1269 default_template= Unicode("""#!/bin/sh
1268 default_template= Unicode("""#!/bin/sh
1270 #BSUB -J ipcontroller
1269 #BSUB -J ipcontroller
1271 #BSUB -oo ipcontroller.o.%%J
1270 #BSUB -oo ipcontroller.o.%%J
1272 #BSUB -eo ipcontroller.e.%%J
1271 #BSUB -eo ipcontroller.e.%%J
1273 %s --log-to-file --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1272 %s --log-to-file --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1274 """%(' '.join(map(pipes.quote,ipcontroller_cmd_argv))))
1273 """%(' '.join(map(pipes.quote,ipcontroller_cmd_argv))))
1275
1274
1276 def start(self):
1275 def start(self):
1277 """Start the controller by profile or profile_dir."""
1276 """Start the controller by profile or profile_dir."""
1278 return super(LSFControllerLauncher, self).start(1)
1277 return super(LSFControllerLauncher, self).start(1)
1279
1278
1280
1279
1281 class LSFEngineSetLauncher(LSFLauncher, BatchClusterAppMixin):
1280 class LSFEngineSetLauncher(LSFLauncher, BatchClusterAppMixin):
1282 """Launch Engines using LSF"""
1281 """Launch Engines using LSF"""
1283 batch_file_name = Unicode(u'lsf_engines', config=True,
1282 batch_file_name = Unicode(u'lsf_engines', config=True,
1284 help="batch file name for the engine(s) job.")
1283 help="batch file name for the engine(s) job.")
1285 default_template= Unicode(u"""#!/bin/sh
1284 default_template= Unicode(u"""#!/bin/sh
1286 #BSUB -oo ipengine.o.%%J
1285 #BSUB -oo ipengine.o.%%J
1287 #BSUB -eo ipengine.e.%%J
1286 #BSUB -eo ipengine.e.%%J
1288 %s --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1287 %s --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
1289 """%(' '.join(map(pipes.quote, ipengine_cmd_argv))))
1288 """%(' '.join(map(pipes.quote, ipengine_cmd_argv))))
1290
1289
1291
1290
1292
1291
1293 class HTCondorLauncher(BatchSystemLauncher):
1292 class HTCondorLauncher(BatchSystemLauncher):
1294 """A BatchSystemLauncher subclass for HTCondor.
1293 """A BatchSystemLauncher subclass for HTCondor.
1295
1294
1296 HTCondor requires that we launch the ipengine/ipcontroller scripts rather
1295 HTCondor requires that we launch the ipengine/ipcontroller scripts rather
1297 than the python instance but otherwise is very similar to PBS. This is because
1296 than the python instance but otherwise is very similar to PBS. This is because
1298 HTCondor destroys sys.executable when launching remote processes - a launched
1297 HTCondor destroys sys.executable when launching remote processes - a launched
1299 python process depends on sys.executable to effectively evaluate its
1298 python process depends on sys.executable to effectively evaluate its
1300 module search paths. Without it, regardless of which python interpreter you launch
1299 module search paths. Without it, regardless of which python interpreter you launch
1301 you will get only the built-in module search paths.
1300 you will get only the built-in module search paths.
1302
1301
1303 We use the ip{cluster, engine, controller} scripts as our executable to circumvent
1302 We use the ip{cluster, engine, controller} scripts as our executable to circumvent
1304 this - the mechanism of shebanged scripts means that the python binary will be
1303 this - the mechanism of shebanged scripts means that the python binary will be
1305 launched with argv[0] set to the *location of the ip{cluster, engine, controller}
1304 launched with argv[0] set to the *location of the ip{cluster, engine, controller}
1306 scripts on the remote node*. This means you need to take care that:
1305 scripts on the remote node*. This means you need to take care that:
1307
1306
1308 a. Your remote nodes have their paths configured correctly, with the ipengine and ipcontroller
1307 a. Your remote nodes have their paths configured correctly, with the ipengine and ipcontroller
1309 scripts of the python environment you wish to execute code in taking top precedence.
1308 scripts of the python environment you wish to execute code in taking top precedence.
1310 b. This functionality is untested on Windows.
1309 b. This functionality is untested on Windows.
1311
1310
1312 If you need different behavior, consider making your own template.
1311 If you need different behavior, consider making your own template.
1313 """
1312 """
1314
1313
1315 submit_command = List(['condor_submit'], config=True,
1314 submit_command = List(['condor_submit'], config=True,
1316 help="The HTCondor submit command ['condor_submit']")
1315 help="The HTCondor submit command ['condor_submit']")
1317 delete_command = List(['condor_rm'], config=True,
1316 delete_command = List(['condor_rm'], config=True,
1318 help="The HTCondor delete command ['condor_rm']")
1317 help="The HTCondor delete command ['condor_rm']")
1319 job_id_regexp = CRegExp(r'(\d+)\.$', config=True,
1318 job_id_regexp = CRegExp(r'(\d+)\.$', config=True,
1320 help="Regular expression for identifying the job ID [r'(\d+)\.$']")
1319 help="Regular expression for identifying the job ID [r'(\d+)\.$']")
1321 job_id_regexp_group = Integer(1, config=True,
1320 job_id_regexp_group = Integer(1, config=True,
1322 help="""The group we wish to match in job_id_regexp [1]""")
1321 help="""The group we wish to match in job_id_regexp [1]""")
1323
1322
1324 job_array_regexp = CRegExp('queue\W+\$')
1323 job_array_regexp = CRegExp('queue\W+\$')
1325 job_array_template = Unicode('queue {n}')
1324 job_array_template = Unicode('queue {n}')
1326
1325
1327
1326
1328 def _insert_job_array_in_script(self):
1327 def _insert_job_array_in_script(self):
1329 """Inserts a job array if required into the batch script.
1328 """Inserts a job array if required into the batch script.
1330 """
1329 """
1331 if not self.job_array_regexp.search(self.batch_template):
1330 if not self.job_array_regexp.search(self.batch_template):
1332 self.log.debug("adding job array settings to batch script")
1331 self.log.debug("adding job array settings to batch script")
1333 #HTCondor requires that the job array goes at the bottom of the script
1332 #HTCondor requires that the job array goes at the bottom of the script
1334 self.batch_template = '\n'.join([self.batch_template,
1333 self.batch_template = '\n'.join([self.batch_template,
1335 self.job_array_template])
1334 self.job_array_template])
1336
1335
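For illustration, a rendered HTCondor submit description after this insertion, based on the engine default_template below with hypothetical values; note that 'queue {n}' lands at the bottom rather than being spliced in after the first line as in the PBS/SGE launchers:

universe = vanilla
executable = ipengine
# by default we expect a shared file system
transfer_executable = False
arguments = "--log-to-file '--profile-dir=/home/me/.ipython/profile_default' '--cluster-id='"
queue 8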
1337 def _insert_queue_in_script(self):
1336 def _insert_queue_in_script(self):
1338 """AFAIK, HTCondor doesn't have a concept of multiple queues that can be
1337 """AFAIK, HTCondor doesn't have a concept of multiple queues that can be
1339 specified in the script.
1338 specified in the script.
1340 """
1339 """
1341 pass
1340 pass
1342
1341
1343
1342
1344 class HTCondorControllerLauncher(HTCondorLauncher, BatchClusterAppMixin):
1343 class HTCondorControllerLauncher(HTCondorLauncher, BatchClusterAppMixin):
1345 """Launch a controller using HTCondor."""
1344 """Launch a controller using HTCondor."""
1346
1345
1347 batch_file_name = Unicode(u'htcondor_controller', config=True,
1346 batch_file_name = Unicode(u'htcondor_controller', config=True,
1348 help="batch file name for the controller job.")
1347 help="batch file name for the controller job.")
1349 default_template = Unicode(r"""
1348 default_template = Unicode(r"""
1350 universe = vanilla
1349 universe = vanilla
1351 executable = ipcontroller
1350 executable = ipcontroller
1352 # by default we expect a shared file system
1351 # by default we expect a shared file system
1353 transfer_executable = False
1352 transfer_executable = False
1354 arguments = --log-to-file '--profile-dir={profile_dir}' --cluster-id='{cluster_id}'
1353 arguments = --log-to-file '--profile-dir={profile_dir}' --cluster-id='{cluster_id}'
1355 """)
1354 """)
1356
1355
1357 def start(self):
1356 def start(self):
1358 """Start the controller by profile or profile_dir."""
1357 """Start the controller by profile or profile_dir."""
1359 return super(HTCondorControllerLauncher, self).start(1)
1358 return super(HTCondorControllerLauncher, self).start(1)
1360
1359
1361
1360
1362 class HTCondorEngineSetLauncher(HTCondorLauncher, BatchClusterAppMixin):
1361 class HTCondorEngineSetLauncher(HTCondorLauncher, BatchClusterAppMixin):
1363 """Launch Engines using HTCondor"""
1362 """Launch Engines using HTCondor"""
1364 batch_file_name = Unicode(u'htcondor_engines', config=True,
1363 batch_file_name = Unicode(u'htcondor_engines', config=True,
1365 help="batch file name for the engine(s) job.")
1364 help="batch file name for the engine(s) job.")
1366 default_template = Unicode("""
1365 default_template = Unicode("""
1367 universe = vanilla
1366 universe = vanilla
1368 executable = ipengine
1367 executable = ipengine
1369 # by default we expect a shared file system
1368 # by default we expect a shared file system
1370 transfer_executable = False
1369 transfer_executable = False
1371 arguments = "--log-to-file '--profile-dir={profile_dir}' '--cluster-id={cluster_id}'"
1370 arguments = "--log-to-file '--profile-dir={profile_dir}' '--cluster-id={cluster_id}'"
1372 """)
1371 """)
1373
1372
1374
1373
1375 #-----------------------------------------------------------------------------
1374 #-----------------------------------------------------------------------------
1376 # A launcher for ipcluster itself!
1375 # A launcher for ipcluster itself!
1377 #-----------------------------------------------------------------------------
1376 #-----------------------------------------------------------------------------
1378
1377
1379
1378
1380 class IPClusterLauncher(LocalProcessLauncher):
1379 class IPClusterLauncher(LocalProcessLauncher):
1381 """Launch the ipcluster program in an external process."""
1380 """Launch the ipcluster program in an external process."""
1382
1381
1383 ipcluster_cmd = List(ipcluster_cmd_argv, config=True,
1382 ipcluster_cmd = List(ipcluster_cmd_argv, config=True,
1384 help="Popen command for ipcluster")
1383 help="Popen command for ipcluster")
1385 ipcluster_args = List(
1384 ipcluster_args = List(
1386 ['--clean-logs=True', '--log-to-file', '--log-level=%i'%logging.INFO], config=True,
1385 ['--clean-logs=True', '--log-to-file', '--log-level=%i'%logging.INFO], config=True,
1387 help="Command line arguments to pass to ipcluster.")
1386 help="Command line arguments to pass to ipcluster.")
1388 ipcluster_subcommand = Unicode('start')
1387 ipcluster_subcommand = Unicode('start')
1389 profile = Unicode('default')
1388 profile = Unicode('default')
1390 n = Integer(2)
1389 n = Integer(2)
1391
1390
1392 def find_args(self):
1391 def find_args(self):
1393 return self.ipcluster_cmd + [self.ipcluster_subcommand] + \
1392 return self.ipcluster_cmd + [self.ipcluster_subcommand] + \
1394 ['--n=%i'%self.n, '--profile=%s'%self.profile] + \
1393 ['--n=%i'%self.n, '--profile=%s'%self.profile] + \
1395 self.ipcluster_args
1394 self.ipcluster_args
1396
1395
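With the defaults above (n=2, profile 'default', logging.INFO == 20), find_args yields an argv along the lines of

    ipcluster start --n=2 --profile=default --clean-logs=True --log-to-file --log-level=20

where the leading 'ipcluster' stands in for whatever ipcluster_cmd_argv resolves to on the host.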
1397 def start(self):
1396 def start(self):
1398 return super(IPClusterLauncher, self).start()
1397 return super(IPClusterLauncher, self).start()
1399
1398
1400 #-----------------------------------------------------------------------------
1399 #-----------------------------------------------------------------------------
1401 # Collections of launchers
1400 # Collections of launchers
1402 #-----------------------------------------------------------------------------
1401 #-----------------------------------------------------------------------------
1403
1402
1404 local_launchers = [
1403 local_launchers = [
1405 LocalControllerLauncher,
1404 LocalControllerLauncher,
1406 LocalEngineLauncher,
1405 LocalEngineLauncher,
1407 LocalEngineSetLauncher,
1406 LocalEngineSetLauncher,
1408 ]
1407 ]
1409 mpi_launchers = [
1408 mpi_launchers = [
1410 MPILauncher,
1409 MPILauncher,
1411 MPIControllerLauncher,
1410 MPIControllerLauncher,
1412 MPIEngineSetLauncher,
1411 MPIEngineSetLauncher,
1413 ]
1412 ]
1414 ssh_launchers = [
1413 ssh_launchers = [
1415 SSHLauncher,
1414 SSHLauncher,
1416 SSHControllerLauncher,
1415 SSHControllerLauncher,
1417 SSHEngineLauncher,
1416 SSHEngineLauncher,
1418 SSHEngineSetLauncher,
1417 SSHEngineSetLauncher,
1419 SSHProxyEngineSetLauncher,
1418 SSHProxyEngineSetLauncher,
1420 ]
1419 ]
1421 winhpc_launchers = [
1420 winhpc_launchers = [
1422 WindowsHPCLauncher,
1421 WindowsHPCLauncher,
1423 WindowsHPCControllerLauncher,
1422 WindowsHPCControllerLauncher,
1424 WindowsHPCEngineSetLauncher,
1423 WindowsHPCEngineSetLauncher,
1425 ]
1424 ]
1426 pbs_launchers = [
1425 pbs_launchers = [
1427 PBSLauncher,
1426 PBSLauncher,
1428 PBSControllerLauncher,
1427 PBSControllerLauncher,
1429 PBSEngineSetLauncher,
1428 PBSEngineSetLauncher,
1430 ]
1429 ]
1431 sge_launchers = [
1430 sge_launchers = [
1432 SGELauncher,
1431 SGELauncher,
1433 SGEControllerLauncher,
1432 SGEControllerLauncher,
1434 SGEEngineSetLauncher,
1433 SGEEngineSetLauncher,
1435 ]
1434 ]
1436 lsf_launchers = [
1435 lsf_launchers = [
1437 LSFLauncher,
1436 LSFLauncher,
1438 LSFControllerLauncher,
1437 LSFControllerLauncher,
1439 LSFEngineSetLauncher,
1438 LSFEngineSetLauncher,
1440 ]
1439 ]
1441 htcondor_launchers = [
1440 htcondor_launchers = [
1442 HTCondorLauncher,
1441 HTCondorLauncher,
1443 HTCondorControllerLauncher,
1442 HTCondorControllerLauncher,
1444 HTCondorEngineSetLauncher,
1443 HTCondorEngineSetLauncher,
1445 ]
1444 ]
1446 all_launchers = local_launchers + mpi_launchers + ssh_launchers + winhpc_launchers\
1445 all_launchers = local_launchers + mpi_launchers + ssh_launchers + winhpc_launchers\
1447 + pbs_launchers + sge_launchers + lsf_launchers + htcondor_launchers
1446 + pbs_launchers + sge_launchers + lsf_launchers + htcondor_launchers
@@ -1,1449 +1,1438 b''
1 """The IPython Controller Hub with 0MQ
1 """The IPython Controller Hub with 0MQ
2
2
3 This is the master object that handles connections from engines and clients,
3 This is the master object that handles connections from engines and clients,
4 and monitors traffic through the various queues.
4 and monitors traffic through the various queues.
5 """
5 """
6
6
7 # Copyright (c) IPython Development Team.
7 # Copyright (c) IPython Development Team.
8 # Distributed under the terms of the Modified BSD License.
8 # Distributed under the terms of the Modified BSD License.
9
9
10 from __future__ import print_function
10 from __future__ import print_function
11
11
12 import json
12 import json
13 import os
13 import os
14 import sys
14 import sys
15 import time
15 import time
16 from datetime import datetime
16 from datetime import datetime
17
17
18 import zmq
18 import zmq
19 from zmq.eventloop import ioloop
20 from zmq.eventloop.zmqstream import ZMQStream
19 from zmq.eventloop.zmqstream import ZMQStream
21
20
22 # internal:
21 # internal:
23 from IPython.utils.importstring import import_item
22 from IPython.utils.importstring import import_item
24 from IPython.utils.jsonutil import extract_dates
23 from IPython.utils.jsonutil import extract_dates
25 from IPython.utils.localinterfaces import localhost
24 from IPython.utils.localinterfaces import localhost
26 from IPython.utils.py3compat import cast_bytes, unicode_type, iteritems
25 from IPython.utils.py3compat import cast_bytes, unicode_type, iteritems
27 from IPython.utils.traitlets import (
26 from IPython.utils.traitlets import (
28 HasTraits, Instance, Integer, Unicode, Dict, Set, Tuple, CBytes, DottedObjectName
27 HasTraits, Any, Instance, Integer, Unicode, Dict, Set, Tuple, DottedObjectName
29 )
28 )
30
29
31 from IPython.parallel import error, util
30 from IPython.parallel import error, util
32 from IPython.parallel.factory import RegistrationFactory
31 from IPython.parallel.factory import RegistrationFactory
33
32
34 from IPython.kernel.zmq.session import SessionFactory
33 from IPython.kernel.zmq.session import SessionFactory
35
34
36 from .heartmonitor import HeartMonitor
35 from .heartmonitor import HeartMonitor
37
36
38 #-----------------------------------------------------------------------------
39 # Code
40 #-----------------------------------------------------------------------------
41
37
42 def _passer(*args, **kwargs):
38 def _passer(*args, **kwargs):
43 return
39 return
44
40
45 def _printer(*args, **kwargs):
41 def _printer(*args, **kwargs):
46 print (args)
42 print (args)
47 print (kwargs)
43 print (kwargs)
48
44
49 def empty_record():
45 def empty_record():
50 """Return an empty dict with all record keys."""
46 """Return an empty dict with all record keys."""
51 return {
47 return {
52 'msg_id' : None,
48 'msg_id' : None,
53 'header' : None,
49 'header' : None,
54 'metadata' : None,
50 'metadata' : None,
55 'content': None,
51 'content': None,
56 'buffers': None,
52 'buffers': None,
57 'submitted': None,
53 'submitted': None,
58 'client_uuid' : None,
54 'client_uuid' : None,
59 'engine_uuid' : None,
55 'engine_uuid' : None,
60 'started': None,
56 'started': None,
61 'completed': None,
57 'completed': None,
62 'resubmitted': None,
58 'resubmitted': None,
63 'received': None,
59 'received': None,
64 'result_header' : None,
60 'result_header' : None,
65 'result_metadata' : None,
61 'result_metadata' : None,
66 'result_content' : None,
62 'result_content' : None,
67 'result_buffers' : None,
63 'result_buffers' : None,
68 'queue' : None,
64 'queue' : None,
69 'execute_input' : None,
65 'execute_input' : None,
70 'execute_result': None,
66 'execute_result': None,
71 'error': None,
67 'error': None,
72 'stdout': '',
68 'stdout': '',
73 'stderr': '',
69 'stderr': '',
74 }
70 }
75
71
76 def init_record(msg):
72 def init_record(msg):
77 """Initialize a TaskRecord based on a request."""
73 """Initialize a TaskRecord based on a request."""
78 header = msg['header']
74 header = msg['header']
79 return {
75 return {
80 'msg_id' : header['msg_id'],
76 'msg_id' : header['msg_id'],
81 'header' : header,
77 'header' : header,
82 'content': msg['content'],
78 'content': msg['content'],
83 'metadata': msg['metadata'],
79 'metadata': msg['metadata'],
84 'buffers': msg['buffers'],
80 'buffers': msg['buffers'],
85 'submitted': header['date'],
81 'submitted': header['date'],
86 'client_uuid' : None,
82 'client_uuid' : None,
87 'engine_uuid' : None,
83 'engine_uuid' : None,
88 'started': None,
84 'started': None,
89 'completed': None,
85 'completed': None,
90 'resubmitted': None,
86 'resubmitted': None,
91 'received': None,
87 'received': None,
92 'result_header' : None,
88 'result_header' : None,
93 'result_metadata': None,
89 'result_metadata': None,
94 'result_content' : None,
90 'result_content' : None,
95 'result_buffers' : None,
91 'result_buffers' : None,
96 'queue' : None,
92 'queue' : None,
97 'execute_input' : None,
93 'execute_input' : None,
98 'execute_result': None,
94 'execute_result': None,
99 'error': None,
95 'error': None,
100 'stdout': '',
96 'stdout': '',
101 'stderr': '',
97 'stderr': '',
102 }
98 }
103
99
104
100
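# Editor's sketch (illustrative, not part of this changeset): empty_record()
# and init_record() produce dicts with the same keys; init_record fills the
# request-side fields from a message, e.g.:
#
#     rec = init_record(msg)
#     rec['msg_id'] == msg['header']['msg_id']    # request identity
#     rec['submitted'] == msg['header']['date']   # set at submission time
#     rec['completed'] is None                    # filled in later by results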
105 class EngineConnector(HasTraits):
101 class EngineConnector(HasTraits):
106 """A simple object for accessing the various zmq connections of an object.
102 """A simple object for accessing the various zmq connections of an object.
107 Attributes are:
103 Attributes are:
108 id (int): engine ID
104 id (int): engine ID
109 uuid (unicode): engine UUID
105 uuid (unicode): engine UUID
110 pending: set of msg_ids
106 pending: set of msg_ids
111 stallback: DelayedCallback for stalled registration
107 stallback: tornado timeout for stalled registration
112 """
108 """
113
109
114 id = Integer(0)
110 id = Integer(0)
115 uuid = Unicode()
111 uuid = Unicode()
116 pending = Set()
112 pending = Set()
117 stallback = Instance(ioloop.DelayedCallback)
113 stallback = Any()
118
114
119
115
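# Editor's sketch (illustrative, not part of this changeset): the migration
# pattern applied throughout this diff. DelayedCallback took a callback and
# a delay in milliseconds; IOLoop.add_timeout takes an absolute deadline in
# seconds and returns a handle that loop.remove_timeout() can cancel.
from tornado.ioloop import IOLoop

def _demo_stall_timeout():
    loop = IOLoop.instance()

    def purge():
        print("stalled registration purged")
        loop.stop()

    # old (deprecated): dc = DelayedCallback(purge, 5000, loop); dc.start()
    # new:
    handle = loop.add_timeout(loop.time() + 5, purge)  # fires in ~5 s
    # a successful registration would cancel it: loop.remove_timeout(handle)
    loop.start()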
120 _db_shortcuts = {
116 _db_shortcuts = {
121 'sqlitedb' : 'IPython.parallel.controller.sqlitedb.SQLiteDB',
117 'sqlitedb' : 'IPython.parallel.controller.sqlitedb.SQLiteDB',
122 'mongodb' : 'IPython.parallel.controller.mongodb.MongoDB',
118 'mongodb' : 'IPython.parallel.controller.mongodb.MongoDB',
123 'dictdb' : 'IPython.parallel.controller.dictdb.DictDB',
119 'dictdb' : 'IPython.parallel.controller.dictdb.DictDB',
124 'nodb' : 'IPython.parallel.controller.dictdb.NoDB',
120 'nodb' : 'IPython.parallel.controller.dictdb.NoDB',
125 }
121 }
126
122
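# Editor's sketch (illustrative): how init_hub() resolves the db_class trait
# below; import_item is the helper already imported at the top of this file,
# and _db_shortcuts is the dict defined just above.
from IPython.utils.importstring import import_item

def resolve_db_class(name):
    """Map a shortcut like 'dictdb' to its class, or import a dotted name."""
    dotted = _db_shortcuts.get(name.lower(), name)
    return import_item(str(dotted))

# e.g. resolve_db_class('nodb') -> IPython.parallel.controller.dictdb.NoDB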
127 class HubFactory(RegistrationFactory):
123 class HubFactory(RegistrationFactory):
128 """The Configurable for setting up a Hub."""
124 """The Configurable for setting up a Hub."""
129
125
130 # port-pairs for monitoredqueues:
126 # port-pairs for monitoredqueues:
131 hb = Tuple(Integer,Integer,config=True,
127 hb = Tuple(Integer,Integer,config=True,
132 help="""PUB/ROUTER Port pair for Engine heartbeats""")
128 help="""PUB/ROUTER Port pair for Engine heartbeats""")
133 def _hb_default(self):
129 def _hb_default(self):
134 return tuple(util.select_random_ports(2))
130 return tuple(util.select_random_ports(2))
135
131
136 mux = Tuple(Integer,Integer,config=True,
132 mux = Tuple(Integer,Integer,config=True,
137 help="""Client/Engine Port pair for MUX queue""")
133 help="""Client/Engine Port pair for MUX queue""")
138
134
139 def _mux_default(self):
135 def _mux_default(self):
140 return tuple(util.select_random_ports(2))
136 return tuple(util.select_random_ports(2))
141
137
142 task = Tuple(Integer,Integer,config=True,
138 task = Tuple(Integer,Integer,config=True,
143 help="""Client/Engine Port pair for Task queue""")
139 help="""Client/Engine Port pair for Task queue""")
144 def _task_default(self):
140 def _task_default(self):
145 return tuple(util.select_random_ports(2))
141 return tuple(util.select_random_ports(2))
146
142
147 control = Tuple(Integer,Integer,config=True,
143 control = Tuple(Integer,Integer,config=True,
148 help="""Client/Engine Port pair for Control queue""")
144 help="""Client/Engine Port pair for Control queue""")
149
145
150 def _control_default(self):
146 def _control_default(self):
151 return tuple(util.select_random_ports(2))
147 return tuple(util.select_random_ports(2))
152
148
153 iopub = Tuple(Integer,Integer,config=True,
149 iopub = Tuple(Integer,Integer,config=True,
154 help="""Client/Engine Port pair for IOPub relay""")
150 help="""Client/Engine Port pair for IOPub relay""")
155
151
156 def _iopub_default(self):
152 def _iopub_default(self):
157 return tuple(util.select_random_ports(2))
153 return tuple(util.select_random_ports(2))
158
154
159 # single ports:
155 # single ports:
160 mon_port = Integer(config=True,
156 mon_port = Integer(config=True,
161 help="""Monitor (SUB) port for queue traffic""")
157 help="""Monitor (SUB) port for queue traffic""")
162
158
163 def _mon_port_default(self):
159 def _mon_port_default(self):
164 return util.select_random_ports(1)[0]
160 return util.select_random_ports(1)[0]
165
161
166 notifier_port = Integer(config=True,
162 notifier_port = Integer(config=True,
167 help="""PUB port for sending engine status notifications""")
163 help="""PUB port for sending engine status notifications""")
168
164
169 def _notifier_port_default(self):
165 def _notifier_port_default(self):
170 return util.select_random_ports(1)[0]
166 return util.select_random_ports(1)[0]
171
167
172 engine_ip = Unicode(config=True,
168 engine_ip = Unicode(config=True,
173 help="IP on which to listen for engine connections. [default: loopback]")
169 help="IP on which to listen for engine connections. [default: loopback]")
174 def _engine_ip_default(self):
170 def _engine_ip_default(self):
175 return localhost()
171 return localhost()
176 engine_transport = Unicode('tcp', config=True,
172 engine_transport = Unicode('tcp', config=True,
177 help="0MQ transport for engine connections. [default: tcp]")
173 help="0MQ transport for engine connections. [default: tcp]")
178
174
179 client_ip = Unicode(config=True,
175 client_ip = Unicode(config=True,
180 help="IP on which to listen for client connections. [default: loopback]")
176 help="IP on which to listen for client connections. [default: loopback]")
181 client_transport = Unicode('tcp', config=True,
177 client_transport = Unicode('tcp', config=True,
182 help="0MQ transport for client connections. [default : tcp]")
178 help="0MQ transport for client connections. [default : tcp]")
183
179
184 monitor_ip = Unicode(config=True,
180 monitor_ip = Unicode(config=True,
185 help="IP on which to listen for monitor messages. [default: loopback]")
181 help="IP on which to listen for monitor messages. [default: loopback]")
186 monitor_transport = Unicode('tcp', config=True,
182 monitor_transport = Unicode('tcp', config=True,
187 help="0MQ transport for monitor messages. [default : tcp]")
183 help="0MQ transport for monitor messages. [default : tcp]")
188
184
189 _client_ip_default = _monitor_ip_default = _engine_ip_default
185 _client_ip_default = _monitor_ip_default = _engine_ip_default
190
186
191
187
192 monitor_url = Unicode('')
188 monitor_url = Unicode('')
193
189
194 db_class = DottedObjectName('NoDB',
190 db_class = DottedObjectName('NoDB',
195 config=True, help="""The class to use for the DB backend
191 config=True, help="""The class to use for the DB backend
196
192
197 Options include:
193 Options include:
198
194
199 SQLiteDB: SQLite
195 SQLiteDB: SQLite
200 MongoDB : use MongoDB
196 MongoDB : use MongoDB
201 DictDB : in-memory storage (fastest, but be mindful of memory growth of the Hub)
197 DictDB : in-memory storage (fastest, but be mindful of memory growth of the Hub)
202 NoDB : disable database altogether (default)
198 NoDB : disable database altogether (default)
203
199
204 """)
200 """)
205
201
206 registration_timeout = Integer(0, config=True,
202 registration_timeout = Integer(0, config=True,
207 help="Engine registration timeout in seconds [default: max(30,"
203 help="Engine registration timeout in seconds [default: max(30,"
208 "10*heartmonitor.period)]" )
204 "10*heartmonitor.period)]" )
209
205
210 def _registration_timeout_default(self):
206 def _registration_timeout_default(self):
211 if self.heartmonitor is None:
207 if self.heartmonitor is None:
212 # early initialization, this value will be ignored
208 # early initialization, this value will be ignored
213 return 0
209 return 0
214 # heartmonitor period is in milliseconds, so 10x in seconds is .01
210 # heartmonitor period is in milliseconds, so 10x in seconds is .01
215 return max(30, int(.01 * self.heartmonitor.period))
211 return max(30, int(.01 * self.heartmonitor.period))
216
212
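# Editor's note (illustrative): assuming the default heartmonitor period of
# 3000 ms, 10 heartbeats take 10 * 3000 ms = 30 s, and
# max(30, int(.01 * 3000)) == max(30, 30) == 30, so the default
# registration timeout works out to 30 seconds.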
217 # not configurable
213 # not configurable
218 db = Instance('IPython.parallel.controller.dictdb.BaseDB')
214 db = Instance('IPython.parallel.controller.dictdb.BaseDB')
219 heartmonitor = Instance('IPython.parallel.controller.heartmonitor.HeartMonitor')
215 heartmonitor = Instance('IPython.parallel.controller.heartmonitor.HeartMonitor')
220
216
221 def _ip_changed(self, name, old, new):
217 def _ip_changed(self, name, old, new):
222 self.engine_ip = new
218 self.engine_ip = new
223 self.client_ip = new
219 self.client_ip = new
224 self.monitor_ip = new
220 self.monitor_ip = new
225 self._update_monitor_url()
221 self._update_monitor_url()
226
222
227 def _update_monitor_url(self):
223 def _update_monitor_url(self):
228 self.monitor_url = "%s://%s:%i" % (self.monitor_transport, self.monitor_ip, self.mon_port)
224 self.monitor_url = "%s://%s:%i" % (self.monitor_transport, self.monitor_ip, self.mon_port)
229
225
230 def _transport_changed(self, name, old, new):
226 def _transport_changed(self, name, old, new):
231 self.engine_transport = new
227 self.engine_transport = new
232 self.client_transport = new
228 self.client_transport = new
233 self.monitor_transport = new
229 self.monitor_transport = new
234 self._update_monitor_url()
230 self._update_monitor_url()
235
231
236 def __init__(self, **kwargs):
232 def __init__(self, **kwargs):
237 super(HubFactory, self).__init__(**kwargs)
233 super(HubFactory, self).__init__(**kwargs)
238 self._update_monitor_url()
234 self._update_monitor_url()
239
235
240
236
241 def construct(self):
237 def construct(self):
242 self.init_hub()
238 self.init_hub()
243
239
244 def start(self):
240 def start(self):
245 self.heartmonitor.start()
241 self.heartmonitor.start()
246 self.log.info("Heartmonitor started")
242 self.log.info("Heartmonitor started")
247
243
248 def client_url(self, channel):
244 def client_url(self, channel):
249 """return full zmq url for a named client channel"""
245 """return full zmq url for a named client channel"""
250 return "%s://%s:%i" % (self.client_transport, self.client_ip, self.client_info[channel])
246 return "%s://%s:%i" % (self.client_transport, self.client_ip, self.client_info[channel])
251
247
252 def engine_url(self, channel):
248 def engine_url(self, channel):
253 """return full zmq url for a named engine channel"""
249 """return full zmq url for a named engine channel"""
254 return "%s://%s:%i" % (self.engine_transport, self.engine_ip, self.engine_info[channel])
250 return "%s://%s:%i" % (self.engine_transport, self.engine_ip, self.engine_info[channel])
255
251
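# Editor's sketch (illustrative): what client_url()/engine_url() produce.
# With client_transport='tcp', client_ip='127.0.0.1', and a client_info
# entry {'task': 55001} (port number assumed for the example):
#
#     "%s://%s:%i" % ('tcp', '127.0.0.1', 55001)  ->  'tcp://127.0.0.1:55001'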
256 def init_hub(self):
252 def init_hub(self):
257 """construct Hub object"""
253 """construct Hub object"""
258
254
259 ctx = self.context
255 ctx = self.context
260 loop = self.loop
256 loop = self.loop
261 if 'TaskScheduler.scheme_name' in self.config:
257 if 'TaskScheduler.scheme_name' in self.config:
262 scheme = self.config.TaskScheduler.scheme_name
258 scheme = self.config.TaskScheduler.scheme_name
263 else:
259 else:
264 from .scheduler import TaskScheduler
260 from .scheduler import TaskScheduler
265 scheme = TaskScheduler.scheme_name.get_default_value()
261 scheme = TaskScheduler.scheme_name.get_default_value()
266
262
267 # build connection dicts
263 # build connection dicts
268 engine = self.engine_info = {
264 engine = self.engine_info = {
269 'interface' : "%s://%s" % (self.engine_transport, self.engine_ip),
265 'interface' : "%s://%s" % (self.engine_transport, self.engine_ip),
270 'registration' : self.regport,
266 'registration' : self.regport,
271 'control' : self.control[1],
267 'control' : self.control[1],
272 'mux' : self.mux[1],
268 'mux' : self.mux[1],
273 'hb_ping' : self.hb[0],
269 'hb_ping' : self.hb[0],
274 'hb_pong' : self.hb[1],
270 'hb_pong' : self.hb[1],
275 'task' : self.task[1],
271 'task' : self.task[1],
276 'iopub' : self.iopub[1],
272 'iopub' : self.iopub[1],
277 }
273 }
278
274
279 client = self.client_info = {
275 client = self.client_info = {
280 'interface' : "%s://%s" % (self.client_transport, self.client_ip),
276 'interface' : "%s://%s" % (self.client_transport, self.client_ip),
281 'registration' : self.regport,
277 'registration' : self.regport,
282 'control' : self.control[0],
278 'control' : self.control[0],
283 'mux' : self.mux[0],
279 'mux' : self.mux[0],
284 'task' : self.task[0],
280 'task' : self.task[0],
285 'task_scheme' : scheme,
281 'task_scheme' : scheme,
286 'iopub' : self.iopub[0],
282 'iopub' : self.iopub[0],
287 'notification' : self.notifier_port,
283 'notification' : self.notifier_port,
288 }
284 }
289
285
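# Editor's note (illustrative): each Tuple trait above is a (client, engine)
# port pair -- the client-facing dict takes index [0] of mux/task/control/
# iopub, while the engine-facing dict takes index [1]; 'registration' is the
# same port on both sides.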
290 self.log.debug("Hub engine addrs: %s", self.engine_info)
286 self.log.debug("Hub engine addrs: %s", self.engine_info)
291 self.log.debug("Hub client addrs: %s", self.client_info)
287 self.log.debug("Hub client addrs: %s", self.client_info)
292
288
293 # Registrar socket
289 # Registrar socket
294 q = ZMQStream(ctx.socket(zmq.ROUTER), loop)
290 q = ZMQStream(ctx.socket(zmq.ROUTER), loop)
295 util.set_hwm(q, 0)
291 util.set_hwm(q, 0)
296 q.bind(self.client_url('registration'))
292 q.bind(self.client_url('registration'))
297 self.log.info("Hub listening on %s for registration.", self.client_url('registration'))
293 self.log.info("Hub listening on %s for registration.", self.client_url('registration'))
298 if self.client_ip != self.engine_ip:
294 if self.client_ip != self.engine_ip:
299 q.bind(self.engine_url('registration'))
295 q.bind(self.engine_url('registration'))
300 self.log.info("Hub listening on %s for registration.", self.engine_url('registration'))
296 self.log.info("Hub listening on %s for registration.", self.engine_url('registration'))
301
297
302 ### Engine connections ###
298 ### Engine connections ###
303
299
304 # heartbeat
300 # heartbeat
305 hpub = ctx.socket(zmq.PUB)
301 hpub = ctx.socket(zmq.PUB)
306 hpub.bind(self.engine_url('hb_ping'))
302 hpub.bind(self.engine_url('hb_ping'))
307 hrep = ctx.socket(zmq.ROUTER)
303 hrep = ctx.socket(zmq.ROUTER)
308 util.set_hwm(hrep, 0)
304 util.set_hwm(hrep, 0)
309 hrep.bind(self.engine_url('hb_pong'))
305 hrep.bind(self.engine_url('hb_pong'))
310 self.heartmonitor = HeartMonitor(loop=loop, parent=self, log=self.log,
306 self.heartmonitor = HeartMonitor(loop=loop, parent=self, log=self.log,
311 pingstream=ZMQStream(hpub,loop),
307 pingstream=ZMQStream(hpub,loop),
312 pongstream=ZMQStream(hrep,loop)
308 pongstream=ZMQStream(hrep,loop)
313 )
309 )
314
310
315 ### Client connections ###
311 ### Client connections ###
316
312
317 # Notifier socket
313 # Notifier socket
318 n = ZMQStream(ctx.socket(zmq.PUB), loop)
314 n = ZMQStream(ctx.socket(zmq.PUB), loop)
319 n.bind(self.client_url('notification'))
315 n.bind(self.client_url('notification'))
320
316
321 ### build and launch the queues ###
317 ### build and launch the queues ###
322
318
323 # monitor socket
319 # monitor socket
324 sub = ctx.socket(zmq.SUB)
320 sub = ctx.socket(zmq.SUB)
325 sub.setsockopt(zmq.SUBSCRIBE, b"")
321 sub.setsockopt(zmq.SUBSCRIBE, b"")
326 sub.bind(self.monitor_url)
322 sub.bind(self.monitor_url)
327 sub.bind('inproc://monitor')
323 sub.bind('inproc://monitor')
328 sub = ZMQStream(sub, loop)
324 sub = ZMQStream(sub, loop)
329
325
330 # connect the db
326 # connect the db
331 db_class = _db_shortcuts.get(self.db_class.lower(), self.db_class)
327 db_class = _db_shortcuts.get(self.db_class.lower(), self.db_class)
332 self.log.info('Hub using DB backend: %r', (db_class.split('.')[-1]))
328 self.log.info('Hub using DB backend: %r', (db_class.split('.')[-1]))
333 self.db = import_item(str(db_class))(session=self.session.session,
329 self.db = import_item(str(db_class))(session=self.session.session,
334 parent=self, log=self.log)
330 parent=self, log=self.log)
335 time.sleep(.25)
331 time.sleep(.25)
336
332
337 # resubmit stream
333 # resubmit stream
338 r = ZMQStream(ctx.socket(zmq.DEALER), loop)
334 r = ZMQStream(ctx.socket(zmq.DEALER), loop)
339 url = util.disambiguate_url(self.client_url('task'))
335 url = util.disambiguate_url(self.client_url('task'))
340 r.connect(url)
336 r.connect(url)
341
337
342 # convert seconds to msec
343 registration_timeout = 1000*self.registration_timeout
344
345 self.hub = Hub(loop=loop, session=self.session, monitor=sub, heartmonitor=self.heartmonitor,
338 self.hub = Hub(loop=loop, session=self.session, monitor=sub, heartmonitor=self.heartmonitor,
346 query=q, notifier=n, resubmit=r, db=self.db,
339 query=q, notifier=n, resubmit=r, db=self.db,
347 engine_info=self.engine_info, client_info=self.client_info,
340 engine_info=self.engine_info, client_info=self.client_info,
348 log=self.log, registration_timeout=registration_timeout)
341 log=self.log, registration_timeout=self.registration_timeout)
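# Editor's note (illustrative): the 1000x seconds-to-msec conversion above is
# dropped because DelayedCallback measured delays in milliseconds, while
# loop.add_timeout (used in register_engine below) takes a deadline in
# seconds: loop.time() + self.registration_timeout.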
349
342
350
343
351 class Hub(SessionFactory):
344 class Hub(SessionFactory):
352 """The IPython Controller Hub with 0MQ connections
345 """The IPython Controller Hub with 0MQ connections
353
346
354 Parameters
347 Parameters
355 ==========
348 ==========
356 loop: zmq IOLoop instance
349 loop: zmq IOLoop instance
357 session: Session object
350 session: Session object
358 <removed> context: zmq context for creating new connections (?)
351 <removed> context: zmq context for creating new connections (?)
359 queue: ZMQStream for monitoring the command queue (SUB)
352 queue: ZMQStream for monitoring the command queue (SUB)
360 query: ZMQStream for engine registration and client query requests (ROUTER)
353 query: ZMQStream for engine registration and client query requests (ROUTER)
361 heartbeat: HeartMonitor object checking the pulse of the engines
354 heartbeat: HeartMonitor object checking the pulse of the engines
362 notifier: ZMQStream for broadcasting engine registration changes (PUB)
355 notifier: ZMQStream for broadcasting engine registration changes (PUB)
363 db: connection to db for out-of-memory logging of commands
356 db: connection to db for out-of-memory logging of commands
364 NotImplemented
357 NotImplemented
365 engine_info: dict of zmq connection information for engines to connect
358 engine_info: dict of zmq connection information for engines to connect
366 to the queues.
359 to the queues.
367 client_info: dict of zmq connection information for clients to connect
360 client_info: dict of zmq connection information for clients to connect
368 to the queues.
361 to the queues.
369 """
362 """
370
363
371 engine_state_file = Unicode()
364 engine_state_file = Unicode()
372
365
373 # internal data structures:
366 # internal data structures:
374 ids=Set() # engine IDs
367 ids=Set() # engine IDs
375 keytable=Dict()
368 keytable=Dict()
376 by_ident=Dict()
369 by_ident=Dict()
377 engines=Dict()
370 engines=Dict()
378 clients=Dict()
371 clients=Dict()
379 hearts=Dict()
372 hearts=Dict()
380 pending=Set()
373 pending=Set()
381 queues=Dict() # pending msg_ids keyed by engine_id
374 queues=Dict() # pending msg_ids keyed by engine_id
382 tasks=Dict() # pending msg_ids submitted as tasks, keyed by client_id
375 tasks=Dict() # pending msg_ids submitted as tasks, keyed by client_id
383 completed=Dict() # completed msg_ids keyed by engine_id
376 completed=Dict() # completed msg_ids keyed by engine_id
384 all_completed=Set() # set of all completed msg_ids
377 all_completed=Set() # set of all completed msg_ids
385 dead_engines=Set() # set of uuids of dead engines
378 dead_engines=Set() # set of uuids of dead engines
386 unassigned=Set() # set of task msg_ids not yet assigned a destination
379 unassigned=Set() # set of task msg_ids not yet assigned a destination
387 incoming_registrations=Dict()
380 incoming_registrations=Dict()
388 registration_timeout=Integer()
381 registration_timeout=Integer()
389 _idcounter=Integer(0)
382 _idcounter=Integer(0)
390
383
391 # objects from constructor:
384 # objects from constructor:
392 query=Instance(ZMQStream)
385 query=Instance(ZMQStream)
393 monitor=Instance(ZMQStream)
386 monitor=Instance(ZMQStream)
394 notifier=Instance(ZMQStream)
387 notifier=Instance(ZMQStream)
395 resubmit=Instance(ZMQStream)
388 resubmit=Instance(ZMQStream)
396 heartmonitor=Instance(HeartMonitor)
389 heartmonitor=Instance(HeartMonitor)
397 db=Instance(object)
390 db=Instance(object)
398 client_info=Dict()
391 client_info=Dict()
399 engine_info=Dict()
392 engine_info=Dict()
400
393
401
394
402 def __init__(self, **kwargs):
395 def __init__(self, **kwargs):
403 """
396 """
404 # universal:
397 # universal:
405 loop: IOLoop for creating future connections
398 loop: IOLoop for creating future connections
406 session: streamsession for sending serialized data
399 session: streamsession for sending serialized data
407 # engine:
400 # engine:
408 queue: ZMQStream for monitoring queue messages
401 queue: ZMQStream for monitoring queue messages
409 query: ZMQStream for engine+client registration and client requests
402 query: ZMQStream for engine+client registration and client requests
410 heartbeat: HeartMonitor object for tracking engines
403 heartbeat: HeartMonitor object for tracking engines
411 # extra:
404 # extra:
412 db: ZMQStream for db connection (NotImplemented)
405 db: ZMQStream for db connection (NotImplemented)
413 engine_info: zmq address/protocol dict for engine connections
406 engine_info: zmq address/protocol dict for engine connections
414 client_info: zmq address/protocol dict for client connections
407 client_info: zmq address/protocol dict for client connections
415 """
408 """
416
409
417 super(Hub, self).__init__(**kwargs)
410 super(Hub, self).__init__(**kwargs)
418
411
419 # register our callbacks
412 # register our callbacks
420 self.query.on_recv(self.dispatch_query)
413 self.query.on_recv(self.dispatch_query)
421 self.monitor.on_recv(self.dispatch_monitor_traffic)
414 self.monitor.on_recv(self.dispatch_monitor_traffic)
422
415
423 self.heartmonitor.add_heart_failure_handler(self.handle_heart_failure)
416 self.heartmonitor.add_heart_failure_handler(self.handle_heart_failure)
424 self.heartmonitor.add_new_heart_handler(self.handle_new_heart)
417 self.heartmonitor.add_new_heart_handler(self.handle_new_heart)
425
418
426 self.monitor_handlers = {b'in' : self.save_queue_request,
419 self.monitor_handlers = {b'in' : self.save_queue_request,
427 b'out': self.save_queue_result,
420 b'out': self.save_queue_result,
428 b'intask': self.save_task_request,
421 b'intask': self.save_task_request,
429 b'outtask': self.save_task_result,
422 b'outtask': self.save_task_result,
430 b'tracktask': self.save_task_destination,
423 b'tracktask': self.save_task_destination,
431 b'incontrol': _passer,
424 b'incontrol': _passer,
432 b'outcontrol': _passer,
425 b'outcontrol': _passer,
433 b'iopub': self.save_iopub_message,
426 b'iopub': self.save_iopub_message,
434 }
427 }
435
428
436 self.query_handlers = {'queue_request': self.queue_status,
429 self.query_handlers = {'queue_request': self.queue_status,
437 'result_request': self.get_results,
430 'result_request': self.get_results,
438 'history_request': self.get_history,
431 'history_request': self.get_history,
439 'db_request': self.db_query,
432 'db_request': self.db_query,
440 'purge_request': self.purge_results,
433 'purge_request': self.purge_results,
441 'load_request': self.check_load,
434 'load_request': self.check_load,
442 'resubmit_request': self.resubmit_task,
435 'resubmit_request': self.resubmit_task,
443 'shutdown_request': self.shutdown_request,
436 'shutdown_request': self.shutdown_request,
444 'registration_request' : self.register_engine,
437 'registration_request' : self.register_engine,
445 'unregistration_request' : self.unregister_engine,
438 'unregistration_request' : self.unregister_engine,
446 'connection_request': self.connection_request,
439 'connection_request': self.connection_request,
447 }
440 }
448
441
449 # ignore resubmit replies
442 # ignore resubmit replies
450 self.resubmit.on_recv(lambda msg: None, copy=False)
443 self.resubmit.on_recv(lambda msg: None, copy=False)
451
444
452 self.log.info("hub::created hub")
445 self.log.info("hub::created hub")
453
446
454 @property
447 @property
455 def _next_id(self):
448 def _next_id(self):
456 """gemerate a new ID.
449 """gemerate a new ID.
457
450
458 No longer reuse old ids, just count from 0."""
451 No longer reuse old ids, just count from 0."""
459 newid = self._idcounter
452 newid = self._idcounter
460 self._idcounter += 1
453 self._idcounter += 1
461 return newid
454 return newid
462 # newid = 0
455 # newid = 0
463 # incoming = [id[0] for id in itervalues(self.incoming_registrations)]
456 # incoming = [id[0] for id in itervalues(self.incoming_registrations)]
464 # # print newid, self.ids, self.incoming_registrations
457 # # print newid, self.ids, self.incoming_registrations
465 # while newid in self.ids or newid in incoming:
458 # while newid in self.ids or newid in incoming:
466 # newid += 1
459 # newid += 1
467 # return newid
460 # return newid
468
461
469 #-----------------------------------------------------------------------------
462 #-----------------------------------------------------------------------------
470 # message validation
463 # message validation
471 #-----------------------------------------------------------------------------
464 #-----------------------------------------------------------------------------
472
465
473 def _validate_targets(self, targets):
466 def _validate_targets(self, targets):
474 """turn any valid targets argument into a list of integer ids"""
467 """turn any valid targets argument into a list of integer ids"""
475 if targets is None:
468 if targets is None:
476 # default to all
469 # default to all
477 return self.ids
470 return self.ids
478
471
479 if isinstance(targets, (int,str,unicode_type)):
472 if isinstance(targets, (int,str,unicode_type)):
480 # only one target specified
473 # only one target specified
481 targets = [targets]
474 targets = [targets]
482 _targets = []
475 _targets = []
483 for t in targets:
476 for t in targets:
484 # map raw identities to ids
477 # map raw identities to ids
485 if isinstance(t, (str,unicode_type)):
478 if isinstance(t, (str,unicode_type)):
486 t = self.by_ident.get(cast_bytes(t), t)
479 t = self.by_ident.get(cast_bytes(t), t)
487 _targets.append(t)
480 _targets.append(t)
488 targets = _targets
481 targets = _targets
489 bad_targets = [ t for t in targets if t not in self.ids ]
482 bad_targets = [ t for t in targets if t not in self.ids ]
490 if bad_targets:
483 if bad_targets:
491 raise IndexError("No Such Engine: %r" % bad_targets)
484 raise IndexError("No Such Engine: %r" % bad_targets)
492 if not targets:
485 if not targets:
493 raise IndexError("No Engines Registered")
486 raise IndexError("No Engines Registered")
494 return targets
487 return targets
495
488
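# Editor's sketch (illustrative): _validate_targets normalizes any of
#     None            -> all registered engine ids
#     3               -> [3]
#     'engine-uuid'   -> [eid] via the by_ident lookup
#     [0, 'uuid', 2]  -> [0, eid, 2]
# and raises IndexError for unknown engines or an empty registry.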
496 #-----------------------------------------------------------------------------
489 #-----------------------------------------------------------------------------
497 # dispatch methods (1 per stream)
490 # dispatch methods (1 per stream)
498 #-----------------------------------------------------------------------------
491 #-----------------------------------------------------------------------------
499
492
500
493
501 @util.log_errors
494 @util.log_errors
502 def dispatch_monitor_traffic(self, msg):
495 def dispatch_monitor_traffic(self, msg):
503 """all ME and Task queue messages come through here, as well as
496 """all ME and Task queue messages come through here, as well as
504 IOPub traffic."""
497 IOPub traffic."""
505 self.log.debug("monitor traffic: %r", msg[0])
498 self.log.debug("monitor traffic: %r", msg[0])
506 switch = msg[0]
499 switch = msg[0]
507 try:
500 try:
508 idents, msg = self.session.feed_identities(msg[1:])
501 idents, msg = self.session.feed_identities(msg[1:])
509 except ValueError:
502 except ValueError:
510 idents=[]
503 idents=[]
511 if not idents:
504 if not idents:
512 self.log.error("Monitor message without topic: %r", msg)
505 self.log.error("Monitor message without topic: %r", msg)
513 return
506 return
514 handler = self.monitor_handlers.get(switch, None)
507 handler = self.monitor_handlers.get(switch, None)
515 if handler is not None:
508 if handler is not None:
516 handler(idents, msg)
509 handler(idents, msg)
517 else:
510 else:
518 self.log.error("Unrecognized monitor topic: %r", switch)
511 self.log.error("Unrecognized monitor topic: %r", switch)
519
512
520
513
521 @util.log_errors
514 @util.log_errors
522 def dispatch_query(self, msg):
515 def dispatch_query(self, msg):
523 """Route registration requests and queries from clients."""
516 """Route registration requests and queries from clients."""
524 try:
517 try:
525 idents, msg = self.session.feed_identities(msg)
518 idents, msg = self.session.feed_identities(msg)
526 except ValueError:
519 except ValueError:
527 idents = []
520 idents = []
528 if not idents:
521 if not idents:
529 self.log.error("Bad Query Message: %r", msg)
522 self.log.error("Bad Query Message: %r", msg)
530 return
523 return
531 client_id = idents[0]
524 client_id = idents[0]
532 try:
525 try:
533 msg = self.session.unserialize(msg, content=True)
526 msg = self.session.unserialize(msg, content=True)
534 except Exception:
527 except Exception:
535 content = error.wrap_exception()
528 content = error.wrap_exception()
536 self.log.error("Bad Query Message: %r", msg, exc_info=True)
529 self.log.error("Bad Query Message: %r", msg, exc_info=True)
537 self.session.send(self.query, "hub_error", ident=client_id,
530 self.session.send(self.query, "hub_error", ident=client_id,
538 content=content)
531 content=content)
539 return
532 return
540 # print client_id, header, parent, content
533 # print client_id, header, parent, content
541 #switch on message type:
534 #switch on message type:
542 msg_type = msg['header']['msg_type']
535 msg_type = msg['header']['msg_type']
543 self.log.info("client::client %r requested %r", client_id, msg_type)
536 self.log.info("client::client %r requested %r", client_id, msg_type)
544 handler = self.query_handlers.get(msg_type, None)
537 handler = self.query_handlers.get(msg_type, None)
545 try:
538 try:
546 assert handler is not None, "Bad Message Type: %r" % msg_type
539 assert handler is not None, "Bad Message Type: %r" % msg_type
547 except:
540 except:
548 content = error.wrap_exception()
541 content = error.wrap_exception()
549 self.log.error("Bad Message Type: %r", msg_type, exc_info=True)
542 self.log.error("Bad Message Type: %r", msg_type, exc_info=True)
550 self.session.send(self.query, "hub_error", ident=client_id,
543 self.session.send(self.query, "hub_error", ident=client_id,
551 content=content)
544 content=content)
552 return
545 return
553
546
554 else:
547 else:
555 handler(idents, msg)
548 handler(idents, msg)
556
549
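# Editor's sketch (illustrative): a 'registration_request' arriving on the
# query stream is unpacked by feed_identities/unserialize above and routed as
#     handler = self.query_handlers['registration_request']
#     handler(idents, msg)   # -> self.register_engine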
557 def dispatch_db(self, msg):
550 def dispatch_db(self, msg):
558 """"""
551 """"""
559 raise NotImplementedError
552 raise NotImplementedError
560
553
561 #---------------------------------------------------------------------------
554 #---------------------------------------------------------------------------
562 # handler methods (1 per event)
555 # handler methods (1 per event)
563 #---------------------------------------------------------------------------
556 #---------------------------------------------------------------------------
564
557
565 #----------------------- Heartbeat --------------------------------------
558 #----------------------- Heartbeat --------------------------------------
566
559
567 def handle_new_heart(self, heart):
560 def handle_new_heart(self, heart):
568 """handler to attach to heartbeater.
561 """handler to attach to heartbeater.
569 Called when a new heart starts to beat.
562 Called when a new heart starts to beat.
570 Triggers completion of registration."""
563 Triggers completion of registration."""
571 self.log.debug("heartbeat::handle_new_heart(%r)", heart)
564 self.log.debug("heartbeat::handle_new_heart(%r)", heart)
572 if heart not in self.incoming_registrations:
565 if heart not in self.incoming_registrations:
573 self.log.info("heartbeat::ignoring new heart: %r", heart)
566 self.log.info("heartbeat::ignoring new heart: %r", heart)
574 else:
567 else:
575 self.finish_registration(heart)
568 self.finish_registration(heart)
576
569
577
570
578 def handle_heart_failure(self, heart):
571 def handle_heart_failure(self, heart):
579 """handler to attach to heartbeater.
572 """handler to attach to heartbeater.
580 called when a previously registered heart fails to respond to beat request.
573 called when a previously registered heart fails to respond to beat request.
581 triggers unregistration"""
574 triggers unregistration"""
582 self.log.debug("heartbeat::handle_heart_failure(%r)", heart)
575 self.log.debug("heartbeat::handle_heart_failure(%r)", heart)
583 eid = self.hearts.get(heart, None)
576 eid = self.hearts.get(heart, None)
584 if eid is None or self.keytable[eid] in self.dead_engines:
577 if eid is None or self.keytable[eid] in self.dead_engines:
585 self.log.info("heartbeat::ignoring heart failure %r (not an engine or already dead)", heart)
578 self.log.info("heartbeat::ignoring heart failure %r (not an engine or already dead)", heart)
586 else:
579 else:
587 uuid = self.engines[eid].uuid
580 uuid = self.engines[eid].uuid
588 self.unregister_engine(heart, dict(content=dict(id=eid, queue=uuid)))
581 self.unregister_engine(heart, dict(content=dict(id=eid, queue=uuid)))
589
582
590 #----------------------- MUX Queue Traffic ------------------------------
583 #----------------------- MUX Queue Traffic ------------------------------
591
584
592 def save_queue_request(self, idents, msg):
585 def save_queue_request(self, idents, msg):
593 if len(idents) < 2:
586 if len(idents) < 2:
594 self.log.error("invalid identity prefix: %r", idents)
587 self.log.error("invalid identity prefix: %r", idents)
595 return
588 return
596 queue_id, client_id = idents[:2]
589 queue_id, client_id = idents[:2]
597 try:
590 try:
598 msg = self.session.unserialize(msg)
591 msg = self.session.unserialize(msg)
599 except Exception:
592 except Exception:
600 self.log.error("queue::client %r sent invalid message to %r: %r", client_id, queue_id, msg, exc_info=True)
593 self.log.error("queue::client %r sent invalid message to %r: %r", client_id, queue_id, msg, exc_info=True)
601 return
594 return
602
595
603 eid = self.by_ident.get(queue_id, None)
596 eid = self.by_ident.get(queue_id, None)
604 if eid is None:
597 if eid is None:
605 self.log.error("queue::target %r not registered", queue_id)
598 self.log.error("queue::target %r not registered", queue_id)
606 self.log.debug("queue:: valid are: %r", self.by_ident.keys())
599 self.log.debug("queue:: valid are: %r", self.by_ident.keys())
607 return
600 return
608 record = init_record(msg)
601 record = init_record(msg)
609 msg_id = record['msg_id']
602 msg_id = record['msg_id']
610 self.log.info("queue::client %r submitted request %r to %s", client_id, msg_id, eid)
603 self.log.info("queue::client %r submitted request %r to %s", client_id, msg_id, eid)
611 # Unicode in records
604 # Unicode in records
612 record['engine_uuid'] = queue_id.decode('ascii')
605 record['engine_uuid'] = queue_id.decode('ascii')
613 record['client_uuid'] = msg['header']['session']
606 record['client_uuid'] = msg['header']['session']
614 record['queue'] = 'mux'
607 record['queue'] = 'mux'
615
608
616 try:
609 try:
617 # it's possible iopub arrived first:
610 # it's possible iopub arrived first:
618 existing = self.db.get_record(msg_id)
611 existing = self.db.get_record(msg_id)
619 for key,evalue in iteritems(existing):
612 for key,evalue in iteritems(existing):
620 rvalue = record.get(key, None)
613 rvalue = record.get(key, None)
621 if evalue and rvalue and evalue != rvalue:
614 if evalue and rvalue and evalue != rvalue:
622 self.log.warn("conflicting initial state for record: %r:%r <%r> %r", msg_id, rvalue, key, evalue)
615 self.log.warn("conflicting initial state for record: %r:%r <%r> %r", msg_id, rvalue, key, evalue)
623 elif evalue and not rvalue:
616 elif evalue and not rvalue:
624 record[key] = evalue
617 record[key] = evalue
625 try:
618 try:
626 self.db.update_record(msg_id, record)
619 self.db.update_record(msg_id, record)
627 except Exception:
620 except Exception:
628 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
621 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
629 except KeyError:
622 except KeyError:
630 try:
623 try:
631 self.db.add_record(msg_id, record)
624 self.db.add_record(msg_id, record)
632 except Exception:
625 except Exception:
633 self.log.error("DB Error adding record %r", msg_id, exc_info=True)
626 self.log.error("DB Error adding record %r", msg_id, exc_info=True)
634
627
635
628
636 self.pending.add(msg_id)
629 self.pending.add(msg_id)
637 self.queues[eid].append(msg_id)
630 self.queues[eid].append(msg_id)
638
631
639 def save_queue_result(self, idents, msg):
632 def save_queue_result(self, idents, msg):
640 if len(idents) < 2:
633 if len(idents) < 2:
641 self.log.error("invalid identity prefix: %r", idents)
634 self.log.error("invalid identity prefix: %r", idents)
642 return
635 return
643
636
644 client_id, queue_id = idents[:2]
637 client_id, queue_id = idents[:2]
645 try:
638 try:
646 msg = self.session.unserialize(msg)
639 msg = self.session.unserialize(msg)
647 except Exception:
640 except Exception:
648 self.log.error("queue::engine %r sent invalid message to %r: %r",
641 self.log.error("queue::engine %r sent invalid message to %r: %r",
649 queue_id, client_id, msg, exc_info=True)
642 queue_id, client_id, msg, exc_info=True)
650 return
643 return
651
644
652 eid = self.by_ident.get(queue_id, None)
645 eid = self.by_ident.get(queue_id, None)
653 if eid is None:
646 if eid is None:
654 self.log.error("queue::unknown engine %r is sending a reply: ", queue_id)
647 self.log.error("queue::unknown engine %r is sending a reply: ", queue_id)
655 return
648 return
656
649
657 parent = msg['parent_header']
650 parent = msg['parent_header']
658 if not parent:
651 if not parent:
659 return
652 return
660 msg_id = parent['msg_id']
653 msg_id = parent['msg_id']
661 if msg_id in self.pending:
654 if msg_id in self.pending:
662 self.pending.remove(msg_id)
655 self.pending.remove(msg_id)
663 self.all_completed.add(msg_id)
656 self.all_completed.add(msg_id)
664 self.queues[eid].remove(msg_id)
657 self.queues[eid].remove(msg_id)
665 self.completed[eid].append(msg_id)
658 self.completed[eid].append(msg_id)
666 self.log.info("queue::request %r completed on %s", msg_id, eid)
659 self.log.info("queue::request %r completed on %s", msg_id, eid)
667 elif msg_id not in self.all_completed:
660 elif msg_id not in self.all_completed:
668 # it could be a result from a dead engine that died before delivering the
661 # it could be a result from a dead engine that died before delivering the
669 # result
662 # result
670 self.log.warn("queue:: unknown msg finished %r", msg_id)
663 self.log.warn("queue:: unknown msg finished %r", msg_id)
671 return
664 return
672 # update record anyway, because the unregistration could have been premature
665 # update record anyway, because the unregistration could have been premature
673 rheader = msg['header']
666 rheader = msg['header']
674 md = msg['metadata']
667 md = msg['metadata']
675 completed = rheader['date']
668 completed = rheader['date']
676 started = extract_dates(md.get('started', None))
669 started = extract_dates(md.get('started', None))
677 result = {
670 result = {
678 'result_header' : rheader,
671 'result_header' : rheader,
679 'result_metadata': md,
672 'result_metadata': md,
680 'result_content': msg['content'],
673 'result_content': msg['content'],
681 'received': datetime.now(),
674 'received': datetime.now(),
682 'started' : started,
675 'started' : started,
683 'completed' : completed
676 'completed' : completed
684 }
677 }
685
678
686 result['result_buffers'] = msg['buffers']
679 result['result_buffers'] = msg['buffers']
687 try:
680 try:
688 self.db.update_record(msg_id, result)
681 self.db.update_record(msg_id, result)
689 except Exception:
682 except Exception:
690 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
683 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
691
684
692
685
693 #--------------------- Task Queue Traffic ------------------------------
686 #--------------------- Task Queue Traffic ------------------------------
694
687
695 def save_task_request(self, idents, msg):
688 def save_task_request(self, idents, msg):
696 """Save the submission of a task."""
689 """Save the submission of a task."""
697 client_id = idents[0]
690 client_id = idents[0]
698
691
699 try:
692 try:
700 msg = self.session.unserialize(msg)
693 msg = self.session.unserialize(msg)
701 except Exception:
694 except Exception:
702 self.log.error("task::client %r sent invalid task message: %r",
695 self.log.error("task::client %r sent invalid task message: %r",
703 client_id, msg, exc_info=True)
696 client_id, msg, exc_info=True)
704 return
697 return
705 record = init_record(msg)
698 record = init_record(msg)
706
699
707 record['client_uuid'] = msg['header']['session']
700 record['client_uuid'] = msg['header']['session']
708 record['queue'] = 'task'
701 record['queue'] = 'task'
709 header = msg['header']
702 header = msg['header']
710 msg_id = header['msg_id']
703 msg_id = header['msg_id']
711 self.pending.add(msg_id)
704 self.pending.add(msg_id)
712 self.unassigned.add(msg_id)
705 self.unassigned.add(msg_id)
713 try:
706 try:
714 # it's possible iopub arrived first:
707 # it's possible iopub arrived first:
715 existing = self.db.get_record(msg_id)
708 existing = self.db.get_record(msg_id)
716 if existing['resubmitted']:
709 if existing['resubmitted']:
717 for key in ('submitted', 'client_uuid', 'buffers'):
710 for key in ('submitted', 'client_uuid', 'buffers'):
718 # don't clobber these keys on resubmit
711 # don't clobber these keys on resubmit
719 # submitted and client_uuid should be different
712 # submitted and client_uuid should be different
720 # and buffers might be big, and shouldn't have changed
713 # and buffers might be big, and shouldn't have changed
721 record.pop(key)
714 record.pop(key)
722 # still check content, header which should not change
715 # still check content, header which should not change
723 # but are not as expensive to compare as buffers
716 # but are not as expensive to compare as buffers
724
717
725 for key,evalue in iteritems(existing):
718 for key,evalue in iteritems(existing):
726 if key.endswith('buffers'):
719 if key.endswith('buffers'):
727 # don't compare buffers
720 # don't compare buffers
728 continue
721 continue
729 rvalue = record.get(key, None)
722 rvalue = record.get(key, None)
730 if evalue and rvalue and evalue != rvalue:
723 if evalue and rvalue and evalue != rvalue:
731 self.log.warn("conflicting initial state for record: %r:%r <%r> %r", msg_id, rvalue, key, evalue)
724 self.log.warn("conflicting initial state for record: %r:%r <%r> %r", msg_id, rvalue, key, evalue)
732 elif evalue and not rvalue:
725 elif evalue and not rvalue:
733 record[key] = evalue
726 record[key] = evalue
734 try:
727 try:
735 self.db.update_record(msg_id, record)
728 self.db.update_record(msg_id, record)
736 except Exception:
729 except Exception:
737 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
730 self.log.error("DB Error updating record %r", msg_id, exc_info=True)
738 except KeyError:
731 except KeyError:
739 try:
732 try:
740 self.db.add_record(msg_id, record)
733 self.db.add_record(msg_id, record)
741 except Exception:
734 except Exception:
742 self.log.error("DB Error adding record %r", msg_id, exc_info=True)
735 self.log.error("DB Error adding record %r", msg_id, exc_info=True)
743 except Exception:
736 except Exception:
744 self.log.error("DB Error saving task request %r", msg_id, exc_info=True)
737 self.log.error("DB Error saving task request %r", msg_id, exc_info=True)
745
738
746 def save_task_result(self, idents, msg):
739 def save_task_result(self, idents, msg):
747 """save the result of a completed task."""
740 """save the result of a completed task."""
748 client_id = idents[0]
741 client_id = idents[0]
749 try:
742 try:
750 msg = self.session.unserialize(msg)
743 msg = self.session.unserialize(msg)
751 except Exception:
744 except Exception:
752 self.log.error("task::invalid task result message send to %r: %r",
745 self.log.error("task::invalid task result message send to %r: %r",
753 client_id, msg, exc_info=True)
746 client_id, msg, exc_info=True)
754 return
747 return
755
748
756 parent = msg['parent_header']
749 parent = msg['parent_header']
757 if not parent:
750 if not parent:
758 # print msg
751 # print msg
759 self.log.warn("Task %r had no parent!", msg)
752 self.log.warn("Task %r had no parent!", msg)
760 return
753 return
761 msg_id = parent['msg_id']
754 msg_id = parent['msg_id']
762 if msg_id in self.unassigned:
755 if msg_id in self.unassigned:
763 self.unassigned.remove(msg_id)
756 self.unassigned.remove(msg_id)
764
757
765 header = msg['header']
758 header = msg['header']
766 md = msg['metadata']
759 md = msg['metadata']
767 engine_uuid = md.get('engine', u'')
760 engine_uuid = md.get('engine', u'')
768 eid = self.by_ident.get(cast_bytes(engine_uuid), None)
761 eid = self.by_ident.get(cast_bytes(engine_uuid), None)
769
762
770 status = md.get('status', None)
763 status = md.get('status', None)
771
764
772 if msg_id in self.pending:
765 if msg_id in self.pending:
773 self.log.info("task::task %r finished on %s", msg_id, eid)
766 self.log.info("task::task %r finished on %s", msg_id, eid)
774 self.pending.remove(msg_id)
767 self.pending.remove(msg_id)
775 self.all_completed.add(msg_id)
768 self.all_completed.add(msg_id)
776 if eid is not None:
769 if eid is not None:
777 if status != 'aborted':
770 if status != 'aborted':
778 self.completed[eid].append(msg_id)
771 self.completed[eid].append(msg_id)
779 if msg_id in self.tasks[eid]:
772 if msg_id in self.tasks[eid]:
780 self.tasks[eid].remove(msg_id)
773 self.tasks[eid].remove(msg_id)
781 completed = header['date']
774 completed = header['date']
782 started = extract_dates(md.get('started', None))
775 started = extract_dates(md.get('started', None))
783 result = {
776 result = {
784 'result_header' : header,
777 'result_header' : header,
785 'result_metadata': msg['metadata'],
778 'result_metadata': msg['metadata'],
786 'result_content': msg['content'],
779 'result_content': msg['content'],
787 'started' : started,
780 'started' : started,
788 'completed' : completed,
781 'completed' : completed,
789 'received' : datetime.now(),
782 'received' : datetime.now(),
790 'engine_uuid': engine_uuid,
783 'engine_uuid': engine_uuid,
791 }
784 }
792
785
793 result['result_buffers'] = msg['buffers']
786 result['result_buffers'] = msg['buffers']
794 try:
787 try:
795 self.db.update_record(msg_id, result)
788 self.db.update_record(msg_id, result)
796 except Exception:
789 except Exception:
797 self.log.error("DB Error saving task request %r", msg_id, exc_info=True)
790 self.log.error("DB Error saving task request %r", msg_id, exc_info=True)
798
791
799 else:
792 else:
800 self.log.debug("task::unknown task %r finished", msg_id)
793 self.log.debug("task::unknown task %r finished", msg_id)
801
794
802 def save_task_destination(self, idents, msg):
795 def save_task_destination(self, idents, msg):
803 try:
796 try:
804 msg = self.session.unserialize(msg, content=True)
797 msg = self.session.unserialize(msg, content=True)
805 except Exception:
798 except Exception:
806 self.log.error("task::invalid task tracking message", exc_info=True)
799 self.log.error("task::invalid task tracking message", exc_info=True)
807 return
800 return
808 content = msg['content']
801 content = msg['content']
809 # print (content)
802 # print (content)
810 msg_id = content['msg_id']
803 msg_id = content['msg_id']
811 engine_uuid = content['engine_id']
804 engine_uuid = content['engine_id']
812 eid = self.by_ident[cast_bytes(engine_uuid)]
805 eid = self.by_ident[cast_bytes(engine_uuid)]
813
806
814 self.log.info("task::task %r arrived on %r", msg_id, eid)
807 self.log.info("task::task %r arrived on %r", msg_id, eid)
815 if msg_id in self.unassigned:
808 if msg_id in self.unassigned:
816 self.unassigned.remove(msg_id)
809 self.unassigned.remove(msg_id)
817 # else:
810 # else:
818 # self.log.debug("task::task %r not listed as MIA?!"%(msg_id))
811 # self.log.debug("task::task %r not listed as MIA?!"%(msg_id))
819
812
820 self.tasks[eid].append(msg_id)
813 self.tasks[eid].append(msg_id)
821 # self.pending[msg_id][1].update(received=datetime.now(),engine=(eid,engine_uuid))
814 # self.pending[msg_id][1].update(received=datetime.now(),engine=(eid,engine_uuid))
822 try:
815 try:
823 self.db.update_record(msg_id, dict(engine_uuid=engine_uuid))
816 self.db.update_record(msg_id, dict(engine_uuid=engine_uuid))
824 except Exception:
817 except Exception:
825 self.log.error("DB Error saving task destination %r", msg_id, exc_info=True)
818 self.log.error("DB Error saving task destination %r", msg_id, exc_info=True)
826
819
827
820
828 def mia_task_request(self, idents, msg):
821 def mia_task_request(self, idents, msg):
829 raise NotImplementedError
822 raise NotImplementedError
830 client_id = idents[0]
823 client_id = idents[0]
831 # content = dict(mia=self.mia,status='ok')
824 # content = dict(mia=self.mia,status='ok')
832 # self.session.send('mia_reply', content=content, idents=client_id)
825 # self.session.send('mia_reply', content=content, idents=client_id)
833
826
834
827
835 #--------------------- IOPub Traffic ------------------------------
828 #--------------------- IOPub Traffic ------------------------------
836
829
837 def save_iopub_message(self, topics, msg):
830 def save_iopub_message(self, topics, msg):
838 """save an iopub message into the db"""
831 """save an iopub message into the db"""
839 # print (topics)
832 # print (topics)
840 try:
833 try:
841 msg = self.session.unserialize(msg, content=True)
834 msg = self.session.unserialize(msg, content=True)
842 except Exception:
835 except Exception:
843 self.log.error("iopub::invalid IOPub message", exc_info=True)
836 self.log.error("iopub::invalid IOPub message", exc_info=True)
844 return
837 return
845
838
846 parent = msg['parent_header']
839 parent = msg['parent_header']
847 if not parent:
840 if not parent:
848 self.log.debug("iopub::IOPub message lacks parent: %r", msg)
841 self.log.debug("iopub::IOPub message lacks parent: %r", msg)
849 return
842 return
850 msg_id = parent['msg_id']
843 msg_id = parent['msg_id']
851 msg_type = msg['header']['msg_type']
844 msg_type = msg['header']['msg_type']
852 content = msg['content']
845 content = msg['content']
853
846
854 # ensure msg_id is in db
847 # ensure msg_id is in db
855 try:
848 try:
856 rec = self.db.get_record(msg_id)
849 rec = self.db.get_record(msg_id)
857 except KeyError:
850 except KeyError:
858 rec = None
851 rec = None
859
852
860 # stream
853 # stream
861 d = {}
854 d = {}
862 if msg_type == 'stream':
855 if msg_type == 'stream':
863 name = content['name']
856 name = content['name']
864 s = '' if rec is None else rec[name]
857 s = '' if rec is None else rec[name]
865 d[name] = s + content['data']
858 d[name] = s + content['data']
866
859
867 elif msg_type == 'error':
860 elif msg_type == 'error':
868 d['error'] = content
861 d['error'] = content
869 elif msg_type == 'execute_input':
862 elif msg_type == 'execute_input':
870 d['execute_input'] = content['code']
863 d['execute_input'] = content['code']
871 elif msg_type in ('display_data', 'execute_result'):
864 elif msg_type in ('display_data', 'execute_result'):
872 d[msg_type] = content
865 d[msg_type] = content
873 elif msg_type == 'status':
866 elif msg_type == 'status':
874 pass
867 pass
875 elif msg_type == 'data_pub':
868 elif msg_type == 'data_pub':
876 self.log.info("ignored data_pub message for %s" % msg_id)
869 self.log.info("ignored data_pub message for %s" % msg_id)
877 else:
870 else:
878 self.log.warn("unhandled iopub msg_type: %r", msg_type)
871 self.log.warn("unhandled iopub msg_type: %r", msg_type)
879
872
880 if not d:
873 if not d:
881 return
874 return
882
875
883 if rec is None:
876 if rec is None:
884 # new record
877 # new record
885 rec = empty_record()
878 rec = empty_record()
886 rec['msg_id'] = msg_id
879 rec['msg_id'] = msg_id
887 rec.update(d)
880 rec.update(d)
888 d = rec
881 d = rec
889 update_record = self.db.add_record
882 update_record = self.db.add_record
890 else:
883 else:
891 update_record = self.db.update_record
884 update_record = self.db.update_record
892
885
893 try:
886 try:
894 update_record(msg_id, d)
887 update_record(msg_id, d)
895 except Exception:
888 except Exception:
896 self.log.error("DB Error saving iopub message %r", msg_id, exc_info=True)
889 self.log.error("DB Error saving iopub message %r", msg_id, exc_info=True)
897
890
898
891
899
892
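# Editor's sketch (illustrative): successive 'stream' messages for one
# msg_id append to the stored record, so two stdout fragments 'hel' and
# 'lo\n' accumulate via d[name] = s + content['data'] as
#     rec['stdout'] == 'hello\n'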
900 #-------------------------------------------------------------------------
893 #-------------------------------------------------------------------------
901 # Registration requests
894 # Registration requests
902 #-------------------------------------------------------------------------
895 #-------------------------------------------------------------------------
903
896
904 def connection_request(self, client_id, msg):
897 def connection_request(self, client_id, msg):
905 """Reply with connection addresses for clients."""
898 """Reply with connection addresses for clients."""
906 self.log.info("client::client %r connected", client_id)
899 self.log.info("client::client %r connected", client_id)
907 content = dict(status='ok')
900 content = dict(status='ok')
908 jsonable = {}
901 jsonable = {}
909 for k,v in iteritems(self.keytable):
902 for k,v in iteritems(self.keytable):
910 if v not in self.dead_engines:
903 if v not in self.dead_engines:
911 jsonable[str(k)] = v
904 jsonable[str(k)] = v
912 content['engines'] = jsonable
905 content['engines'] = jsonable
913 self.session.send(self.query, 'connection_reply', content, parent=msg, ident=client_id)
906 self.session.send(self.query, 'connection_reply', content, parent=msg, ident=client_id)
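For orientation, the connection_reply content assembled above is a plain dict keyed by stringified engine id; with one live engine it might look like this (the UUID value is illustrative):

    {'status': 'ok',
     'engines': {'0': '<uuid-of-engine-0>'}}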
914
907
915 def register_engine(self, reg, msg):
908 def register_engine(self, reg, msg):
916 """Register a new engine."""
909 """Register a new engine."""
917 content = msg['content']
910 content = msg['content']
918 try:
911 try:
919 uuid = content['uuid']
912 uuid = content['uuid']
920 except KeyError:
913 except KeyError:
921 self.log.error("registration::uuid not specified", exc_info=True)
914 self.log.error("registration::uuid not specified", exc_info=True)
922 return
915 return
923
916
924 eid = self._next_id
917 eid = self._next_id
925
918
926 self.log.debug("registration::register_engine(%i, %r)", eid, uuid)
919 self.log.debug("registration::register_engine(%i, %r)", eid, uuid)
927
920
928 content = dict(id=eid,status='ok',hb_period=self.heartmonitor.period)
921 content = dict(id=eid,status='ok',hb_period=self.heartmonitor.period)
929 # check if requesting available IDs:
922 # check if requesting available IDs:
930 if cast_bytes(uuid) in self.by_ident:
923 if cast_bytes(uuid) in self.by_ident:
931 try:
924 try:
932 raise KeyError("uuid %r in use" % uuid)
925 raise KeyError("uuid %r in use" % uuid)
933 except:
926 except:
934 content = error.wrap_exception()
927 content = error.wrap_exception()
935 self.log.error("uuid %r in use", uuid, exc_info=True)
928 self.log.error("uuid %r in use", uuid, exc_info=True)
936 else:
929 else:
937 for h, ec in iteritems(self.incoming_registrations):
930 for h, ec in iteritems(self.incoming_registrations):
938 if uuid == h:
931 if uuid == h:
939 try:
932 try:
940 raise KeyError("heart_id %r in use" % uuid)
933 raise KeyError("heart_id %r in use" % uuid)
941 except:
934 except:
942 self.log.error("heart_id %r in use", uuid, exc_info=True)
935 self.log.error("heart_id %r in use", uuid, exc_info=True)
943 content = error.wrap_exception()
936 content = error.wrap_exception()
944 break
937 break
945 elif uuid == ec.uuid:
938 elif uuid == ec.uuid:
946 try:
939 try:
947 raise KeyError("uuid %r in use" % uuid)
940 raise KeyError("uuid %r in use" % uuid)
948 except:
941 except:
949 self.log.error("uuid %r in use", uuid, exc_info=True)
942 self.log.error("uuid %r in use", uuid, exc_info=True)
950 content = error.wrap_exception()
943 content = error.wrap_exception()
951 break
944 break
952
945
953 msg = self.session.send(self.query, "registration_reply",
946 msg = self.session.send(self.query, "registration_reply",
954 content=content,
947 content=content,
955 ident=reg)
948 ident=reg)
956
949
957 heart = cast_bytes(uuid)
950 heart = cast_bytes(uuid)
958
951
959 if content['status'] == 'ok':
952 if content['status'] == 'ok':
960 if heart in self.heartmonitor.hearts:
953 if heart in self.heartmonitor.hearts:
961 # already beating
954 # already beating
962 self.incoming_registrations[heart] = EngineConnector(id=eid,uuid=uuid)
955 self.incoming_registrations[heart] = EngineConnector(id=eid,uuid=uuid)
963 self.finish_registration(heart)
956 self.finish_registration(heart)
964 else:
957 else:
965 purge = lambda : self._purge_stalled_registration(heart)
958 purge = lambda : self._purge_stalled_registration(heart)
966 dc = ioloop.DelayedCallback(purge, self.registration_timeout, self.loop)
967 dc.start()
968 self.incoming_registrations[heart] = EngineConnector(id=eid,uuid=uuid,stallback=dc)
959 t = self.loop.add_timeout(
960 self.loop.time() + self.registration_timeout,
961 purge,
962 )
963 self.incoming_registrations[heart] = EngineConnector(id=eid,uuid=uuid,stallback=t)
969 else:
964 else:
970 self.log.error("registration::registration %i failed: %r", eid, content['evalue'])
965 self.log.error("registration::registration %i failed: %r", eid, content['evalue'])
971
966
972 return eid
967 return eid
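The stallback change above is the point of this commit: zmq's deprecated ioloop.DelayedCallback bundles a callback with a relative delay in milliseconds and is scheduled and cancelled via dc.start()/dc.stop(), while tornado's IOLoop.add_timeout takes an absolute deadline in seconds and returns a handle that IOLoop.remove_timeout cancels. A minimal sketch of the migration, with a hypothetical purge callback standing in for _purge_stalled_registration:

    from zmq.eventloop.ioloop import IOLoop

    loop = IOLoop.instance()

    def purge():
        # hypothetical stand-in for Hub._purge_stalled_registration(heart)
        print("purging stalled registration")

    # before (deprecated): relative delay, in milliseconds
    #   dc = ioloop.DelayedCallback(purge, 5000, loop)
    #   dc.start()   # schedule
    #   dc.stop()    # cancel
    # after: absolute deadline, in seconds, with a cancellation handle
    handle = loop.add_timeout(loop.time() + 5, purge)
    loop.remove_timeout(handle)  # the replacement for dc.stop()

Note the unit change: 5000 ms delays become loop.time() + 5 second deadlines, as in the shutdown and scheduler hunks below.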
973
968
974 def unregister_engine(self, ident, msg):
969 def unregister_engine(self, ident, msg):
975 """Unregister an engine that explicitly requested to leave."""
970 """Unregister an engine that explicitly requested to leave."""
976 try:
971 try:
977 eid = msg['content']['id']
972 eid = msg['content']['id']
978 except:
973 except:
979 self.log.error("registration::bad engine id for unregistration: %r", ident, exc_info=True)
974 self.log.error("registration::bad engine id for unregistration: %r", ident, exc_info=True)
980 return
975 return
981 self.log.info("registration::unregister_engine(%r)", eid)
976 self.log.info("registration::unregister_engine(%r)", eid)
982 # print (eid)
977
983 uuid = self.keytable[eid]
978 uuid = self.keytable[eid]
984 content=dict(id=eid, uuid=uuid)
979 content=dict(id=eid, uuid=uuid)
985 self.dead_engines.add(uuid)
980 self.dead_engines.add(uuid)
986 # self.ids.remove(eid)
987 # uuid = self.keytable.pop(eid)
988 #
989 # ec = self.engines.pop(eid)
990 # self.hearts.pop(ec.heartbeat)
991 # self.by_ident.pop(ec.queue)
992 # self.completed.pop(eid)
993 handleit = lambda : self._handle_stranded_msgs(eid, uuid)
994 dc = ioloop.DelayedCallback(handleit, self.registration_timeout, self.loop)
995 dc.start()
981
982 self.loop.add_timeout(
983 self.loop.time() + self.registration_timeout,
984 lambda : self._handle_stranded_msgs(eid, uuid),
985 )
996 ############## TODO: HANDLE IT ################
986 ############## TODO: HANDLE IT ################
997
987
998 self._save_engine_state()
988 self._save_engine_state()
999
989
1000 if self.notifier:
990 if self.notifier:
1001 self.session.send(self.notifier, "unregistration_notification", content=content)
991 self.session.send(self.notifier, "unregistration_notification", content=content)
1002
992
1003 def _handle_stranded_msgs(self, eid, uuid):
993 def _handle_stranded_msgs(self, eid, uuid):
1004 """Handle messages known to be on an engine when the engine unregisters.
994 """Handle messages known to be on an engine when the engine unregisters.
1005
995
1006 It is possible that this will fire prematurely - that is, an engine will
996 It is possible that this will fire prematurely - that is, an engine will
1007 go down after completing a result, and the client will be notified
997 go down after completing a result, and the client will be notified
1008 that the result failed and later receive the actual result.
998 that the result failed and later receive the actual result.
1009 """
999 """
1010
1000
1011 outstanding = self.queues[eid]
1001 outstanding = self.queues[eid]
1012
1002
1013 for msg_id in outstanding:
1003 for msg_id in outstanding:
1014 self.pending.remove(msg_id)
1004 self.pending.remove(msg_id)
1015 self.all_completed.add(msg_id)
1005 self.all_completed.add(msg_id)
1016 try:
1006 try:
1017 raise error.EngineError("Engine %r died while running task %r" % (eid, msg_id))
1007 raise error.EngineError("Engine %r died while running task %r" % (eid, msg_id))
1018 except:
1008 except:
1019 content = error.wrap_exception()
1009 content = error.wrap_exception()
1020 # build a fake header:
1010 # build a fake header:
1021 header = {}
1011 header = {}
1022 header['engine'] = uuid
1012 header['engine'] = uuid
1023 header['date'] = datetime.now()
1013 header['date'] = datetime.now()
1024 rec = dict(result_content=content, result_header=header, result_buffers=[])
1014 rec = dict(result_content=content, result_header=header, result_buffers=[])
1025 rec['completed'] = header['date']
1015 rec['completed'] = header['date']
1026 rec['engine_uuid'] = uuid
1016 rec['engine_uuid'] = uuid
1027 try:
1017 try:
1028 self.db.update_record(msg_id, rec)
1018 self.db.update_record(msg_id, rec)
1029 except Exception:
1019 except Exception:
1030 self.log.error("DB Error handling stranded msg %r", msg_id, exc_info=True)
1020 self.log.error("DB Error handling stranded msg %r", msg_id, exc_info=True)
1031
1021
1032
1022
1033 def finish_registration(self, heart):
1023 def finish_registration(self, heart):
1034 """Second half of engine registration, called after our HeartMonitor
1024 """Second half of engine registration, called after our HeartMonitor
1035 has received a beat from the Engine's Heart."""
1025 has received a beat from the Engine's Heart."""
1036 try:
1026 try:
1037 ec = self.incoming_registrations.pop(heart)
1027 ec = self.incoming_registrations.pop(heart)
1038 except KeyError:
1028 except KeyError:
1039 self.log.error("registration::tried to finish nonexistent registration", exc_info=True)
1029 self.log.error("registration::tried to finish nonexistent registration", exc_info=True)
1040 return
1030 return
1041 self.log.info("registration::finished registering engine %i:%s", ec.id, ec.uuid)
1031 self.log.info("registration::finished registering engine %i:%s", ec.id, ec.uuid)
1042 if ec.stallback is not None:
1032 if ec.stallback is not None:
1043 ec.stallback.stop()
1033 self.loop.remove_timeout(ec.stallback)
1044 eid = ec.id
1034 eid = ec.id
1045 self.ids.add(eid)
1035 self.ids.add(eid)
1046 self.keytable[eid] = ec.uuid
1036 self.keytable[eid] = ec.uuid
1047 self.engines[eid] = ec
1037 self.engines[eid] = ec
1048 self.by_ident[cast_bytes(ec.uuid)] = ec.id
1038 self.by_ident[cast_bytes(ec.uuid)] = ec.id
1049 self.queues[eid] = list()
1039 self.queues[eid] = list()
1050 self.tasks[eid] = list()
1040 self.tasks[eid] = list()
1051 self.completed[eid] = list()
1041 self.completed[eid] = list()
1052 self.hearts[heart] = eid
1042 self.hearts[heart] = eid
1053 content = dict(id=eid, uuid=self.engines[eid].uuid)
1043 content = dict(id=eid, uuid=self.engines[eid].uuid)
1054 if self.notifier:
1044 if self.notifier:
1055 self.session.send(self.notifier, "registration_notification", content=content)
1045 self.session.send(self.notifier, "registration_notification", content=content)
1056 self.log.info("engine::Engine Connected: %i", eid)
1046 self.log.info("engine::Engine Connected: %i", eid)
1057
1047
1058 self._save_engine_state()
1048 self._save_engine_state()
1059
1049
1060 def _purge_stalled_registration(self, heart):
1050 def _purge_stalled_registration(self, heart):
1061 if heart in self.incoming_registrations:
1051 if heart in self.incoming_registrations:
1062 ec = self.incoming_registrations.pop(heart)
1052 ec = self.incoming_registrations.pop(heart)
1063 self.log.info("registration::purging stalled registration: %i", ec.id)
1053 self.log.info("registration::purging stalled registration: %i", ec.id)
1064 else:
1054 else:
1065 pass
1055 pass
1066
1056
1067 #-------------------------------------------------------------------------
1057 #-------------------------------------------------------------------------
1068 # Engine State
1058 # Engine State
1069 #-------------------------------------------------------------------------
1059 #-------------------------------------------------------------------------
1070
1060
1071
1061
1072 def _cleanup_engine_state_file(self):
1062 def _cleanup_engine_state_file(self):
1073 """cleanup engine state mapping"""
1063 """cleanup engine state mapping"""
1074
1064
1075 if os.path.exists(self.engine_state_file):
1065 if os.path.exists(self.engine_state_file):
1076 self.log.debug("cleaning up engine state: %s", self.engine_state_file)
1066 self.log.debug("cleaning up engine state: %s", self.engine_state_file)
1077 try:
1067 try:
1078 os.remove(self.engine_state_file)
1068 os.remove(self.engine_state_file)
1079 except IOError:
1069 except IOError:
1080 self.log.error("Couldn't cleanup file: %s", self.engine_state_file, exc_info=True)
1070 self.log.error("Couldn't cleanup file: %s", self.engine_state_file, exc_info=True)
1081
1071
1082
1072
1083 def _save_engine_state(self):
1073 def _save_engine_state(self):
1084 """save engine mapping to JSON file"""
1074 """save engine mapping to JSON file"""
1085 if not self.engine_state_file:
1075 if not self.engine_state_file:
1086 return
1076 return
1087 self.log.debug("save engine state to %s" % self.engine_state_file)
1077 self.log.debug("save engine state to %s" % self.engine_state_file)
1088 state = {}
1078 state = {}
1089 engines = {}
1079 engines = {}
1090 for eid, ec in iteritems(self.engines):
1080 for eid, ec in iteritems(self.engines):
1091 if ec.uuid not in self.dead_engines:
1081 if ec.uuid not in self.dead_engines:
1092 engines[eid] = ec.uuid
1082 engines[eid] = ec.uuid
1093
1083
1094 state['engines'] = engines
1084 state['engines'] = engines
1095
1085
1096 state['next_id'] = self._idcounter
1086 state['next_id'] = self._idcounter
1097
1087
1098 with open(self.engine_state_file, 'w') as f:
1088 with open(self.engine_state_file, 'w') as f:
1099 json.dump(state, f)
1089 json.dump(state, f)
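The saved file is small, human-readable JSON; for a two-engine cluster it would look roughly like this (UUID values illustrative):

    {"engines": {"0": "<uuid-of-engine-0>", "1": "<uuid-of-engine-1>"}, "next_id": 2}

_load_engine_state below reads the same structure back, re-registering each (eid, uuid) pair as an already-beating heart.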
1100
1090
1101
1091
1102 def _load_engine_state(self):
1092 def _load_engine_state(self):
1103 """load engine mapping from JSON file"""
1093 """load engine mapping from JSON file"""
1104 if not os.path.exists(self.engine_state_file):
1094 if not os.path.exists(self.engine_state_file):
1105 return
1095 return
1106
1096
1107 self.log.info("loading engine state from %s" % self.engine_state_file)
1097 self.log.info("loading engine state from %s" % self.engine_state_file)
1108
1098
1109 with open(self.engine_state_file) as f:
1099 with open(self.engine_state_file) as f:
1110 state = json.load(f)
1100 state = json.load(f)
1111
1101
1112 save_notifier = self.notifier
1102 save_notifier = self.notifier
1113 self.notifier = None
1103 self.notifier = None
1114 for eid, uuid in iteritems(state['engines']):
1104 for eid, uuid in iteritems(state['engines']):
1115 heart = uuid.encode('ascii')
1105 heart = uuid.encode('ascii')
1116 # start with this heart as current and beating:
1106 # start with this heart as current and beating:
1117 self.heartmonitor.responses.add(heart)
1107 self.heartmonitor.responses.add(heart)
1118 self.heartmonitor.hearts.add(heart)
1108 self.heartmonitor.hearts.add(heart)
1119
1109
1120 self.incoming_registrations[heart] = EngineConnector(id=int(eid), uuid=uuid)
1110 self.incoming_registrations[heart] = EngineConnector(id=int(eid), uuid=uuid)
1121 self.finish_registration(heart)
1111 self.finish_registration(heart)
1122
1112
1123 self.notifier = save_notifier
1113 self.notifier = save_notifier
1124
1114
1125 self._idcounter = state['next_id']
1115 self._idcounter = state['next_id']
1126
1116
1127 #-------------------------------------------------------------------------
1117 #-------------------------------------------------------------------------
1128 # Client Requests
1118 # Client Requests
1129 #-------------------------------------------------------------------------
1119 #-------------------------------------------------------------------------
1130
1120
1131 def shutdown_request(self, client_id, msg):
1121 def shutdown_request(self, client_id, msg):
1132 """handle shutdown request."""
1122 """handle shutdown request."""
1133 self.session.send(self.query, 'shutdown_reply', content={'status': 'ok'}, ident=client_id)
1123 self.session.send(self.query, 'shutdown_reply', content={'status': 'ok'}, ident=client_id)
1134 # also notify other clients of shutdown
1124 # also notify other clients of shutdown
1135 self.session.send(self.notifier, 'shutdown_notice', content={'status': 'ok'})
1125 self.session.send(self.notifier, 'shutdown_notice', content={'status': 'ok'})
1136 dc = ioloop.DelayedCallback(lambda : self._shutdown(), 1000, self.loop)
1137 dc.start()
1126 self.loop.add_timeout(self.loop.time() + 1, self._shutdown)
1138
1127
1139 def _shutdown(self):
1128 def _shutdown(self):
1140 self.log.info("hub::hub shutting down.")
1129 self.log.info("hub::hub shutting down.")
1141 time.sleep(0.1)
1130 time.sleep(0.1)
1142 sys.exit(0)
1131 sys.exit(0)
1143
1132
1144
1133
1145 def check_load(self, client_id, msg):
1134 def check_load(self, client_id, msg):
1146 content = msg['content']
1135 content = msg['content']
1147 try:
1136 try:
1148 targets = content['targets']
1137 targets = content['targets']
1149 targets = self._validate_targets(targets)
1138 targets = self._validate_targets(targets)
1150 except:
1139 except:
1151 content = error.wrap_exception()
1140 content = error.wrap_exception()
1152 self.session.send(self.query, "hub_error",
1141 self.session.send(self.query, "hub_error",
1153 content=content, ident=client_id)
1142 content=content, ident=client_id)
1154 return
1143 return
1155
1144
1156 content = dict(status='ok')
1145 content = dict(status='ok')
1157 # loads = {}
1146 # loads = {}
1158 for t in targets:
1147 for t in targets:
1159 content[bytes(t)] = len(self.queues[t])+len(self.tasks[t])
1148 content[bytes(t)] = len(self.queues[t])+len(self.tasks[t])
1160 self.session.send(self.query, "load_reply", content=content, ident=client_id)
1149 self.session.send(self.query, "load_reply", content=content, ident=client_id)
1161
1150
1162
1151
1163 def queue_status(self, client_id, msg):
1152 def queue_status(self, client_id, msg):
1164 """Return the Queue status of one or more targets.
1153 """Return the Queue status of one or more targets.
1165
1154
1166 If verbose, return the msg_ids, else return len of each type.
1155 If verbose, return the msg_ids, else return len of each type.
1167
1156
1168 Keys:
1157 Keys:
1169
1158
1170 * queue (pending MUX jobs)
1159 * queue (pending MUX jobs)
1171 * tasks (pending Task jobs)
1160 * tasks (pending Task jobs)
1172 * completed (finished jobs from both queues)
1161 * completed (finished jobs from both queues)
1173 """
1162 """
1174 content = msg['content']
1163 content = msg['content']
1175 targets = content['targets']
1164 targets = content['targets']
1176 try:
1165 try:
1177 targets = self._validate_targets(targets)
1166 targets = self._validate_targets(targets)
1178 except:
1167 except:
1179 content = error.wrap_exception()
1168 content = error.wrap_exception()
1180 self.session.send(self.query, "hub_error",
1169 self.session.send(self.query, "hub_error",
1181 content=content, ident=client_id)
1170 content=content, ident=client_id)
1182 return
1171 return
1183 verbose = content.get('verbose', False)
1172 verbose = content.get('verbose', False)
1184 content = dict(status='ok')
1173 content = dict(status='ok')
1185 for t in targets:
1174 for t in targets:
1186 queue = self.queues[t]
1175 queue = self.queues[t]
1187 completed = self.completed[t]
1176 completed = self.completed[t]
1188 tasks = self.tasks[t]
1177 tasks = self.tasks[t]
1189 if not verbose:
1178 if not verbose:
1190 queue = len(queue)
1179 queue = len(queue)
1191 completed = len(completed)
1180 completed = len(completed)
1192 tasks = len(tasks)
1181 tasks = len(tasks)
1193 content[str(t)] = {'queue': queue, 'completed': completed , 'tasks': tasks}
1182 content[str(t)] = {'queue': queue, 'completed': completed , 'tasks': tasks}
1194 content['unassigned'] = list(self.unassigned) if verbose else len(self.unassigned)
1183 content['unassigned'] = list(self.unassigned) if verbose else len(self.unassigned)
1195 # print (content)
1184 # print (content)
1196 self.session.send(self.query, "queue_reply", content=content, ident=client_id)
1185 self.session.send(self.query, "queue_reply", content=content, ident=client_id)
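Concretely, a non-verbose queue_reply for two engines carries per-engine counts plus the unassigned total, along these lines (numbers illustrative):

    {'status': 'ok',
     '0': {'queue': 2, 'completed': 10, 'tasks': 1},
     '1': {'queue': 0, 'completed': 12, 'tasks': 3},
     'unassigned': 1}

With verbose=True the same keys map to lists of msg_ids instead of counts.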
1197
1186
1198 def purge_results(self, client_id, msg):
1187 def purge_results(self, client_id, msg):
1199 """Purge results from memory. This method is more valuable before we move
1188 """Purge results from memory. This method is more valuable before we move
1200 to a DB based message storage mechanism."""
1189 to a DB based message storage mechanism."""
1201 content = msg['content']
1190 content = msg['content']
1202 self.log.info("Dropping records with %s", content)
1191 self.log.info("Dropping records with %s", content)
1203 msg_ids = content.get('msg_ids', [])
1192 msg_ids = content.get('msg_ids', [])
1204 reply = dict(status='ok')
1193 reply = dict(status='ok')
1205 if msg_ids == 'all':
1194 if msg_ids == 'all':
1206 try:
1195 try:
1207 self.db.drop_matching_records(dict(completed={'$ne':None}))
1196 self.db.drop_matching_records(dict(completed={'$ne':None}))
1208 except Exception:
1197 except Exception:
1209 reply = error.wrap_exception()
1198 reply = error.wrap_exception()
1210 self.log.exception("Error dropping records")
1199 self.log.exception("Error dropping records")
1211 else:
1200 else:
1212 pending = [m for m in msg_ids if (m in self.pending)]
1201 pending = [m for m in msg_ids if (m in self.pending)]
1213 if pending:
1202 if pending:
1214 try:
1203 try:
1215 raise IndexError("msg pending: %r" % pending[0])
1204 raise IndexError("msg pending: %r" % pending[0])
1216 except:
1205 except:
1217 reply = error.wrap_exception()
1206 reply = error.wrap_exception()
1218 self.log.exception("Error dropping records")
1207 self.log.exception("Error dropping records")
1219 else:
1208 else:
1220 try:
1209 try:
1221 self.db.drop_matching_records(dict(msg_id={'$in':msg_ids}))
1210 self.db.drop_matching_records(dict(msg_id={'$in':msg_ids}))
1222 except Exception:
1211 except Exception:
1223 reply = error.wrap_exception()
1212 reply = error.wrap_exception()
1224 self.log.exception("Error dropping records")
1213 self.log.exception("Error dropping records")
1225
1214
1226 if reply['status'] == 'ok':
1215 if reply['status'] == 'ok':
1227 eids = content.get('engine_ids', [])
1216 eids = content.get('engine_ids', [])
1228 for eid in eids:
1217 for eid in eids:
1229 if eid not in self.engines:
1218 if eid not in self.engines:
1230 try:
1219 try:
1231 raise IndexError("No such engine: %i" % eid)
1220 raise IndexError("No such engine: %i" % eid)
1232 except:
1221 except:
1233 reply = error.wrap_exception()
1222 reply = error.wrap_exception()
1234 self.log.exception("Error dropping records")
1223 self.log.exception("Error dropping records")
1235 break
1224 break
1236 uid = self.engines[eid].uuid
1225 uid = self.engines[eid].uuid
1237 try:
1226 try:
1238 self.db.drop_matching_records(dict(engine_uuid=uid, completed={'$ne':None}))
1227 self.db.drop_matching_records(dict(engine_uuid=uid, completed={'$ne':None}))
1239 except Exception:
1228 except Exception:
1240 reply = error.wrap_exception()
1229 reply = error.wrap_exception()
1241 self.log.exception("Error dropping records")
1230 self.log.exception("Error dropping records")
1242 break
1231 break
1243
1232
1244 self.session.send(self.query, 'purge_reply', content=reply, ident=client_id)
1233 self.session.send(self.query, 'purge_reply', content=reply, ident=client_id)
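The request content this handler expects mirrors its branches; a client could send either of these illustrative forms:

    {'msg_ids': 'all'}                             # drop every completed record
    {'msg_ids': ['<msg-id>'], 'engine_ids': [0]}   # drop named records, plus engine 0's completed records

Pending msg_ids are refused, and engine_ids must name registered engines, as the error paths above enforce.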
1245
1234
1246 def resubmit_task(self, client_id, msg):
1235 def resubmit_task(self, client_id, msg):
1247 """Resubmit one or more tasks."""
1236 """Resubmit one or more tasks."""
1248 def finish(reply):
1237 def finish(reply):
1249 self.session.send(self.query, 'resubmit_reply', content=reply, ident=client_id)
1238 self.session.send(self.query, 'resubmit_reply', content=reply, ident=client_id)
1250
1239
1251 content = msg['content']
1240 content = msg['content']
1252 msg_ids = content['msg_ids']
1241 msg_ids = content['msg_ids']
1253 reply = dict(status='ok')
1242 reply = dict(status='ok')
1254 try:
1243 try:
1255 records = self.db.find_records({'msg_id' : {'$in' : msg_ids}}, keys=[
1244 records = self.db.find_records({'msg_id' : {'$in' : msg_ids}}, keys=[
1256 'header', 'content', 'buffers'])
1245 'header', 'content', 'buffers'])
1257 except Exception:
1246 except Exception:
1258 self.log.error('db::db error finding tasks to resubmit', exc_info=True)
1247 self.log.error('db::db error finding tasks to resubmit', exc_info=True)
1259 return finish(error.wrap_exception())
1248 return finish(error.wrap_exception())
1260
1249
1261 # validate msg_ids
1250 # validate msg_ids
1262 found_ids = [ rec['msg_id'] for rec in records ]
1251 found_ids = [ rec['msg_id'] for rec in records ]
1263 pending_ids = [ msg_id for msg_id in found_ids if msg_id in self.pending ]
1252 pending_ids = [ msg_id for msg_id in found_ids if msg_id in self.pending ]
1264 if len(records) > len(msg_ids):
1253 if len(records) > len(msg_ids):
1265 try:
1254 try:
1266 raise RuntimeError("DB appears to be in an inconsistent state."
1255 raise RuntimeError("DB appears to be in an inconsistent state."
1267 "More matching records were found than should exist")
1256 "More matching records were found than should exist")
1268 except Exception:
1257 except Exception:
1269 self.log.exception("Failed to resubmit task")
1258 self.log.exception("Failed to resubmit task")
1270 return finish(error.wrap_exception())
1259 return finish(error.wrap_exception())
1271 elif len(records) < len(msg_ids):
1260 elif len(records) < len(msg_ids):
1272 missing = [ m for m in msg_ids if m not in found_ids ]
1261 missing = [ m for m in msg_ids if m not in found_ids ]
1273 try:
1262 try:
1274 raise KeyError("No such msg(s): %r" % missing)
1263 raise KeyError("No such msg(s): %r" % missing)
1275 except KeyError:
1264 except KeyError:
1276 self.log.exception("Failed to resubmit task")
1265 self.log.exception("Failed to resubmit task")
1277 return finish(error.wrap_exception())
1266 return finish(error.wrap_exception())
1278 elif pending_ids:
1267 elif pending_ids:
1279 pass
1268 pass
1280 # no need to raise on resubmit of pending task, now that we
1269 # no need to raise on resubmit of pending task, now that we
1281 # resubmit under new ID, but do we want to raise anyway?
1270 # resubmit under new ID, but do we want to raise anyway?
1282 # msg_id = invalid_ids[0]
1271 # msg_id = invalid_ids[0]
1283 # try:
1272 # try:
1284 # raise ValueError("Task(s) %r appears to be inflight" % )
1273 # raise ValueError("Task(s) %r appears to be inflight" % )
1285 # except Exception:
1274 # except Exception:
1286 # return finish(error.wrap_exception())
1275 # return finish(error.wrap_exception())
1287
1276
1288 # mapping of original IDs to resubmitted IDs
1277 # mapping of original IDs to resubmitted IDs
1289 resubmitted = {}
1278 resubmitted = {}
1290
1279
1291 # send the messages
1280 # send the messages
1292 for rec in records:
1281 for rec in records:
1293 header = rec['header']
1282 header = rec['header']
1294 msg = self.session.msg(header['msg_type'], parent=header)
1283 msg = self.session.msg(header['msg_type'], parent=header)
1295 msg_id = msg['msg_id']
1284 msg_id = msg['msg_id']
1296 msg['content'] = rec['content']
1285 msg['content'] = rec['content']
1297
1286
1298 # use the old header, but update msg_id and timestamp
1287 # use the old header, but update msg_id and timestamp
1299 fresh = msg['header']
1288 fresh = msg['header']
1300 header['msg_id'] = fresh['msg_id']
1289 header['msg_id'] = fresh['msg_id']
1301 header['date'] = fresh['date']
1290 header['date'] = fresh['date']
1302 msg['header'] = header
1291 msg['header'] = header
1303
1292
1304 self.session.send(self.resubmit, msg, buffers=rec['buffers'])
1293 self.session.send(self.resubmit, msg, buffers=rec['buffers'])
1305
1294
1306 resubmitted[rec['msg_id']] = msg_id
1295 resubmitted[rec['msg_id']] = msg_id
1307 self.pending.add(msg_id)
1296 self.pending.add(msg_id)
1308 msg['buffers'] = rec['buffers']
1297 msg['buffers'] = rec['buffers']
1309 try:
1298 try:
1310 self.db.add_record(msg_id, init_record(msg))
1299 self.db.add_record(msg_id, init_record(msg))
1311 except Exception:
1300 except Exception:
1312 self.log.error("db::DB Error updating record: %s", msg_id, exc_info=True)
1301 self.log.error("db::DB Error updating record: %s", msg_id, exc_info=True)
1313 return finish(error.wrap_exception())
1302 return finish(error.wrap_exception())
1314
1303
1315 finish(dict(status='ok', resubmitted=resubmitted))
1304 finish(dict(status='ok', resubmitted=resubmitted))
1316
1305
1317 # store the new IDs in the Task DB
1306 # store the new IDs in the Task DB
1318 for msg_id, resubmit_id in iteritems(resubmitted):
1307 for msg_id, resubmit_id in iteritems(resubmitted):
1319 try:
1308 try:
1320 self.db.update_record(msg_id, {'resubmitted' : resubmit_id})
1309 self.db.update_record(msg_id, {'resubmitted' : resubmit_id})
1321 except Exception:
1310 except Exception:
1322 self.log.error("db::DB Error updating record: %s", msg_id, exc_info=True)
1311 self.log.error("db::DB Error updating record: %s", msg_id, exc_info=True)
1323
1312
1324
1313
1325 def _extract_record(self, rec):
1314 def _extract_record(self, rec):
1326 """decompose a TaskRecord dict into subsection of reply for get_result"""
1315 """decompose a TaskRecord dict into subsection of reply for get_result"""
1327 io_dict = {}
1316 io_dict = {}
1328 for key in ('execute_input', 'execute_result', 'error', 'stdout', 'stderr'):
1317 for key in ('execute_input', 'execute_result', 'error', 'stdout', 'stderr'):
1329 io_dict[key] = rec[key]
1318 io_dict[key] = rec[key]
1330 content = {
1319 content = {
1331 'header': rec['header'],
1320 'header': rec['header'],
1332 'metadata': rec['metadata'],
1321 'metadata': rec['metadata'],
1333 'result_metadata': rec['result_metadata'],
1322 'result_metadata': rec['result_metadata'],
1334 'result_header' : rec['result_header'],
1323 'result_header' : rec['result_header'],
1335 'result_content': rec['result_content'],
1324 'result_content': rec['result_content'],
1336 'received' : rec['received'],
1325 'received' : rec['received'],
1337 'io' : io_dict,
1326 'io' : io_dict,
1338 }
1327 }
1339 if rec['result_buffers']:
1328 if rec['result_buffers']:
1340 buffers = list(map(bytes, rec['result_buffers']))
1329 buffers = list(map(bytes, rec['result_buffers']))
1341 else:
1330 else:
1342 buffers = []
1331 buffers = []
1343
1332
1344 return content, buffers
1333 return content, buffers
1345
1334
1346 def get_results(self, client_id, msg):
1335 def get_results(self, client_id, msg):
1347 """Get the result of 1 or more messages."""
1336 """Get the result of 1 or more messages."""
1348 content = msg['content']
1337 content = msg['content']
1349 msg_ids = sorted(set(content['msg_ids']))
1338 msg_ids = sorted(set(content['msg_ids']))
1350 statusonly = content.get('status_only', False)
1339 statusonly = content.get('status_only', False)
1351 pending = []
1340 pending = []
1352 completed = []
1341 completed = []
1353 content = dict(status='ok')
1342 content = dict(status='ok')
1354 content['pending'] = pending
1343 content['pending'] = pending
1355 content['completed'] = completed
1344 content['completed'] = completed
1356 buffers = []
1345 buffers = []
1357 if not statusonly:
1346 if not statusonly:
1358 try:
1347 try:
1359 matches = self.db.find_records(dict(msg_id={'$in':msg_ids}))
1348 matches = self.db.find_records(dict(msg_id={'$in':msg_ids}))
1360 # turn match list into dict, for faster lookup
1349 # turn match list into dict, for faster lookup
1361 records = {}
1350 records = {}
1362 for rec in matches:
1351 for rec in matches:
1363 records[rec['msg_id']] = rec
1352 records[rec['msg_id']] = rec
1364 except Exception:
1353 except Exception:
1365 content = error.wrap_exception()
1354 content = error.wrap_exception()
1366 self.log.exception("Failed to get results")
1355 self.log.exception("Failed to get results")
1367 self.session.send(self.query, "result_reply", content=content,
1356 self.session.send(self.query, "result_reply", content=content,
1368 parent=msg, ident=client_id)
1357 parent=msg, ident=client_id)
1369 return
1358 return
1370 else:
1359 else:
1371 records = {}
1360 records = {}
1372 for msg_id in msg_ids:
1361 for msg_id in msg_ids:
1373 if msg_id in self.pending:
1362 if msg_id in self.pending:
1374 pending.append(msg_id)
1363 pending.append(msg_id)
1375 elif msg_id in self.all_completed:
1364 elif msg_id in self.all_completed:
1376 completed.append(msg_id)
1365 completed.append(msg_id)
1377 if not statusonly:
1366 if not statusonly:
1378 c,bufs = self._extract_record(records[msg_id])
1367 c,bufs = self._extract_record(records[msg_id])
1379 content[msg_id] = c
1368 content[msg_id] = c
1380 buffers.extend(bufs)
1369 buffers.extend(bufs)
1381 elif msg_id in records:
1370 elif msg_id in records:
1382 if records[msg_id]['completed']:
1371 if records[msg_id]['completed']:
1383 completed.append(msg_id)
1372 completed.append(msg_id)
1384 c,bufs = self._extract_record(records[msg_id])
1373 c,bufs = self._extract_record(records[msg_id])
1385 content[msg_id] = c
1374 content[msg_id] = c
1386 buffers.extend(bufs)
1375 buffers.extend(bufs)
1387 else:
1376 else:
1388 pending.append(msg_id)
1377 pending.append(msg_id)
1389 else:
1378 else:
1390 try:
1379 try:
1391 raise KeyError('No such message: '+msg_id)
1380 raise KeyError('No such message: '+msg_id)
1392 except:
1381 except:
1393 content = error.wrap_exception()
1382 content = error.wrap_exception()
1394 break
1383 break
1395 self.session.send(self.query, "result_reply", content=content,
1384 self.session.send(self.query, "result_reply", content=content,
1396 parent=msg, ident=client_id,
1385 parent=msg, ident=client_id,
1397 buffers=buffers)
1386 buffers=buffers)
1398
1387
1399 def get_history(self, client_id, msg):
1388 def get_history(self, client_id, msg):
1400 """Get a list of all msg_ids in our DB records"""
1389 """Get a list of all msg_ids in our DB records"""
1401 try:
1390 try:
1402 msg_ids = self.db.get_history()
1391 msg_ids = self.db.get_history()
1403 except Exception as e:
1392 except Exception as e:
1404 content = error.wrap_exception()
1393 content = error.wrap_exception()
1405 self.log.exception("Failed to get history")
1394 self.log.exception("Failed to get history")
1406 else:
1395 else:
1407 content = dict(status='ok', history=msg_ids)
1396 content = dict(status='ok', history=msg_ids)
1408
1397
1409 self.session.send(self.query, "history_reply", content=content,
1398 self.session.send(self.query, "history_reply", content=content,
1410 parent=msg, ident=client_id)
1399 parent=msg, ident=client_id)
1411
1400
1412 def db_query(self, client_id, msg):
1401 def db_query(self, client_id, msg):
1413 """Perform a raw query on the task record database."""
1402 """Perform a raw query on the task record database."""
1414 content = msg['content']
1403 content = msg['content']
1415 query = extract_dates(content.get('query', {}))
1404 query = extract_dates(content.get('query', {}))
1416 keys = content.get('keys', None)
1405 keys = content.get('keys', None)
1417 buffers = []
1406 buffers = []
1418 empty = list()
1407 empty = list()
1419 try:
1408 try:
1420 records = self.db.find_records(query, keys)
1409 records = self.db.find_records(query, keys)
1421 except Exception as e:
1410 except Exception as e:
1422 content = error.wrap_exception()
1411 content = error.wrap_exception()
1423 self.log.exception("DB query failed")
1412 self.log.exception("DB query failed")
1424 else:
1413 else:
1425 # extract buffers from reply content:
1414 # extract buffers from reply content:
1426 if keys is not None:
1415 if keys is not None:
1427 buffer_lens = [] if 'buffers' in keys else None
1416 buffer_lens = [] if 'buffers' in keys else None
1428 result_buffer_lens = [] if 'result_buffers' in keys else None
1417 result_buffer_lens = [] if 'result_buffers' in keys else None
1429 else:
1418 else:
1430 buffer_lens = None
1419 buffer_lens = None
1431 result_buffer_lens = None
1420 result_buffer_lens = None
1432
1421
1433 for rec in records:
1422 for rec in records:
1434 # buffers may be None, so double check
1423 # buffers may be None, so double check
1435 b = rec.pop('buffers', empty) or empty
1424 b = rec.pop('buffers', empty) or empty
1436 if buffer_lens is not None:
1425 if buffer_lens is not None:
1437 buffer_lens.append(len(b))
1426 buffer_lens.append(len(b))
1438 buffers.extend(b)
1427 buffers.extend(b)
1439 rb = rec.pop('result_buffers', empty) or empty
1428 rb = rec.pop('result_buffers', empty) or empty
1440 if result_buffer_lens is not None:
1429 if result_buffer_lens is not None:
1441 result_buffer_lens.append(len(rb))
1430 result_buffer_lens.append(len(rb))
1442 buffers.extend(rb)
1431 buffers.extend(rb)
1443 content = dict(status='ok', records=records, buffer_lens=buffer_lens,
1432 content = dict(status='ok', records=records, buffer_lens=buffer_lens,
1444 result_buffer_lens=result_buffer_lens)
1433 result_buffer_lens=result_buffer_lens)
1445 # self.log.debug (content)
1434 # self.log.debug (content)
1446 self.session.send(self.query, "db_reply", content=content,
1435 self.session.send(self.query, "db_reply", content=content,
1447 parent=msg, ident=client_id,
1436 parent=msg, ident=client_id,
1448 buffers=buffers)
1437 buffers=buffers)
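The query dict accepts the same Mongo-style operators used elsewhere in this file ($in, $ne), and dates are revived via extract_dates before the lookup. An illustrative request content, assuming a client wants only finished tasks:

    {'query': {'completed': {'$ne': None}},    # match records with a completion time
     'keys': ['msg_id', 'completed']}          # restrict which columns come back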
1449
1438
@@ -1,848 +1,849 b''
1 """The Python scheduler for rich scheduling.
1 """The Python scheduler for rich scheduling.
2
2
3 The Pure ZMQ scheduler does not allow routing schemes other than LRU,
3 The Pure ZMQ scheduler does not allow routing schemes other than LRU,
4 nor does it check msg_id DAG dependencies. For those, a slightly slower
4 nor does it check msg_id DAG dependencies. For those, a slightly slower
5 Python Scheduler exists.
5 Python Scheduler exists.
6 """
6 """
7
7
8 # Copyright (c) IPython Development Team.
8 # Copyright (c) IPython Development Team.
9 # Distributed under the terms of the Modified BSD License.
9 # Distributed under the terms of the Modified BSD License.
10
10
11 import logging
11 import logging
12 import sys
12 import sys
13 import time
13 import time
14
14
15 from collections import deque
15 from collections import deque
16 from datetime import datetime
16 from datetime import datetime
17 from random import randint, random
17 from random import randint, random
18 from types import FunctionType
18 from types import FunctionType
19
19
20 try:
20 try:
21 import numpy
21 import numpy
22 except ImportError:
22 except ImportError:
23 numpy = None
23 numpy = None
24
24
25 import zmq
25 import zmq
26 from zmq.eventloop import ioloop, zmqstream
26 from zmq.eventloop import ioloop, zmqstream
27
27
28 # local imports
28 # local imports
29 from IPython.external.decorator import decorator
29 from IPython.external.decorator import decorator
30 from IPython.config.application import Application
30 from IPython.config.application import Application
31 from IPython.config.loader import Config
31 from IPython.config.loader import Config
32 from IPython.utils.traitlets import Instance, Dict, List, Set, Integer, Enum, CBytes
32 from IPython.utils.traitlets import Instance, Dict, List, Set, Integer, Enum, CBytes
33 from IPython.utils.py3compat import cast_bytes
33 from IPython.utils.py3compat import cast_bytes
34
34
35 from IPython.parallel import error, util
35 from IPython.parallel import error, util
36 from IPython.parallel.factory import SessionFactory
36 from IPython.parallel.factory import SessionFactory
37 from IPython.parallel.util import connect_logger, local_logger
37 from IPython.parallel.util import connect_logger, local_logger
38
38
39 from .dependency import Dependency
39 from .dependency import Dependency
40
40
41 @decorator
41 @decorator
42 def logged(f,self,*args,**kwargs):
42 def logged(f,self,*args,**kwargs):
43 # print ("#--------------------")
43 # print ("#--------------------")
44 self.log.debug("scheduler::%s(*%s,**%s)", f.__name__, args, kwargs)
44 self.log.debug("scheduler::%s(*%s,**%s)", f.__name__, args, kwargs)
45 # print ("#--")
45 # print ("#--")
46 return f(self,*args, **kwargs)
46 return f(self,*args, **kwargs)
47
47
48 #----------------------------------------------------------------------
48 #----------------------------------------------------------------------
49 # Chooser functions
49 # Chooser functions
50 #----------------------------------------------------------------------
50 #----------------------------------------------------------------------
51
51
52 def plainrandom(loads):
52 def plainrandom(loads):
53 """Plain random pick."""
53 """Plain random pick."""
54 n = len(loads)
54 n = len(loads)
55 return randint(0,n-1)
55 return randint(0,n-1)
56
56
57 def lru(loads):
57 def lru(loads):
58 """Always pick the front of the line.
58 """Always pick the front of the line.
59
59
60 The content of `loads` is ignored.
60 The content of `loads` is ignored.
61
61
62 Assumes LRU ordering of loads, with oldest first.
62 Assumes LRU ordering of loads, with oldest first.
63 """
63 """
64 return 0
64 return 0
65
65
66 def twobin(loads):
66 def twobin(loads):
67 """Pick two at random, use the LRU of the two.
67 """Pick two at random, use the LRU of the two.
68
68
69 The content of loads is ignored.
69 The content of loads is ignored.
70
70
71 Assumes LRU ordering of loads, with oldest first.
71 Assumes LRU ordering of loads, with oldest first.
72 """
72 """
73 n = len(loads)
73 n = len(loads)
74 a = randint(0,n-1)
74 a = randint(0,n-1)
75 b = randint(0,n-1)
75 b = randint(0,n-1)
76 return min(a,b)
76 return min(a,b)
77
77
78 def weighted(loads):
78 def weighted(loads):
79 """Pick two at random using inverse load as weight.
79 """Pick two at random using inverse load as weight.
80
80
81 Return the less loaded of the two.
81 Return the less loaded of the two.
82 """
82 """
83 # weight 0 a million times more than 1:
83 # weight 0 a million times more than 1:
84 weights = 1./(1e-6+numpy.array(loads))
84 weights = 1./(1e-6+numpy.array(loads))
85 sums = weights.cumsum()
85 sums = weights.cumsum()
86 t = sums[-1]
86 t = sums[-1]
87 x = random()*t
87 x = random()*t
88 y = random()*t
88 y = random()*t
89 idx = 0
89 idx = 0
90 idy = 0
90 idy = 0
91 while sums[idx] < x:
91 while sums[idx] < x:
92 idx += 1
92 idx += 1
93 while sums[idy] < y:
93 while sums[idy] < y:
94 idy += 1
94 idy += 1
95 if weights[idy] > weights[idx]:
95 if weights[idy] > weights[idx]:
96 return idy
96 return idy
97 else:
97 else:
98 return idx
98 return idx
99
99
100 def leastload(loads):
100 def leastload(loads):
101 """Always choose the lowest load.
101 """Always choose the lowest load.
102
102
103 If the lowest load occurs more than once, the first
103 If the lowest load occurs more than once, the first
104 occurrence will be used. If loads has LRU ordering, this means
104 occurrence will be used. If loads has LRU ordering, this means
105 the LRU of those with the lowest load is chosen.
105 the LRU of those with the lowest load is chosen.
106 """
106 """
107 return loads.index(min(loads))
107 return loads.index(min(loads))
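Every chooser shares one contract: it takes a list of engine loads, assumed LRU-ordered with oldest first, and returns the index of the chosen engine. A quick illustrative check against hypothetical loads:

    loads = [2, 0, 1, 0]
    assert lru(loads) == 0                  # always the head of the line
    assert leastload(loads) == 1            # first occurrence of the minimum
    assert plainrandom(loads) in range(4)   # any index
    assert twobin(loads) in range(4)        # LRU of two random picks
    # weighted(loads) behaves like twobin but requires numpy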
108
108
109 #---------------------------------------------------------------------
109 #---------------------------------------------------------------------
110 # Classes
110 # Classes
111 #---------------------------------------------------------------------
111 #---------------------------------------------------------------------
112
112
113
113
114 # store empty default dependency:
114 # store empty default dependency:
115 MET = Dependency([])
115 MET = Dependency([])
116
116
117
117
118 class Job(object):
118 class Job(object):
119 """Simple container for a job"""
119 """Simple container for a job"""
120 def __init__(self, msg_id, raw_msg, idents, msg, header, metadata,
120 def __init__(self, msg_id, raw_msg, idents, msg, header, metadata,
121 targets, after, follow, timeout):
121 targets, after, follow, timeout):
122 self.msg_id = msg_id
122 self.msg_id = msg_id
123 self.raw_msg = raw_msg
123 self.raw_msg = raw_msg
124 self.idents = idents
124 self.idents = idents
125 self.msg = msg
125 self.msg = msg
126 self.header = header
126 self.header = header
127 self.metadata = metadata
127 self.metadata = metadata
128 self.targets = targets
128 self.targets = targets
129 self.after = after
129 self.after = after
130 self.follow = follow
130 self.follow = follow
131 self.timeout = timeout
131 self.timeout = timeout
132
132
133 self.removed = False # used for lazy-delete from sorted queue
133 self.removed = False # used for lazy-delete from sorted queue
134 self.timestamp = time.time()
134 self.timestamp = time.time()
135 self.timeout_id = 0
135 self.timeout_id = 0
136 self.blacklist = set()
136 self.blacklist = set()
137
137
138 def __lt__(self, other):
138 def __lt__(self, other):
139 return self.timestamp < other.timestamp
139 return self.timestamp < other.timestamp
140
140
141 def __cmp__(self, other):
141 def __cmp__(self, other):
142 return cmp(self.timestamp, other.timestamp)
142 return cmp(self.timestamp, other.timestamp)
143
143
144 @property
144 @property
145 def dependents(self):
145 def dependents(self):
146 return self.follow.union(self.after)
146 return self.follow.union(self.after)
147
147
148
148
149 class TaskScheduler(SessionFactory):
149 class TaskScheduler(SessionFactory):
150 """Python TaskScheduler object.
150 """Python TaskScheduler object.
151
151
152 This is the simplest object that supports msg_id based
152 This is the simplest object that supports msg_id based
153 DAG dependencies. *Only* task msg_ids are checked, not
153 DAG dependencies. *Only* task msg_ids are checked, not
154 msg_ids of jobs submitted via the MUX queue.
154 msg_ids of jobs submitted via the MUX queue.
155
155
156 """
156 """
157
157
158 hwm = Integer(1, config=True,
158 hwm = Integer(1, config=True,
159 help="""specify the High Water Mark (HWM) for the downstream
159 help="""specify the High Water Mark (HWM) for the downstream
160 socket in the Task scheduler. This is the maximum number
160 socket in the Task scheduler. This is the maximum number
161 of allowed outstanding tasks on each engine.
161 of allowed outstanding tasks on each engine.
162
162
163 The default (1) means that only one task can be outstanding on each
163 The default (1) means that only one task can be outstanding on each
164 engine. Setting TaskScheduler.hwm=0 means there is no limit, and the
164 engine. Setting TaskScheduler.hwm=0 means there is no limit, and the
165 engines continue to be assigned tasks while they are working,
165 engines continue to be assigned tasks while they are working,
166 effectively hiding network latency behind computation, but can result
166 effectively hiding network latency behind computation, but can result
167 in an imbalance of work when submitting many heterogeneous tasks all at
167 in an imbalance of work when submitting many heterogeneous tasks all at
168 once. Any positive value greater than one is a compromise between the
168 once. Any positive value greater than one is a compromise between the
169 two.
169 two.
170
170
171 """
171 """
172 )
172 )
173 scheme_name = Enum(('leastload', 'pure', 'lru', 'plainrandom', 'weighted', 'twobin'),
173 scheme_name = Enum(('leastload', 'pure', 'lru', 'plainrandom', 'weighted', 'twobin'),
174 'leastload', config=True, allow_none=False,
174 'leastload', config=True, allow_none=False,
175 help="""select the task scheduler scheme [default: Python LRU]
175 help="""select the task scheduler scheme [default: Python LRU]
176 Options are: 'pure', 'lru', 'plainrandom', 'weighted', 'twobin','leastload'"""
176 Options are: 'pure', 'lru', 'plainrandom', 'weighted', 'twobin','leastload'"""
177 )
177 )
178 def _scheme_name_changed(self, old, new):
178 def _scheme_name_changed(self, old, new):
179 self.log.debug("Using scheme %r"%new)
179 self.log.debug("Using scheme %r"%new)
180 self.scheme = globals()[new]
180 self.scheme = globals()[new]
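Both traits above are config=True; a hypothetical ipcontroller_config.py could tune them like so:

    c = get_config()                        # provided in IPython config files
    c.TaskScheduler.hwm = 0                 # no per-engine limit on outstanding tasks
    c.TaskScheduler.scheme_name = 'twobin'  # pick two at random, use the LRU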
181
181
182 # input arguments:
182 # input arguments:
183 scheme = Instance(FunctionType) # function for determining the destination
183 scheme = Instance(FunctionType) # function for determining the destination
184 def _scheme_default(self):
184 def _scheme_default(self):
185 return leastload
185 return leastload
186 client_stream = Instance(zmqstream.ZMQStream) # client-facing stream
186 client_stream = Instance(zmqstream.ZMQStream) # client-facing stream
187 engine_stream = Instance(zmqstream.ZMQStream) # engine-facing stream
187 engine_stream = Instance(zmqstream.ZMQStream) # engine-facing stream
188 notifier_stream = Instance(zmqstream.ZMQStream) # hub-facing sub stream
188 notifier_stream = Instance(zmqstream.ZMQStream) # hub-facing sub stream
189 mon_stream = Instance(zmqstream.ZMQStream) # hub-facing pub stream
189 mon_stream = Instance(zmqstream.ZMQStream) # hub-facing pub stream
190 query_stream = Instance(zmqstream.ZMQStream) # hub-facing DEALER stream
190 query_stream = Instance(zmqstream.ZMQStream) # hub-facing DEALER stream
191
191
192 # internals:
192 # internals:
193 queue = Instance(deque) # sorted list of Jobs
193 queue = Instance(deque) # sorted list of Jobs
194 def _queue_default(self):
194 def _queue_default(self):
195 return deque()
195 return deque()
196 queue_map = Dict() # dict by msg_id of Jobs (for O(1) access to the Queue)
196 queue_map = Dict() # dict by msg_id of Jobs (for O(1) access to the Queue)
197 graph = Dict() # dict by msg_id of [ msg_ids that depend on key ]
197 graph = Dict() # dict by msg_id of [ msg_ids that depend on key ]
198 retries = Dict() # dict by msg_id of retries remaining (non-neg ints)
198 retries = Dict() # dict by msg_id of retries remaining (non-neg ints)
199 # waiting = List() # list of msg_ids ready to run, but haven't due to HWM
199 # waiting = List() # list of msg_ids ready to run, but haven't due to HWM
200 pending = Dict() # dict by engine_uuid of submitted tasks
200 pending = Dict() # dict by engine_uuid of submitted tasks
201 completed = Dict() # dict by engine_uuid of completed tasks
201 completed = Dict() # dict by engine_uuid of completed tasks
202 failed = Dict() # dict by engine_uuid of failed tasks
202 failed = Dict() # dict by engine_uuid of failed tasks
203 destinations = Dict() # dict by msg_id of engine_uuids where jobs ran (reverse of completed+failed)
203 destinations = Dict() # dict by msg_id of engine_uuids where jobs ran (reverse of completed+failed)
204 clients = Dict() # dict by msg_id for who submitted the task
204 clients = Dict() # dict by msg_id for who submitted the task
205 targets = List() # list of target IDENTs
205 targets = List() # list of target IDENTs
206 loads = List() # list of engine loads
206 loads = List() # list of engine loads
207 # full = Set() # set of IDENTs that have HWM outstanding tasks
207 # full = Set() # set of IDENTs that have HWM outstanding tasks
208 all_completed = Set() # set of all completed tasks
208 all_completed = Set() # set of all completed tasks
209 all_failed = Set() # set of all failed tasks
209 all_failed = Set() # set of all failed tasks
210 all_done = Set() # set of all finished tasks=union(completed,failed)
210 all_done = Set() # set of all finished tasks=union(completed,failed)
211 all_ids = Set() # set of all submitted task IDs
211 all_ids = Set() # set of all submitted task IDs
212
212
213 ident = CBytes() # ZMQ identity. This should just be self.session.session
213 ident = CBytes() # ZMQ identity. This should just be self.session.session
214 # but ensure Bytes
214 # but ensure Bytes
215 def _ident_default(self):
215 def _ident_default(self):
216 return self.session.bsession
216 return self.session.bsession
217
217
218 def start(self):
218 def start(self):
219 self.query_stream.on_recv(self.dispatch_query_reply)
219 self.query_stream.on_recv(self.dispatch_query_reply)
220 self.session.send(self.query_stream, "connection_request", {})
220 self.session.send(self.query_stream, "connection_request", {})
221
221
222 self.engine_stream.on_recv(self.dispatch_result, copy=False)
222 self.engine_stream.on_recv(self.dispatch_result, copy=False)
223 self.client_stream.on_recv(self.dispatch_submission, copy=False)
223 self.client_stream.on_recv(self.dispatch_submission, copy=False)
224
224
225 self._notification_handlers = dict(
225 self._notification_handlers = dict(
226 registration_notification = self._register_engine,
226 registration_notification = self._register_engine,
227 unregistration_notification = self._unregister_engine
227 unregistration_notification = self._unregister_engine
228 )
228 )
229 self.notifier_stream.on_recv(self.dispatch_notification)
229 self.notifier_stream.on_recv(self.dispatch_notification)
230 self.log.info("Scheduler started [%s]" % self.scheme_name)
230 self.log.info("Scheduler started [%s]" % self.scheme_name)
231
231
232 def resume_receiving(self):
232 def resume_receiving(self):
233 """Resume accepting jobs."""
233 """Resume accepting jobs."""
234 self.client_stream.on_recv(self.dispatch_submission, copy=False)
234 self.client_stream.on_recv(self.dispatch_submission, copy=False)
235
235
236 def stop_receiving(self):
236 def stop_receiving(self):
237 """Stop accepting jobs while there are no engines.
237 """Stop accepting jobs while there are no engines.
238 Leave them in the ZMQ queue."""
238 Leave them in the ZMQ queue."""
239 self.client_stream.on_recv(None)
239 self.client_stream.on_recv(None)
240
240
241 #-----------------------------------------------------------------------
241 #-----------------------------------------------------------------------
242 # [Un]Registration Handling
242 # [Un]Registration Handling
243 #-----------------------------------------------------------------------
243 #-----------------------------------------------------------------------
244
244
245
245
246 def dispatch_query_reply(self, msg):
246 def dispatch_query_reply(self, msg):
247 """handle reply to our initial connection request"""
247 """handle reply to our initial connection request"""
248 try:
248 try:
249 idents,msg = self.session.feed_identities(msg)
249 idents,msg = self.session.feed_identities(msg)
250 except ValueError:
250 except ValueError:
251 self.log.warn("task::Invalid Message: %r",msg)
251 self.log.warn("task::Invalid Message: %r",msg)
252 return
252 return
253 try:
253 try:
254 msg = self.session.unserialize(msg)
254 msg = self.session.unserialize(msg)
255 except ValueError:
255 except ValueError:
256 self.log.warn("task::Unauthorized message from: %r"%idents)
256 self.log.warn("task::Unauthorized message from: %r"%idents)
257 return
257 return
258
258
259 content = msg['content']
259 content = msg['content']
260 for uuid in content.get('engines', {}).values():
260 for uuid in content.get('engines', {}).values():
261 self._register_engine(cast_bytes(uuid))
261 self._register_engine(cast_bytes(uuid))
262
262
263
263
264 @util.log_errors
264 @util.log_errors
265 def dispatch_notification(self, msg):
265 def dispatch_notification(self, msg):
266 """dispatch register/unregister events."""
266 """dispatch register/unregister events."""
267 try:
267 try:
268 idents,msg = self.session.feed_identities(msg)
268 idents,msg = self.session.feed_identities(msg)
269 except ValueError:
269 except ValueError:
270 self.log.warn("task::Invalid Message: %r",msg)
270 self.log.warn("task::Invalid Message: %r",msg)
271 return
271 return
272 try:
272 try:
273 msg = self.session.unserialize(msg)
273 msg = self.session.unserialize(msg)
274 except ValueError:
274 except ValueError:
275 self.log.warn("task::Unauthorized message from: %r"%idents)
275 self.log.warn("task::Unauthorized message from: %r"%idents)
276 return
276 return
277
277
278 msg_type = msg['header']['msg_type']
278 msg_type = msg['header']['msg_type']
279
279
280 handler = self._notification_handlers.get(msg_type, None)
280 handler = self._notification_handlers.get(msg_type, None)
281 if handler is None:
281 if handler is None:
282 self.log.error("Unhandled message type: %r"%msg_type)
282 self.log.error("Unhandled message type: %r"%msg_type)
283 else:
283 else:
284 try:
284 try:
285 handler(cast_bytes(msg['content']['uuid']))
285 handler(cast_bytes(msg['content']['uuid']))
286 except Exception:
286 except Exception:
287 self.log.error("task::Invalid notification msg: %r", msg, exc_info=True)
287 self.log.error("task::Invalid notification msg: %r", msg, exc_info=True)
288
288
289 def _register_engine(self, uid):
289 def _register_engine(self, uid):
290 """New engine with ident `uid` became available."""
290 """New engine with ident `uid` became available."""
291 # head of the line:
291 # head of the line:
292 self.targets.insert(0,uid)
292 self.targets.insert(0,uid)
293 self.loads.insert(0,0)
293 self.loads.insert(0,0)
294
294
295 # initialize sets
295 # initialize sets
296 self.completed[uid] = set()
296 self.completed[uid] = set()
297 self.failed[uid] = set()
297 self.failed[uid] = set()
298 self.pending[uid] = {}
298 self.pending[uid] = {}
299
299
300 # rescan the graph:
300 # rescan the graph:
301 self.update_graph(None)
301 self.update_graph(None)
302
302
303 def _unregister_engine(self, uid):
303 def _unregister_engine(self, uid):
304 """Existing engine with ident `uid` became unavailable."""
304 """Existing engine with ident `uid` became unavailable."""
305 if len(self.targets) == 1:
305 if len(self.targets) == 1:
306 # this was our only engine
306 # this was our only engine
307 pass
307 pass
308
308
309 # handle any potentially finished tasks:
309 # handle any potentially finished tasks:
310 self.engine_stream.flush()
310 self.engine_stream.flush()
311
311
312 # don't pop destinations, because they might be used later
312 # don't pop destinations, because they might be used later
313 # map(self.destinations.pop, self.completed.pop(uid))
313 # map(self.destinations.pop, self.completed.pop(uid))
314 # map(self.destinations.pop, self.failed.pop(uid))
314 # map(self.destinations.pop, self.failed.pop(uid))
315
315
316 # prevent this engine from receiving work
316 # prevent this engine from receiving work
317 idx = self.targets.index(uid)
317 idx = self.targets.index(uid)
318 self.targets.pop(idx)
318 self.targets.pop(idx)
319 self.loads.pop(idx)
319 self.loads.pop(idx)
320
320
321 # wait 5 seconds before cleaning up pending jobs, since the results might
321 # wait 5 seconds before cleaning up pending jobs, since the results might
322 # still be incoming
322 # still be incoming
323 if self.pending[uid]:
323 if self.pending[uid]:
324 dc = ioloop.DelayedCallback(lambda : self.handle_stranded_tasks(uid), 5000, self.loop)
324 self.loop.add_timeout(self.loop.time() + 5,
325 dc.start()
325 lambda : self.handle_stranded_tasks(uid),
326 )
326 else:
327 else:
327 self.completed.pop(uid)
328 self.completed.pop(uid)
328 self.failed.pop(uid)
329 self.failed.pop(uid)
329
330
330
331
331 def handle_stranded_tasks(self, engine):
332 def handle_stranded_tasks(self, engine):
332 """Deal with jobs resident in an engine that died."""
333 """Deal with jobs resident in an engine that died."""
333 lost = self.pending[engine]
334 lost = self.pending[engine]
334 for msg_id in lost.keys():
335 for msg_id in lost.keys():
335 if msg_id not in self.pending[engine]:
336 if msg_id not in self.pending[engine]:
336 # prevent double-handling of messages
337 # prevent double-handling of messages
337 continue
338 continue
338
339
339 raw_msg = lost[msg_id].raw_msg
340 raw_msg = lost[msg_id].raw_msg
340 idents,msg = self.session.feed_identities(raw_msg, copy=False)
341 idents,msg = self.session.feed_identities(raw_msg, copy=False)
341 parent = self.session.unpack(msg[1].bytes)
342 parent = self.session.unpack(msg[1].bytes)
342 idents = [engine, idents[0]]
343 idents = [engine, idents[0]]
343
344
344 # build fake error reply
345 # build fake error reply
345 try:
346 try:
346 raise error.EngineError("Engine %r died while running task %r"%(engine, msg_id))
347 raise error.EngineError("Engine %r died while running task %r"%(engine, msg_id))
347 except:
348 except:
348 content = error.wrap_exception()
349 content = error.wrap_exception()
349 # build fake metadata
350 # build fake metadata
350 md = dict(
351 md = dict(
351 status=u'error',
352 status=u'error',
352 engine=engine.decode('ascii'),
353 engine=engine.decode('ascii'),
353 date=datetime.now(),
354 date=datetime.now(),
354 )
355 )
355 msg = self.session.msg('apply_reply', content, parent=parent, metadata=md)
356 msg = self.session.msg('apply_reply', content, parent=parent, metadata=md)
356 raw_reply = list(map(zmq.Message, self.session.serialize(msg, ident=idents)))
357 raw_reply = list(map(zmq.Message, self.session.serialize(msg, ident=idents)))
357 # and dispatch it
358 # and dispatch it
358 self.dispatch_result(raw_reply)
359 self.dispatch_result(raw_reply)
359
360
360 # finally scrub completed/failed lists
361 # finally scrub completed/failed lists
361 self.completed.pop(engine)
362 self.completed.pop(engine)
362 self.failed.pop(engine)
363 self.failed.pop(engine)
363
364
364
365
365 #-----------------------------------------------------------------------
366 #-----------------------------------------------------------------------
366 # Job Submission
367 # Job Submission
367 #-----------------------------------------------------------------------
368 #-----------------------------------------------------------------------
368
369
369
370
370 @util.log_errors
371 @util.log_errors
371 def dispatch_submission(self, raw_msg):
372 def dispatch_submission(self, raw_msg):
372 """Dispatch job submission to appropriate handlers."""
373 """Dispatch job submission to appropriate handlers."""
373 # ensure targets up to date:
374 # ensure targets up to date:
374 self.notifier_stream.flush()
375 self.notifier_stream.flush()
375 try:
376 try:
376 idents, msg = self.session.feed_identities(raw_msg, copy=False)
377 idents, msg = self.session.feed_identities(raw_msg, copy=False)
377 msg = self.session.unserialize(msg, content=False, copy=False)
378 msg = self.session.unserialize(msg, content=False, copy=False)
378 except Exception:
379 except Exception:
379 self.log.error("task::Invaid task msg: %r"%raw_msg, exc_info=True)
380 self.log.error("task::Invaid task msg: %r"%raw_msg, exc_info=True)
380 return
381 return
381
382
382
383
383 # send to monitor
384 # send to monitor
384 self.mon_stream.send_multipart([b'intask']+raw_msg, copy=False)
385 self.mon_stream.send_multipart([b'intask']+raw_msg, copy=False)
385
386
386 header = msg['header']
387 header = msg['header']
387 md = msg['metadata']
388 md = msg['metadata']
388 msg_id = header['msg_id']
389 msg_id = header['msg_id']
389 self.all_ids.add(msg_id)
390 self.all_ids.add(msg_id)
390
391
391 # get targets as a set of bytes objects
392 # get targets as a set of bytes objects
392 # from a list of unicode objects
393 # from a list of unicode objects
393 targets = md.get('targets', [])
394 targets = md.get('targets', [])
394 targets = set(map(cast_bytes, targets))
395 targets = set(map(cast_bytes, targets))
395
396
396 retries = md.get('retries', 0)
397 retries = md.get('retries', 0)
397 self.retries[msg_id] = retries
398 self.retries[msg_id] = retries
398
399
399 # time dependencies
400 # time dependencies
400 after = md.get('after', None)
401 after = md.get('after', None)
401 if after:
402 if after:
402 after = Dependency(after)
403 after = Dependency(after)
403 if after.all:
404 if after.all:
404 if after.success:
405 if after.success:
405 after = Dependency(after.difference(self.all_completed),
406 after = Dependency(after.difference(self.all_completed),
406 success=after.success,
407 success=after.success,
407 failure=after.failure,
408 failure=after.failure,
408 all=after.all,
409 all=after.all,
409 )
410 )
410 if after.failure:
411 if after.failure:
411 after = Dependency(after.difference(self.all_failed),
412 after = Dependency(after.difference(self.all_failed),
412 success=after.success,
413 success=after.success,
413 failure=after.failure,
414 failure=after.failure,
414 all=after.all,
415 all=after.all,
415 )
416 )
416 if after.check(self.all_completed, self.all_failed):
417 if after.check(self.all_completed, self.all_failed):
417 # recast as empty set, if `after` already met,
418 # recast as empty set, if `after` already met,
418 # to prevent unnecessary set comparisons
419 # to prevent unnecessary set comparisons
419 after = MET
420 after = MET
420 else:
421 else:
421 after = MET
422 after = MET
422
423
423 # location dependencies
424 # location dependencies
424 follow = Dependency(md.get('follow', []))
425 follow = Dependency(md.get('follow', []))
425
426
426 timeout = md.get('timeout', None)
427 timeout = md.get('timeout', None)
427 if timeout:
428 if timeout:
428 timeout = float(timeout)
429 timeout = float(timeout)
429
430
430 job = Job(msg_id=msg_id, raw_msg=raw_msg, idents=idents, msg=msg,
431 job = Job(msg_id=msg_id, raw_msg=raw_msg, idents=idents, msg=msg,
431 header=header, targets=targets, after=after, follow=follow,
432 header=header, targets=targets, after=after, follow=follow,
432 timeout=timeout, metadata=md,
433 timeout=timeout, metadata=md,
433 )
434 )
434 # validate and reduce dependencies:
435 # validate and reduce dependencies:
435 for dep in after,follow:
436 for dep in after,follow:
436 if not dep: # empty dependency
437 if not dep: # empty dependency
437 continue
438 continue
438 # check valid:
439 # check valid:
439 if msg_id in dep or dep.difference(self.all_ids):
440 if msg_id in dep or dep.difference(self.all_ids):
440 self.queue_map[msg_id] = job
441 self.queue_map[msg_id] = job
441 return self.fail_unreachable(msg_id, error.InvalidDependency)
442 return self.fail_unreachable(msg_id, error.InvalidDependency)
442 # check if unreachable:
443 # check if unreachable:
443 if dep.unreachable(self.all_completed, self.all_failed):
444 if dep.unreachable(self.all_completed, self.all_failed):
444 self.queue_map[msg_id] = job
445 self.queue_map[msg_id] = job
445 return self.fail_unreachable(msg_id)
446 return self.fail_unreachable(msg_id)
446
447
447 if after.check(self.all_completed, self.all_failed):
448 if after.check(self.all_completed, self.all_failed):
448 # time deps already met, try to run
449 # time deps already met, try to run
449 if not self.maybe_run(job):
450 if not self.maybe_run(job):
450 # can't run yet
451 # can't run yet
451 if msg_id not in self.all_failed:
452 if msg_id not in self.all_failed:
452 # could have failed as unreachable
453 # could have failed as unreachable
453 self.save_unmet(job)
454 self.save_unmet(job)
454 else:
455 else:
455 self.save_unmet(job)
456 self.save_unmet(job)
456
457
457 def job_timeout(self, job, timeout_id):
458 def job_timeout(self, job, timeout_id):
458 """callback for a job's timeout.
459 """callback for a job's timeout.
459
460
460 The job may or may not have been run at this point.
461 The job may or may not have been run at this point.
461 """
462 """
462 if job.timeout_id != timeout_id:
463 if job.timeout_id != timeout_id:
463 # not the most recent call
464 # not the most recent call
464 return
465 return
465 now = time.time()
466 now = time.time()
466 if job.timeout >= (now + 1):
467 if job.timeout >= (now + 1):
467 self.log.warn("task %s timeout fired prematurely: %s > %s",
468 self.log.warn("task %s timeout fired prematurely: %s > %s",
468 job.msg_id, job.timeout, now
469 job.msg_id, job.timeout, now
469 )
470 )
470 if job.msg_id in self.queue_map:
471 if job.msg_id in self.queue_map:
471 # still waiting, but ran out of time
472 # still waiting, but ran out of time
472 self.log.info("task %r timed out", job.msg_id)
473 self.log.info("task %r timed out", job.msg_id)
473 self.fail_unreachable(job.msg_id, error.TaskTimeout)
474 self.fail_unreachable(job.msg_id, error.TaskTimeout)
474
475
475 def fail_unreachable(self, msg_id, why=error.ImpossibleDependency):
476 def fail_unreachable(self, msg_id, why=error.ImpossibleDependency):
476 """a task has become unreachable, send a reply with an ImpossibleDependency
477 """a task has become unreachable, send a reply with an ImpossibleDependency
477 error."""
478 error."""
478 if msg_id not in self.queue_map:
479 if msg_id not in self.queue_map:
479 self.log.error("task %r already failed!", msg_id)
480 self.log.error("task %r already failed!", msg_id)
480 return
481 return
481 job = self.queue_map.pop(msg_id)
482 job = self.queue_map.pop(msg_id)
482 # lazy-delete from the queue
483 # lazy-delete from the queue
483 job.removed = True
484 job.removed = True
484 for mid in job.dependents:
485 for mid in job.dependents:
485 if mid in self.graph:
486 if mid in self.graph:
486 self.graph[mid].remove(msg_id)
487 self.graph[mid].remove(msg_id)
487
488
488 try:
489 try:
489 raise why()
490 raise why()
490 except:
491 except:
491 content = error.wrap_exception()
492 content = error.wrap_exception()
492 self.log.debug("task %r failing as unreachable with: %s", msg_id, content['ename'])
493 self.log.debug("task %r failing as unreachable with: %s", msg_id, content['ename'])
493
494
494 self.all_done.add(msg_id)
495 self.all_done.add(msg_id)
495 self.all_failed.add(msg_id)
496 self.all_failed.add(msg_id)
496
497
497 msg = self.session.send(self.client_stream, 'apply_reply', content,
498 msg = self.session.send(self.client_stream, 'apply_reply', content,
498 parent=job.header, ident=job.idents)
499 parent=job.header, ident=job.idents)
499 self.session.send(self.mon_stream, msg, ident=[b'outtask']+job.idents)
500 self.session.send(self.mon_stream, msg, ident=[b'outtask']+job.idents)
500
501
501 self.update_graph(msg_id, success=False)
502 self.update_graph(msg_id, success=False)
502
503
503 def available_engines(self):
504 def available_engines(self):
504 """return a list of available engine indices based on HWM"""
505 """return a list of available engine indices based on HWM"""
505 if not self.hwm:
506 if not self.hwm:
506 return list(range(len(self.targets)))
507 return list(range(len(self.targets)))
507 available = []
508 available = []
508 for idx in range(len(self.targets)):
509 for idx in range(len(self.targets)):
509 if self.loads[idx] < self.hwm:
510 if self.loads[idx] < self.hwm:
510 available.append(idx)
511 available.append(idx)
511 return available
512 return available
512
513
513 def maybe_run(self, job):
514 def maybe_run(self, job):
514 """check location dependencies, and run if they are met."""
515 """check location dependencies, and run if they are met."""
515 msg_id = job.msg_id
516 msg_id = job.msg_id
516 self.log.debug("Attempting to assign task %s", msg_id)
517 self.log.debug("Attempting to assign task %s", msg_id)
517 available = self.available_engines()
518 available = self.available_engines()
518 if not available:
519 if not available:
519 # no engines, definitely can't run
520 # no engines, definitely can't run
520 return False
521 return False
521
522
522 if job.follow or job.targets or job.blacklist or self.hwm:
523 if job.follow or job.targets or job.blacklist or self.hwm:
523 # we need a can_run filter
524 # we need a can_run filter
524 def can_run(idx):
525 def can_run(idx):
525 # check hwm
526 # check hwm
526 if self.hwm and self.loads[idx] == self.hwm:
527 if self.hwm and self.loads[idx] == self.hwm:
527 return False
528 return False
528 target = self.targets[idx]
529 target = self.targets[idx]
529 # check blacklist
530 # check blacklist
530 if target in job.blacklist:
531 if target in job.blacklist:
531 return False
532 return False
532 # check targets
533 # check targets
533 if job.targets and target not in job.targets:
534 if job.targets and target not in job.targets:
534 return False
535 return False
535 # check follow
536 # check follow
536 return job.follow.check(self.completed[target], self.failed[target])
537 return job.follow.check(self.completed[target], self.failed[target])
537
538
538 indices = list(filter(can_run, available))
539 indices = list(filter(can_run, available))
539
540
540 if not indices:
541 if not indices:
541 # couldn't run
542 # couldn't run
542 if job.follow.all:
543 if job.follow.all:
543 # check follow for impossibility
544 # check follow for impossibility
544 dests = set()
545 dests = set()
545 relevant = set()
546 relevant = set()
546 if job.follow.success:
547 if job.follow.success:
547 relevant = self.all_completed
548 relevant = self.all_completed
548 if job.follow.failure:
549 if job.follow.failure:
549 relevant = relevant.union(self.all_failed)
550 relevant = relevant.union(self.all_failed)
550 for m in job.follow.intersection(relevant):
551 for m in job.follow.intersection(relevant):
551 dests.add(self.destinations[m])
552 dests.add(self.destinations[m])
552 if len(dests) > 1:
553 if len(dests) > 1:
553 self.queue_map[msg_id] = job
554 self.queue_map[msg_id] = job
554 self.fail_unreachable(msg_id)
555 self.fail_unreachable(msg_id)
555 return False
556 return False
556 if job.targets:
557 if job.targets:
557 # check blacklist+targets for impossibility
558 # check blacklist+targets for impossibility
558 job.targets.difference_update(job.blacklist)
559 job.targets.difference_update(job.blacklist)
559 if not job.targets or not job.targets.intersection(self.targets):
560 if not job.targets or not job.targets.intersection(self.targets):
560 self.queue_map[msg_id] = job
561 self.queue_map[msg_id] = job
561 self.fail_unreachable(msg_id)
562 self.fail_unreachable(msg_id)
562 return False
563 return False
563 return False
564 return False
564 else:
565 else:
565 indices = None
566 indices = None
566
567
567 self.submit_task(job, indices)
568 self.submit_task(job, indices)
568 return True
569 return True
569
570
570 def save_unmet(self, job):
571 def save_unmet(self, job):
571 """Save a message for later submission when its dependencies are met."""
572 """Save a message for later submission when its dependencies are met."""
572 msg_id = job.msg_id
573 msg_id = job.msg_id
573 self.log.debug("Adding task %s to the queue", msg_id)
574 self.log.debug("Adding task %s to the queue", msg_id)
574 self.queue_map[msg_id] = job
575 self.queue_map[msg_id] = job
575 self.queue.append(job)
576 self.queue.append(job)
576 # track the ids in follow or after, but not those already finished
577 # track the ids in follow or after, but not those already finished
577 for dep_id in job.after.union(job.follow).difference(self.all_done):
578 for dep_id in job.after.union(job.follow).difference(self.all_done):
578 if dep_id not in self.graph:
579 if dep_id not in self.graph:
579 self.graph[dep_id] = set()
580 self.graph[dep_id] = set()
580 self.graph[dep_id].add(msg_id)
581 self.graph[dep_id].add(msg_id)
581
582
582 # schedule timeout callback
583 # schedule timeout callback
583 if job.timeout:
584 if job.timeout:
584 timeout_id = job.timeout_id = job.timeout_id + 1
585 timeout_id = job.timeout_id = job.timeout_id + 1
585 self.loop.add_timeout(time.time() + job.timeout,
586 self.loop.add_timeout(time.time() + job.timeout,
586 lambda : self.job_timeout(job, timeout_id)
587 lambda : self.job_timeout(job, timeout_id)
587 )
588 )
588
589
589
590
590 def submit_task(self, job, indices=None):
591 def submit_task(self, job, indices=None):
591 """Submit a task to any of a subset of our targets."""
592 """Submit a task to any of a subset of our targets."""
592 if indices:
593 if indices:
593 loads = [self.loads[i] for i in indices]
594 loads = [self.loads[i] for i in indices]
594 else:
595 else:
595 loads = self.loads
596 loads = self.loads
596 idx = self.scheme(loads)
597 idx = self.scheme(loads)
597 if indices:
598 if indices:
598 idx = indices[idx]
599 idx = indices[idx]
599 target = self.targets[idx]
600 target = self.targets[idx]
600 # print (target, map(str, msg[:3]))
601 # print (target, map(str, msg[:3]))
601 # send job to the engine
602 # send job to the engine
602 self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False)
603 self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False)
603 self.engine_stream.send_multipart(job.raw_msg, copy=False)
604 self.engine_stream.send_multipart(job.raw_msg, copy=False)
604 # update load
605 # update load
605 self.add_job(idx)
606 self.add_job(idx)
606 self.pending[target][job.msg_id] = job
607 self.pending[target][job.msg_id] = job
607 # notify Hub
608 # notify Hub
608 content = dict(msg_id=job.msg_id, engine_id=target.decode('ascii'))
609 content = dict(msg_id=job.msg_id, engine_id=target.decode('ascii'))
609 self.session.send(self.mon_stream, 'task_destination', content=content,
610 self.session.send(self.mon_stream, 'task_destination', content=content,
610 ident=[b'tracktask',self.ident])
611 ident=[b'tracktask',self.ident])
611
612
612
613
613 #-----------------------------------------------------------------------
614 #-----------------------------------------------------------------------
614 # Result Handling
615 # Result Handling
615 #-----------------------------------------------------------------------
616 #-----------------------------------------------------------------------
616
617
617
618
618 @util.log_errors
619 @util.log_errors
619 def dispatch_result(self, raw_msg):
620 def dispatch_result(self, raw_msg):
620 """dispatch method for result replies"""
621 """dispatch method for result replies"""
621 try:
622 try:
622 idents,msg = self.session.feed_identities(raw_msg, copy=False)
623 idents,msg = self.session.feed_identities(raw_msg, copy=False)
623 msg = self.session.unserialize(msg, content=False, copy=False)
624 msg = self.session.unserialize(msg, content=False, copy=False)
624 engine = idents[0]
625 engine = idents[0]
625 try:
626 try:
626 idx = self.targets.index(engine)
627 idx = self.targets.index(engine)
627 except ValueError:
628 except ValueError:
628 pass # skip load-update for dead engines
629 pass # skip load-update for dead engines
629 else:
630 else:
630 self.finish_job(idx)
631 self.finish_job(idx)
631 except Exception:
632 except Exception:
632 self.log.error("task::Invalid result: %r", raw_msg, exc_info=True)
633 self.log.error("task::Invalid result: %r", raw_msg, exc_info=True)
633 return
634 return
634
635
635 md = msg['metadata']
636 md = msg['metadata']
636 parent = msg['parent_header']
637 parent = msg['parent_header']
637 if md.get('dependencies_met', True):
638 if md.get('dependencies_met', True):
638 success = (md['status'] == 'ok')
639 success = (md['status'] == 'ok')
639 msg_id = parent['msg_id']
640 msg_id = parent['msg_id']
640 retries = self.retries[msg_id]
641 retries = self.retries[msg_id]
641 if not success and retries > 0:
642 if not success and retries > 0:
642 # failed
643 # failed
643 self.retries[msg_id] = retries - 1
644 self.retries[msg_id] = retries - 1
644 self.handle_unmet_dependency(idents, parent)
645 self.handle_unmet_dependency(idents, parent)
645 else:
646 else:
646 del self.retries[msg_id]
647 del self.retries[msg_id]
647 # relay to client and update graph
648 # relay to client and update graph
648 self.handle_result(idents, parent, raw_msg, success)
649 self.handle_result(idents, parent, raw_msg, success)
649 # send to Hub monitor
650 # send to Hub monitor
650 self.mon_stream.send_multipart([b'outtask']+raw_msg, copy=False)
651 self.mon_stream.send_multipart([b'outtask']+raw_msg, copy=False)
651 else:
652 else:
652 self.handle_unmet_dependency(idents, parent)
653 self.handle_unmet_dependency(idents, parent)
653
654
654 def handle_result(self, idents, parent, raw_msg, success=True):
655 def handle_result(self, idents, parent, raw_msg, success=True):
655 """handle a real task result, either success or failure"""
656 """handle a real task result, either success or failure"""
656 # first, relay result to client
657 # first, relay result to client
657 engine = idents[0]
658 engine = idents[0]
658 client = idents[1]
659 client = idents[1]
659 # swap_ids for ROUTER-ROUTER mirror
660 # swap_ids for ROUTER-ROUTER mirror
660 raw_msg[:2] = [client,engine]
661 raw_msg[:2] = [client,engine]
661 # print (map(str, raw_msg[:4]))
662 # print (map(str, raw_msg[:4]))
662 self.client_stream.send_multipart(raw_msg, copy=False)
663 self.client_stream.send_multipart(raw_msg, copy=False)
663 # now, update our data structures
664 # now, update our data structures
664 msg_id = parent['msg_id']
665 msg_id = parent['msg_id']
665 self.pending[engine].pop(msg_id)
666 self.pending[engine].pop(msg_id)
666 if success:
667 if success:
667 self.completed[engine].add(msg_id)
668 self.completed[engine].add(msg_id)
668 self.all_completed.add(msg_id)
669 self.all_completed.add(msg_id)
669 else:
670 else:
670 self.failed[engine].add(msg_id)
671 self.failed[engine].add(msg_id)
671 self.all_failed.add(msg_id)
672 self.all_failed.add(msg_id)
672 self.all_done.add(msg_id)
673 self.all_done.add(msg_id)
673 self.destinations[msg_id] = engine
674 self.destinations[msg_id] = engine
674
675
675 self.update_graph(msg_id, success)
676 self.update_graph(msg_id, success)
676
677
677 def handle_unmet_dependency(self, idents, parent):
678 def handle_unmet_dependency(self, idents, parent):
678 """handle an unmet dependency"""
679 """handle an unmet dependency"""
679 engine = idents[0]
680 engine = idents[0]
680 msg_id = parent['msg_id']
681 msg_id = parent['msg_id']
681
682
682 job = self.pending[engine].pop(msg_id)
683 job = self.pending[engine].pop(msg_id)
683 job.blacklist.add(engine)
684 job.blacklist.add(engine)
684
685
685 if job.blacklist == job.targets:
686 if job.blacklist == job.targets:
686 self.queue_map[msg_id] = job
687 self.queue_map[msg_id] = job
687 self.fail_unreachable(msg_id)
688 self.fail_unreachable(msg_id)
688 elif not self.maybe_run(job):
689 elif not self.maybe_run(job):
689 # resubmit failed
690 # resubmit failed
690 if msg_id not in self.all_failed:
691 if msg_id not in self.all_failed:
691 # put it back in our dependency tree
692 # put it back in our dependency tree
692 self.save_unmet(job)
693 self.save_unmet(job)
693
694
694 if self.hwm:
695 if self.hwm:
695 try:
696 try:
696 idx = self.targets.index(engine)
697 idx = self.targets.index(engine)
697 except ValueError:
698 except ValueError:
698 pass # skip load-update for dead engines
699 pass # skip load-update for dead engines
699 else:
700 else:
700 if self.loads[idx] == self.hwm-1:
701 if self.loads[idx] == self.hwm-1:
701 self.update_graph(None)
702 self.update_graph(None)
702
703
703 def update_graph(self, dep_id=None, success=True):
704 def update_graph(self, dep_id=None, success=True):
704 """dep_id just finished. Update our dependency
705 """dep_id just finished. Update our dependency
705 graph and submit any jobs that just became runnable.
706 graph and submit any jobs that just became runnable.
706
707
707 Called with dep_id=None to update entire graph for hwm, but without finishing a task.
708 Called with dep_id=None to update entire graph for hwm, but without finishing a task.
708 """
709 """
709 # print ("\n\n***********")
710 # print ("\n\n***********")
710 # pprint (dep_id)
711 # pprint (dep_id)
711 # pprint (self.graph)
712 # pprint (self.graph)
712 # pprint (self.queue_map)
713 # pprint (self.queue_map)
713 # pprint (self.all_completed)
714 # pprint (self.all_completed)
714 # pprint (self.all_failed)
715 # pprint (self.all_failed)
715 # print ("\n\n***********\n\n")
716 # print ("\n\n***********\n\n")
716 # update any jobs that depended on the dependency
717 # update any jobs that depended on the dependency
717 msg_ids = self.graph.pop(dep_id, [])
718 msg_ids = self.graph.pop(dep_id, [])
718
719
719 # recheck *all* jobs if
720 # recheck *all* jobs if
720 # a) we have HWM and an engine just became no longer full
721 # a) we have HWM and an engine just became no longer full
721 # or b) dep_id was given as None
722 # or b) dep_id was given as None
722
723
723 if dep_id is None or self.hwm and any( [ load==self.hwm-1 for load in self.loads ]):
724 if dep_id is None or self.hwm and any( [ load==self.hwm-1 for load in self.loads ]):
724 jobs = self.queue
725 jobs = self.queue
725 using_queue = True
726 using_queue = True
726 else:
727 else:
727 using_queue = False
728 using_queue = False
728 jobs = deque(sorted( self.queue_map[msg_id] for msg_id in msg_ids ))
729 jobs = deque(sorted( self.queue_map[msg_id] for msg_id in msg_ids ))
729
730
730 to_restore = []
731 to_restore = []
731 while jobs:
732 while jobs:
732 job = jobs.popleft()
733 job = jobs.popleft()
733 if job.removed:
734 if job.removed:
734 continue
735 continue
735 msg_id = job.msg_id
736 msg_id = job.msg_id
736
737
737 put_it_back = True
738 put_it_back = True
738
739
739 if job.after.unreachable(self.all_completed, self.all_failed)\
740 if job.after.unreachable(self.all_completed, self.all_failed)\
740 or job.follow.unreachable(self.all_completed, self.all_failed):
741 or job.follow.unreachable(self.all_completed, self.all_failed):
741 self.fail_unreachable(msg_id)
742 self.fail_unreachable(msg_id)
742 put_it_back = False
743 put_it_back = False
743
744
744 elif job.after.check(self.all_completed, self.all_failed): # time deps met, maybe run
745 elif job.after.check(self.all_completed, self.all_failed): # time deps met, maybe run
745 if self.maybe_run(job):
746 if self.maybe_run(job):
746 put_it_back = False
747 put_it_back = False
747 self.queue_map.pop(msg_id)
748 self.queue_map.pop(msg_id)
748 for mid in job.dependents:
749 for mid in job.dependents:
749 if mid in self.graph:
750 if mid in self.graph:
750 self.graph[mid].remove(msg_id)
751 self.graph[mid].remove(msg_id)
751
752
752 # abort the loop if we just filled up all of our engines.
753 # abort the loop if we just filled up all of our engines.
753 # avoids an O(N) operation in the situation of a full queue,
754 # avoids an O(N) operation in the situation of a full queue,
754 # where graph update is triggered as soon as an engine becomes
755 # where graph update is triggered as soon as an engine becomes
755 # non-full, and all tasks after the first are checked,
756 # non-full, and all tasks after the first are checked,
756 # even though they can't run.
757 # even though they can't run.
757 if not self.available_engines():
758 if not self.available_engines():
758 break
759 break
759
760
760 if using_queue and put_it_back:
761 if using_queue and put_it_back:
761 # popped a job from the queue but it neither ran nor failed,
762 # popped a job from the queue but it neither ran nor failed,
762 # so we need to put it back when we are done
763 # so we need to put it back when we are done
763 # make sure to_restore preserves the same ordering
764 # make sure to_restore preserves the same ordering
764 to_restore.append(job)
765 to_restore.append(job)
765
766
766 # put back any tasks we popped but didn't run
767 # put back any tasks we popped but didn't run
767 if using_queue:
768 if using_queue:
768 self.queue.extendleft(to_restore)
769 self.queue.extendleft(to_restore)
769
770
770 #----------------------------------------------------------------------
771 #----------------------------------------------------------------------
771 # methods to be overridden by subclasses
772 # methods to be overridden by subclasses
772 #----------------------------------------------------------------------
773 #----------------------------------------------------------------------
773
774
774 def add_job(self, idx):
775 def add_job(self, idx):
775 """Called after self.targets[idx] just got the job with header.
776 """Called after self.targets[idx] just got the job with header.
776 Override in subclasses. The default ordering is simple LRU.
777 Override in subclasses. The default ordering is simple LRU.
777 The default loads are the number of outstanding jobs."""
778 The default loads are the number of outstanding jobs."""
778 self.loads[idx] += 1
779 self.loads[idx] += 1
779 for lis in (self.targets, self.loads):
780 for lis in (self.targets, self.loads):
780 lis.append(lis.pop(idx))
781 lis.append(lis.pop(idx))
781
782
782
783
783 def finish_job(self, idx):
784 def finish_job(self, idx):
784 """Called after self.targets[idx] just finished a job.
785 """Called after self.targets[idx] just finished a job.
785 Override in subclasses."""
786 Override in subclasses."""
786 self.loads[idx] -= 1
787 self.loads[idx] -= 1
787
788
788
789
789
790
790 def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, reg_addr, config=None,
791 def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, reg_addr, config=None,
791 logname='root', log_url=None, loglevel=logging.DEBUG,
792 logname='root', log_url=None, loglevel=logging.DEBUG,
792 identity=b'task', in_thread=False):
793 identity=b'task', in_thread=False):
793
794
794 ZMQStream = zmqstream.ZMQStream
795 ZMQStream = zmqstream.ZMQStream
795
796
796 if config:
797 if config:
797 # unwrap dict back into Config
798 # unwrap dict back into Config
798 config = Config(config)
799 config = Config(config)
799
800
800 if in_thread:
801 if in_thread:
801 # use instance() to get the same Context/Loop as our parent
802 # use instance() to get the same Context/Loop as our parent
802 ctx = zmq.Context.instance()
803 ctx = zmq.Context.instance()
803 loop = ioloop.IOLoop.instance()
804 loop = ioloop.IOLoop.instance()
804 else:
805 else:
805 # in a process, don't use instance()
806 # in a process, don't use instance()
806 # for safety with multiprocessing
807 # for safety with multiprocessing
807 ctx = zmq.Context()
808 ctx = zmq.Context()
808 loop = ioloop.IOLoop()
809 loop = ioloop.IOLoop()
809 ins = ZMQStream(ctx.socket(zmq.ROUTER),loop)
810 ins = ZMQStream(ctx.socket(zmq.ROUTER),loop)
810 util.set_hwm(ins, 0)
811 util.set_hwm(ins, 0)
811 ins.setsockopt(zmq.IDENTITY, identity + b'_in')
812 ins.setsockopt(zmq.IDENTITY, identity + b'_in')
812 ins.bind(in_addr)
813 ins.bind(in_addr)
813
814
814 outs = ZMQStream(ctx.socket(zmq.ROUTER),loop)
815 outs = ZMQStream(ctx.socket(zmq.ROUTER),loop)
815 util.set_hwm(outs, 0)
816 util.set_hwm(outs, 0)
816 outs.setsockopt(zmq.IDENTITY, identity + b'_out')
817 outs.setsockopt(zmq.IDENTITY, identity + b'_out')
817 outs.bind(out_addr)
818 outs.bind(out_addr)
818 mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB),loop)
819 mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB),loop)
819 util.set_hwm(mons, 0)
820 util.set_hwm(mons, 0)
820 mons.connect(mon_addr)
821 mons.connect(mon_addr)
821 nots = zmqstream.ZMQStream(ctx.socket(zmq.SUB),loop)
822 nots = zmqstream.ZMQStream(ctx.socket(zmq.SUB),loop)
822 nots.setsockopt(zmq.SUBSCRIBE, b'')
823 nots.setsockopt(zmq.SUBSCRIBE, b'')
823 nots.connect(not_addr)
824 nots.connect(not_addr)
824
825
825 querys = ZMQStream(ctx.socket(zmq.DEALER),loop)
826 querys = ZMQStream(ctx.socket(zmq.DEALER),loop)
826 querys.connect(reg_addr)
827 querys.connect(reg_addr)
827
828
828 # setup logging.
829 # setup logging.
829 if in_thread:
830 if in_thread:
830 log = Application.instance().log
831 log = Application.instance().log
831 else:
832 else:
832 if log_url:
833 if log_url:
833 log = connect_logger(logname, ctx, log_url, root="scheduler", loglevel=loglevel)
834 log = connect_logger(logname, ctx, log_url, root="scheduler", loglevel=loglevel)
834 else:
835 else:
835 log = local_logger(logname, loglevel)
836 log = local_logger(logname, loglevel)
836
837
837 scheduler = TaskScheduler(client_stream=ins, engine_stream=outs,
838 scheduler = TaskScheduler(client_stream=ins, engine_stream=outs,
838 mon_stream=mons, notifier_stream=nots,
839 mon_stream=mons, notifier_stream=nots,
839 query_stream=querys,
840 query_stream=querys,
840 loop=loop, log=log,
841 loop=loop, log=log,
841 config=config)
842 config=config)
842 scheduler.start()
843 scheduler.start()
843 if not in_thread:
844 if not in_thread:
844 try:
845 try:
845 loop.start()
846 loop.start()
846 except KeyboardInterrupt:
847 except KeyboardInterrupt:
847 scheduler.log.critical("Interrupted, exiting...")
848 scheduler.log.critical("Interrupted, exiting...")
848
849
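Both scheduler hunks above apply the same migration: pyzmq/tornado's deprecated ioloop.DelayedCallback(callback, delay_ms, loop) is replaced with a one-shot IOLoop.add_timeout(deadline, callback), where the deadline is an absolute time in seconds taken from loop.time(). A minimal, self-contained sketch of the pattern (the cleanup function and uid value are illustrative stand-ins, not taken from the scheduler):

    from zmq.eventloop import ioloop

    loop = ioloop.IOLoop.instance()

    def cleanup(uid):
        # stand-in for handle_stranded_tasks(uid)
        print("cleaning up stranded tasks for %r" % uid)
        loop.stop()

    uid = b'engine-0'

    # deprecated style: delay given in milliseconds, needs an explicit start()
    # dc = ioloop.DelayedCallback(lambda: cleanup(uid), 5000, loop)
    # dc.start()

    # replacement: schedule once at an absolute deadline, in seconds
    loop.add_timeout(loop.time() + 5, lambda: cleanup(uid))

    loop.start()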
@@ -1,301 +1,301 b''
1 """A simple engine that talks to a controller over 0MQ.
1 """A simple engine that talks to a controller over 0MQ.
2 It handles registration, etc., and launches a kernel
2 It handles registration, etc., and launches a kernel
3 connected to the Controller's Schedulers.
3 connected to the Controller's Schedulers.
4 """
4 """
5
5
6 # Copyright (c) IPython Development Team.
6 # Copyright (c) IPython Development Team.
7 # Distributed under the terms of the Modified BSD License.
7 # Distributed under the terms of the Modified BSD License.
8
8
9 from __future__ import print_function
9 from __future__ import print_function
10
10
11 import sys
11 import sys
12 import time
12 import time
13 from getpass import getpass
13 from getpass import getpass
14
14
15 import zmq
15 import zmq
16 from zmq.eventloop import ioloop, zmqstream
16 from zmq.eventloop import ioloop, zmqstream
17
17
18 from IPython.utils.localinterfaces import localhost
18 from IPython.utils.localinterfaces import localhost
19 from IPython.utils.traitlets import (
19 from IPython.utils.traitlets import (
20 Instance, Dict, Integer, Type, Float, Unicode, CBytes, Bool
20 Instance, Dict, Integer, Type, Float, Unicode, CBytes, Bool
21 )
21 )
22 from IPython.utils.py3compat import cast_bytes
22 from IPython.utils.py3compat import cast_bytes
23
23
24 from IPython.parallel.controller.heartmonitor import Heart
24 from IPython.parallel.controller.heartmonitor import Heart
25 from IPython.parallel.factory import RegistrationFactory
25 from IPython.parallel.factory import RegistrationFactory
26 from IPython.parallel.util import disambiguate_url
26 from IPython.parallel.util import disambiguate_url
27
27
28 from IPython.kernel.zmq.session import Message
29 from IPython.kernel.zmq.ipkernel import IPythonKernel as Kernel
28 from IPython.kernel.zmq.ipkernel import IPythonKernel as Kernel
30 from IPython.kernel.zmq.kernelapp import IPKernelApp
29 from IPython.kernel.zmq.kernelapp import IPKernelApp
31
30
32 class EngineFactory(RegistrationFactory):
31 class EngineFactory(RegistrationFactory):
33 """IPython engine"""
32 """IPython engine"""
34
33
35 # configurables:
34 # configurables:
36 out_stream_factory=Type('IPython.kernel.zmq.iostream.OutStream', config=True,
35 out_stream_factory=Type('IPython.kernel.zmq.iostream.OutStream', config=True,
37 help="""The OutStream for handling stdout/err.
36 help="""The OutStream for handling stdout/err.
38 Typically 'IPython.kernel.zmq.iostream.OutStream'""")
37 Typically 'IPython.kernel.zmq.iostream.OutStream'""")
39 display_hook_factory=Type('IPython.kernel.zmq.displayhook.ZMQDisplayHook', config=True,
38 display_hook_factory=Type('IPython.kernel.zmq.displayhook.ZMQDisplayHook', config=True,
40 help="""The class for handling displayhook.
39 help="""The class for handling displayhook.
41 Typically 'IPython.kernel.zmq.displayhook.ZMQDisplayHook'""")
40 Typically 'IPython.kernel.zmq.displayhook.ZMQDisplayHook'""")
42 location=Unicode(config=True,
41 location=Unicode(config=True,
43 help="""The location (an IP address) of the controller. This is
42 help="""The location (an IP address) of the controller. This is
44 used for disambiguating URLs, to determine whether
43 used for disambiguating URLs, to determine whether
45 loopback should be used to connect or the public address.""")
44 loopback should be used to connect or the public address.""")
46 timeout=Float(5.0, config=True,
45 timeout=Float(5.0, config=True,
47 help="""The time (in seconds) to wait for the Controller to respond
46 help="""The time (in seconds) to wait for the Controller to respond
48 to registration requests before giving up.""")
47 to registration requests before giving up.""")
49 max_heartbeat_misses=Integer(50, config=True,
48 max_heartbeat_misses=Integer(50, config=True,
50 help="""The maximum number of times a check for the heartbeat ping of a
49 help="""The maximum number of times a check for the heartbeat ping of a
51 controller can be missed before shutting down the engine.
50 controller can be missed before shutting down the engine.
52
51
53 If set to 0, the check is disabled.""")
52 If set to 0, the check is disabled.""")
54 sshserver=Unicode(config=True,
53 sshserver=Unicode(config=True,
55 help="""The SSH server to use for tunneling connections to the Controller.""")
54 help="""The SSH server to use for tunneling connections to the Controller.""")
56 sshkey=Unicode(config=True,
55 sshkey=Unicode(config=True,
57 help="""The SSH private key file to use when tunneling connections to the Controller.""")
56 help="""The SSH private key file to use when tunneling connections to the Controller.""")
58 paramiko=Bool(sys.platform == 'win32', config=True,
57 paramiko=Bool(sys.platform == 'win32', config=True,
59 help="""Whether to use paramiko instead of openssh for tunnels.""")
58 help="""Whether to use paramiko instead of openssh for tunnels.""")
60
59
61 @property
60 @property
62 def tunnel_mod(self):
61 def tunnel_mod(self):
63 from zmq.ssh import tunnel
62 from zmq.ssh import tunnel
64 return tunnel
63 return tunnel
65
64
66
65
67 # not configurable:
66 # not configurable:
68 connection_info = Dict()
67 connection_info = Dict()
69 user_ns = Dict()
68 user_ns = Dict()
70 id = Integer(allow_none=True)
69 id = Integer(allow_none=True)
71 registrar = Instance('zmq.eventloop.zmqstream.ZMQStream')
70 registrar = Instance('zmq.eventloop.zmqstream.ZMQStream')
72 kernel = Instance(Kernel)
71 kernel = Instance(Kernel)
73 hb_check_period=Integer()
72 hb_check_period=Integer()
74
73
75 # States for the heartbeat monitoring
74 # States for the heartbeat monitoring
76 # Initial values for monitored and pinged must satisfy "monitored > pinged == False" so that
75 # Initial values for monitored and pinged must satisfy "monitored > pinged == False" so that
77 # during the first check no "missed" ping is reported. Must be floats for Python 3 compatibility.
76 # during the first check no "missed" ping is reported. Must be floats for Python 3 compatibility.
78 _hb_last_pinged = 0.0
77 _hb_last_pinged = 0.0
79 _hb_last_monitored = 0.0
78 _hb_last_monitored = 0.0
80 _hb_missed_beats = 0
79 _hb_missed_beats = 0
81 # The zmq Stream which receives the pings from the Heart
80 # The zmq Stream which receives the pings from the Heart
82 _hb_listener = None
81 _hb_listener = None
83
82
84 bident = CBytes()
83 bident = CBytes()
85 ident = Unicode()
84 ident = Unicode()
86 def _ident_changed(self, name, old, new):
85 def _ident_changed(self, name, old, new):
87 self.bident = cast_bytes(new)
86 self.bident = cast_bytes(new)
88 using_ssh=Bool(False)
87 using_ssh=Bool(False)
89
88
90
89
91 def __init__(self, **kwargs):
90 def __init__(self, **kwargs):
92 super(EngineFactory, self).__init__(**kwargs)
91 super(EngineFactory, self).__init__(**kwargs)
93 self.ident = self.session.session
92 self.ident = self.session.session
94
93
95 def init_connector(self):
94 def init_connector(self):
96 """construct connection function, which handles tunnels."""
95 """construct connection function, which handles tunnels."""
97 self.using_ssh = bool(self.sshkey or self.sshserver)
96 self.using_ssh = bool(self.sshkey or self.sshserver)
98
97
99 if self.sshkey and not self.sshserver:
98 if self.sshkey and not self.sshserver:
100 # We are using ssh directly to the controller, tunneling localhost to localhost
99 # We are using ssh directly to the controller, tunneling localhost to localhost
101 self.sshserver = self.url.split('://')[1].split(':')[0]
100 self.sshserver = self.url.split('://')[1].split(':')[0]
102
101
103 if self.using_ssh:
102 if self.using_ssh:
104 if self.tunnel_mod.try_passwordless_ssh(self.sshserver, self.sshkey, self.paramiko):
103 if self.tunnel_mod.try_passwordless_ssh(self.sshserver, self.sshkey, self.paramiko):
105 password=False
104 password=False
106 else:
105 else:
107 password = getpass("SSH Password for %s: "%self.sshserver)
106 password = getpass("SSH Password for %s: "%self.sshserver)
108 else:
107 else:
109 password = False
108 password = False
110
109
111 def connect(s, url):
110 def connect(s, url):
112 url = disambiguate_url(url, self.location)
111 url = disambiguate_url(url, self.location)
113 if self.using_ssh:
112 if self.using_ssh:
114 self.log.debug("Tunneling connection to %s via %s", url, self.sshserver)
113 self.log.debug("Tunneling connection to %s via %s", url, self.sshserver)
115 return self.tunnel_mod.tunnel_connection(s, url, self.sshserver,
114 return self.tunnel_mod.tunnel_connection(s, url, self.sshserver,
116 keyfile=self.sshkey, paramiko=self.paramiko,
115 keyfile=self.sshkey, paramiko=self.paramiko,
117 password=password,
116 password=password,
118 )
117 )
119 else:
118 else:
120 return s.connect(url)
119 return s.connect(url)
121
120
122 def maybe_tunnel(url):
121 def maybe_tunnel(url):
123 """like connect, but don't complete the connection (for use by heartbeat)"""
122 """like connect, but don't complete the connection (for use by heartbeat)"""
124 url = disambiguate_url(url, self.location)
123 url = disambiguate_url(url, self.location)
125 if self.using_ssh:
124 if self.using_ssh:
126 self.log.debug("Tunneling connection to %s via %s", url, self.sshserver)
125 self.log.debug("Tunneling connection to %s via %s", url, self.sshserver)
127 url, tunnelobj = self.tunnel_mod.open_tunnel(url, self.sshserver,
126 url, tunnelobj = self.tunnel_mod.open_tunnel(url, self.sshserver,
128 keyfile=self.sshkey, paramiko=self.paramiko,
127 keyfile=self.sshkey, paramiko=self.paramiko,
129 password=password,
128 password=password,
130 )
129 )
131 return str(url)
130 return str(url)
132 return connect, maybe_tunnel
131 return connect, maybe_tunnel
133
132
134 def register(self):
133 def register(self):
135 """send the registration_request"""
134 """send the registration_request"""
136
135
137 self.log.info("Registering with controller at %s"%self.url)
136 self.log.info("Registering with controller at %s"%self.url)
138 ctx = self.context
137 ctx = self.context
139 connect,maybe_tunnel = self.init_connector()
138 connect,maybe_tunnel = self.init_connector()
140 reg = ctx.socket(zmq.DEALER)
139 reg = ctx.socket(zmq.DEALER)
141 reg.setsockopt(zmq.IDENTITY, self.bident)
140 reg.setsockopt(zmq.IDENTITY, self.bident)
142 connect(reg, self.url)
141 connect(reg, self.url)
143 self.registrar = zmqstream.ZMQStream(reg, self.loop)
142 self.registrar = zmqstream.ZMQStream(reg, self.loop)
144
143
145
144
146 content = dict(uuid=self.ident)
145 content = dict(uuid=self.ident)
147 self.registrar.on_recv(lambda msg: self.complete_registration(msg, connect, maybe_tunnel))
146 self.registrar.on_recv(lambda msg: self.complete_registration(msg, connect, maybe_tunnel))
148 # print (self.session.key)
147 # print (self.session.key)
149 self.session.send(self.registrar, "registration_request", content=content)
148 self.session.send(self.registrar, "registration_request", content=content)
150
149
151 def _report_ping(self, msg):
150 def _report_ping(self, msg):
152 """Callback for when the heartmonitor.Heart receives a ping"""
151 """Callback for when the heartmonitor.Heart receives a ping"""
153 #self.log.debug("Received a ping: %s", msg)
152 #self.log.debug("Received a ping: %s", msg)
154 self._hb_last_pinged = time.time()
153 self._hb_last_pinged = time.time()
155
154
156 def complete_registration(self, msg, connect, maybe_tunnel):
155 def complete_registration(self, msg, connect, maybe_tunnel):
157 # print msg
156 # print msg
158 self._abort_dc.stop()
157 self.loop.remove_timeout(self._abort_dc)
159 ctx = self.context
158 ctx = self.context
160 loop = self.loop
159 loop = self.loop
161 identity = self.bident
160 identity = self.bident
162 idents,msg = self.session.feed_identities(msg)
161 idents,msg = self.session.feed_identities(msg)
163 msg = self.session.unserialize(msg)
162 msg = self.session.unserialize(msg)
164 content = msg['content']
163 content = msg['content']
165 info = self.connection_info
164 info = self.connection_info
166
165
167 def url(key):
166 def url(key):
168 """get zmq url for given channel"""
167 """get zmq url for given channel"""
169 return str(info["interface"] + ":%i" % info[key])
168 return str(info["interface"] + ":%i" % info[key])
170
169
171 if content['status'] == 'ok':
170 if content['status'] == 'ok':
172 self.id = int(content['id'])
171 self.id = int(content['id'])
173
172
174 # launch heartbeat
173 # launch heartbeat
175 # possibly forward hb ports with tunnels
174 # possibly forward hb ports with tunnels
176 hb_ping = maybe_tunnel(url('hb_ping'))
175 hb_ping = maybe_tunnel(url('hb_ping'))
177 hb_pong = maybe_tunnel(url('hb_pong'))
176 hb_pong = maybe_tunnel(url('hb_pong'))
178
177
179 hb_monitor = None
178 hb_monitor = None
180 if self.max_heartbeat_misses > 0:
179 if self.max_heartbeat_misses > 0:
181 # Add a monitor socket which will record the last time a ping was seen
180 # Add a monitor socket which will record the last time a ping was seen
182 mon = self.context.socket(zmq.SUB)
181 mon = self.context.socket(zmq.SUB)
183 mport = mon.bind_to_random_port('tcp://%s' % localhost())
182 mport = mon.bind_to_random_port('tcp://%s' % localhost())
184 mon.setsockopt(zmq.SUBSCRIBE, b"")
183 mon.setsockopt(zmq.SUBSCRIBE, b"")
185 self._hb_listener = zmqstream.ZMQStream(mon, self.loop)
184 self._hb_listener = zmqstream.ZMQStream(mon, self.loop)
186 self._hb_listener.on_recv(self._report_ping)
185 self._hb_listener.on_recv(self._report_ping)
187
186
188
187
189 hb_monitor = "tcp://%s:%i" % (localhost(), mport)
188 hb_monitor = "tcp://%s:%i" % (localhost(), mport)
190
189
191 heart = Heart(hb_ping, hb_pong, hb_monitor, heart_id=identity)
190 heart = Heart(hb_ping, hb_pong, hb_monitor, heart_id=identity)
192 heart.start()
191 heart.start()
193
192
194 # create Shell Connections (MUX, Task, etc.):
193 # create Shell Connections (MUX, Task, etc.):
195 shell_addrs = url('mux'), url('task')
194 shell_addrs = url('mux'), url('task')
196
195
197 # Use only one shell stream for mux and tasks
196 # Use only one shell stream for mux and tasks
198 stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop)
197 stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop)
199 stream.setsockopt(zmq.IDENTITY, identity)
198 stream.setsockopt(zmq.IDENTITY, identity)
200 shell_streams = [stream]
199 shell_streams = [stream]
201 for addr in shell_addrs:
200 for addr in shell_addrs:
202 connect(stream, addr)
201 connect(stream, addr)
203
202
204 # control stream:
203 # control stream:
205 control_addr = url('control')
204 control_addr = url('control')
206 control_stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop)
205 control_stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop)
207 control_stream.setsockopt(zmq.IDENTITY, identity)
206 control_stream.setsockopt(zmq.IDENTITY, identity)
208 connect(control_stream, control_addr)
207 connect(control_stream, control_addr)
209
208
210 # create iopub stream:
209 # create iopub stream:
211 iopub_addr = url('iopub')
210 iopub_addr = url('iopub')
212 iopub_socket = ctx.socket(zmq.PUB)
211 iopub_socket = ctx.socket(zmq.PUB)
213 iopub_socket.setsockopt(zmq.IDENTITY, identity)
212 iopub_socket.setsockopt(zmq.IDENTITY, identity)
214 connect(iopub_socket, iopub_addr)
213 connect(iopub_socket, iopub_addr)
215
214
216 # disable history:
215 # disable history:
217 self.config.HistoryManager.hist_file = ':memory:'
216 self.config.HistoryManager.hist_file = ':memory:'
218
217
219 # Redirect input streams and set a display hook.
218 # Redirect input streams and set a display hook.
220 if self.out_stream_factory:
219 if self.out_stream_factory:
221 sys.stdout = self.out_stream_factory(self.session, iopub_socket, u'stdout')
220 sys.stdout = self.out_stream_factory(self.session, iopub_socket, u'stdout')
222 sys.stdout.topic = cast_bytes('engine.%i.stdout' % self.id)
221 sys.stdout.topic = cast_bytes('engine.%i.stdout' % self.id)
223 sys.stderr = self.out_stream_factory(self.session, iopub_socket, u'stderr')
222 sys.stderr = self.out_stream_factory(self.session, iopub_socket, u'stderr')
224 sys.stderr.topic = cast_bytes('engine.%i.stderr' % self.id)
223 sys.stderr.topic = cast_bytes('engine.%i.stderr' % self.id)
225 if self.display_hook_factory:
224 if self.display_hook_factory:
226 sys.displayhook = self.display_hook_factory(self.session, iopub_socket)
225 sys.displayhook = self.display_hook_factory(self.session, iopub_socket)
227 sys.displayhook.topic = cast_bytes('engine.%i.execute_result' % self.id)
226 sys.displayhook.topic = cast_bytes('engine.%i.execute_result' % self.id)
228
227
229 self.kernel = Kernel(parent=self, int_id=self.id, ident=self.ident, session=self.session,
228 self.kernel = Kernel(parent=self, int_id=self.id, ident=self.ident, session=self.session,
230 control_stream=control_stream, shell_streams=shell_streams, iopub_socket=iopub_socket,
229 control_stream=control_stream, shell_streams=shell_streams, iopub_socket=iopub_socket,
231 loop=loop, user_ns=self.user_ns, log=self.log)
230 loop=loop, user_ns=self.user_ns, log=self.log)
232
231
233 self.kernel.shell.display_pub.topic = cast_bytes('engine.%i.displaypub' % self.id)
232 self.kernel.shell.display_pub.topic = cast_bytes('engine.%i.displaypub' % self.id)
234
233
235
234
236 # periodically check the heartbeat pings of the controller
235 # periodically check the heartbeat pings of the controller
237 # Should be started here and not in "start()" so that the right period can be taken
236 # Should be started here and not in "start()" so that the right period can be taken
238 # from the hubs HeartBeatMonitor.period
237 # from the hubs HeartBeatMonitor.period
239 if self.max_heartbeat_misses > 0:
238 if self.max_heartbeat_misses > 0:
240 # Use a slightly bigger check period than the hub signal period to avoid unnecessary warnings
239 # Use a slightly bigger check period than the hub signal period to avoid unnecessary warnings
241 self.hb_check_period = int(content['hb_period'])+10
240 self.hb_check_period = int(content['hb_period'])+10
242 self.log.info("Starting to monitor the heartbeat signal from the hub every %i ms." , self.hb_check_period)
241 self.log.info("Starting to monitor the heartbeat signal from the hub every %i ms." , self.hb_check_period)
243 self._hb_reporter = ioloop.PeriodicCallback(self._hb_monitor, self.hb_check_period, self.loop)
242 self._hb_reporter = ioloop.PeriodicCallback(self._hb_monitor, self.hb_check_period, self.loop)
244 self._hb_reporter.start()
243 self._hb_reporter.start()
245 else:
244 else:
246 self.log.info("Monitoring of the heartbeat signal from the hub is not enabled.")
245 self.log.info("Monitoring of the heartbeat signal from the hub is not enabled.")
247
246
248
247
249 # FIXME: This is a hack until IPKernelApp and IPEngineApp can be fully merged
248 # FIXME: This is a hack until IPKernelApp and IPEngineApp can be fully merged
250 app = IPKernelApp(parent=self, shell=self.kernel.shell, kernel=self.kernel, log=self.log)
249 app = IPKernelApp(parent=self, shell=self.kernel.shell, kernel=self.kernel, log=self.log)
251 app.init_profile_dir()
250 app.init_profile_dir()
252 app.init_code()
251 app.init_code()
253
252
254 self.kernel.start()
253 self.kernel.start()
255 else:
254 else:
256 self.log.fatal("Registration Failed: %s"%msg)
255 self.log.fatal("Registration Failed: %s"%msg)
257 raise Exception("Registration Failed: %s"%msg)
256 raise Exception("Registration Failed: %s"%msg)
258
257
259 self.log.info("Completed registration with id %i"%self.id)
258 self.log.info("Completed registration with id %i"%self.id)
260
259
261
260
262 def abort(self):
261 def abort(self):
263 self.log.fatal("Registration timed out after %.1f seconds"%self.timeout)
262 self.log.fatal("Registration timed out after %.1f seconds"%self.timeout)
264 if self.url.startswith('127.'):
263 if self.url.startswith('127.'):
265 self.log.fatal("""
264 self.log.fatal("""
266 If the controller and engines are not on the same machine,
265 If the controller and engines are not on the same machine,
267 you will have to instruct the controller to listen on an external IP (in ipcontroller_config.py):
266 you will have to instruct the controller to listen on an external IP (in ipcontroller_config.py):
268 c.HubFactory.ip='*' # for all interfaces, internal and external
267 c.HubFactory.ip='*' # for all interfaces, internal and external
269 c.HubFactory.ip='192.168.1.101' # or any interface that the engines can see
268 c.HubFactory.ip='192.168.1.101' # or any interface that the engines can see
270 or tunnel connections via ssh.
269 or tunnel connections via ssh.
271 """)
270 """)
272 self.session.send(self.registrar, "unregistration_request", content=dict(id=self.id))
271 self.session.send(self.registrar, "unregistration_request", content=dict(id=self.id))
273 time.sleep(1)
272 time.sleep(1)
274 sys.exit(255)
273 sys.exit(255)
275
274
276 def _hb_monitor(self):
275 def _hb_monitor(self):
277 """Callback to monitor the heartbeat from the controller"""
276 """Callback to monitor the heartbeat from the controller"""
278 self._hb_listener.flush()
277 self._hb_listener.flush()
279 if self._hb_last_monitored > self._hb_last_pinged:
278 if self._hb_last_monitored > self._hb_last_pinged:
280 self._hb_missed_beats += 1
279 self._hb_missed_beats += 1
281 self.log.warn("No heartbeat in the last %s ms (%s time(s) in a row).", self.hb_check_period, self._hb_missed_beats)
280 self.log.warn("No heartbeat in the last %s ms (%s time(s) in a row).", self.hb_check_period, self._hb_missed_beats)
282 else:
281 else:
283 #self.log.debug("Heartbeat received (after missing %s beats).", self._hb_missed_beats)
282 #self.log.debug("Heartbeat received (after missing %s beats).", self._hb_missed_beats)
284 self._hb_missed_beats = 0
283 self._hb_missed_beats = 0
285
284
286 if self._hb_missed_beats >= self.max_heartbeat_misses:
285 if self._hb_missed_beats >= self.max_heartbeat_misses:
287 self.log.fatal("Maximum number of heartbeats misses reached (%s times %s ms), shutting down.",
286 self.log.fatal("Maximum number of heartbeats misses reached (%s times %s ms), shutting down.",
288 self.max_heartbeat_misses, self.hb_check_period)
287 self.max_heartbeat_misses, self.hb_check_period)
289 self.session.send(self.registrar, "unregistration_request", content=dict(id=self.id))
288 self.session.send(self.registrar, "unregistration_request", content=dict(id=self.id))
290 self.loop.stop()
289 self.loop.stop()
291
290
292 self._hb_last_monitored = time.time()
291 self._hb_last_monitored = time.time()
293
292
294
293
295 def start(self):
294 def start(self):
296 dc = ioloop.DelayedCallback(self.register, 0, self.loop)
295 loop = self.loop
297 dc.start()
296 def _start():
298 self._abort_dc = ioloop.DelayedCallback(self.abort, self.timeout*1000, self.loop)
297 self.register()
299 self._abort_dc.start()
298 self._abort_dc = loop.add_timeout(loop.time() + self.timeout, self.abort)
299 self.loop.add_callback(_start)
300
300
301
301
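The engine-side change is the same migration with one extra piece of bookkeeping: tornado's remove_timeout() takes the opaque handle returned by add_timeout(), so the handle must be stored (here in self._abort_dc) where DelayedCallback previously exposed a stop() method. A toy sketch of that shape, not the engine code itself (the Registrar class and its prints are illustrative):

    from tornado import ioloop

    class Registrar(object):
        """Toy stand-in for EngineFactory's registration timeout handling."""
        timeout = 5.0

        def __init__(self, loop):
            self.loop = loop
            self._abort_dc = None

        def start(self):
            # defer registration until the loop is actually running
            self.loop.add_callback(self._start)

        def _start(self):
            self.register()
            # keep the handle so a successful registration can cancel the abort
            self._abort_dc = self.loop.add_timeout(
                self.loop.time() + self.timeout, self.abort)

        def register(self):
            print("registration_request sent")

        def complete_registration(self):
            # replaces the old self._abort_dc.stop()
            self.loop.remove_timeout(self._abort_dc)
            print("registered")

        def abort(self):
            print("registration timed out")
            self.loop.stop()

    loop = ioloop.IOLoop.instance()
    Registrar(loop).start()
    loop.start()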