##// END OF EJS Templates
adjustments to PBS/SGE, SSH Launchers + docs update
MinRK -
Show More
@@ -1,192 +1,227 b''
1 import os
1 import os
2
2
3 c = get_config()
3 c = get_config()
4
4
5 #-----------------------------------------------------------------------------
5 #-----------------------------------------------------------------------------
6 # Select which launchers to use
6 # Select which launchers to use
7 #-----------------------------------------------------------------------------
7 #-----------------------------------------------------------------------------
8
8
9 # This allows you to control what method is used to start the controller
9 # This allows you to control what method is used to start the controller
10 # and engines. The following methods are currently supported:
10 # and engines. The following methods are currently supported:
11 # - Start as a regular process on localhost.
11 # - Start as a regular process on localhost.
12 # - Start using mpiexec.
12 # - Start using mpiexec.
13 # - Start using the Windows HPC Server 2008 scheduler
13 # - Start using the Windows HPC Server 2008 scheduler
14 # - Start using PBS
14 # - Start using PBS
15 # - Start using SSH
15 # - Start using SSH
16
16
17
17
18 # The selected launchers can be configured below.
18 # The selected launchers can be configured below.
19
19
20 # Options are:
20 # Options are:
21 # - LocalControllerLauncher
21 # - LocalControllerLauncher
22 # - MPIExecControllerLauncher
22 # - MPIExecControllerLauncher
23 # - PBSControllerLauncher
23 # - PBSControllerLauncher
24 # - WindowsHPCControllerLauncher
24 # - WindowsHPCControllerLauncher
25 # c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.LocalControllerLauncher'
25 # c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.LocalControllerLauncher'
26
26
27 # Options are:
27 # Options are:
28 # - LocalEngineSetLauncher
28 # - LocalEngineSetLauncher
29 # - MPIExecEngineSetLauncher
29 # - MPIExecEngineSetLauncher
30 # - PBSEngineSetLauncher
30 # - PBSEngineSetLauncher
31 # - WindowsHPCEngineSetLauncher
31 # - WindowsHPCEngineSetLauncher
32 # c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.LocalEngineSetLauncher'
32 # c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.LocalEngineSetLauncher'
33
33
34 #-----------------------------------------------------------------------------
34 #-----------------------------------------------------------------------------
35 # Global configuration
35 # Global configuration
36 #-----------------------------------------------------------------------------
36 #-----------------------------------------------------------------------------
37
37
38 # The default number of engines that will be started. This is overridden by
38 # The default number of engines that will be started. This is overridden by
39 # the -n command line option: "ipcluster start -n 4"
39 # the -n command line option: "ipcluster start -n 4"
40 # c.Global.n = 2
40 # c.Global.n = 2
41
41
42 # Log to a file in cluster_dir/log, otherwise just log to sys.stdout.
42 # Log to a file in cluster_dir/log, otherwise just log to sys.stdout.
43 # c.Global.log_to_file = False
43 # c.Global.log_to_file = False
44
44
45 # Remove old logs from cluster_dir/log before starting.
45 # Remove old logs from cluster_dir/log before starting.
46 # c.Global.clean_logs = True
46 # c.Global.clean_logs = True
47
47
48 # The working directory for the process. The application will use os.chdir
48 # The working directory for the process. The application will use os.chdir
49 # to change to this directory before starting.
49 # to change to this directory before starting.
50 # c.Global.work_dir = os.getcwd()
50 # c.Global.work_dir = os.getcwd()
51
51
52
52
53 #-----------------------------------------------------------------------------
53 #-----------------------------------------------------------------------------
54 # Local process launchers
54 # Local process launchers
55 #-----------------------------------------------------------------------------
55 #-----------------------------------------------------------------------------
56
56
57 # The command line arguments to call the controller with.
57 # The command line arguments to call the controller with.
58 # c.LocalControllerLauncher.controller_args = \
58 # c.LocalControllerLauncher.controller_args = \
59 # ['--log-to-file','--log-level', '40']
59 # ['--log-to-file','--log-level', '40']
60
60
61 # The working directory for the controller
61 # The working directory for the controller
62 # c.LocalEngineSetLauncher.work_dir = u''
62 # c.LocalEngineSetLauncher.work_dir = u''
63
63
64 # Command line argument passed to the engines.
64 # Command line argument passed to the engines.
65 # c.LocalEngineSetLauncher.engine_args = ['--log-to-file','--log-level', '40']
65 # c.LocalEngineSetLauncher.engine_args = ['--log-to-file','--log-level', '40']
66
66
67 #-----------------------------------------------------------------------------
67 #-----------------------------------------------------------------------------
68 # MPIExec launchers
68 # MPIExec launchers
69 #-----------------------------------------------------------------------------
69 #-----------------------------------------------------------------------------
70
70
71 # The mpiexec/mpirun command to use in started the controller.
71 # The mpiexec/mpirun command to use in both the controller and engines.
72 # c.MPIExecControllerLauncher.mpi_cmd = ['mpiexec']
72 # c.MPIExecLauncher.mpi_cmd = ['mpiexec']
73
73
74 # Additional arguments to pass to the actual mpiexec command.
74 # Additional arguments to pass to the actual mpiexec command.
75 # c.MPIExecLauncher.mpi_args = []
76
77 # The mpiexec/mpirun command and args can be overridden if they should be different
78 # for controller and engines.
79 # c.MPIExecControllerLauncher.mpi_cmd = ['mpiexec']
75 # c.MPIExecControllerLauncher.mpi_args = []
80 # c.MPIExecControllerLauncher.mpi_args = []
81 # c.MPIExecEngineSetLauncher.mpi_cmd = ['mpiexec']
82 # c.MPIExecEngineSetLauncher.mpi_args = []
76
83
77 # The command line argument to call the controller with.
84 # The command line argument to call the controller with.
78 # c.MPIExecControllerLauncher.controller_args = \
85 # c.MPIExecControllerLauncher.controller_args = \
79 # ['--log-to-file','--log-level', '40']
86 # ['--log-to-file','--log-level', '40']
80
87
81
82 # The mpiexec/mpirun command to use in started the controller.
83 # c.MPIExecEngineSetLauncher.mpi_cmd = ['mpiexec']
84
85 # Additional arguments to pass to the actual mpiexec command.
86 # c.MPIExecEngineSetLauncher.mpi_args = []
87
88 # Command line argument passed to the engines.
88 # Command line argument passed to the engines.
89 # c.MPIExecEngineSetLauncher.engine_args = ['--log-to-file','--log-level', '40']
89 # c.MPIExecEngineSetLauncher.engine_args = ['--log-to-file','--log-level', '40']
90
90
91 # The default number of engines to start if not given elsewhere.
91 # The default number of engines to start if not given elsewhere.
92 # c.MPIExecEngineSetLauncher.n = 1
92 # c.MPIExecEngineSetLauncher.n = 1
93
93
94 #-----------------------------------------------------------------------------
94 #-----------------------------------------------------------------------------
95 # SSH launchers
95 # SSH launchers
96 #-----------------------------------------------------------------------------
96 #-----------------------------------------------------------------------------
97
97
98 # Todo
98 # ipclusterz can be used to launch controller and engines remotely via ssh.
99 # Note that currently ipclusterz does not do any file distribution, so if
100 # machines are not on a shared filesystem, config and json files must be
101 # distributed. For this reason, the reuse_files defaults to True on an
102 # ssh-launched Controller. This flag can be overridded by the program_args
103 # attribute of c.SSHControllerLauncher.
104
105 # set the ssh cmd for launching remote commands. The default is ['ssh']
106 # c.SSHLauncher.ssh_cmd = ['ssh']
107
108 # set the ssh cmd for launching remote commands. The default is ['ssh']
109 # c.SSHLauncher.ssh_args = ['tt']
110
111 # Set the user and hostname for the controller
112 # c.SSHControllerLauncher.hostname = 'controller.example.com'
113 # c.SSHControllerLauncher.user = os.environ.get('USER','username')
99
114
115 # Set the arguments to be passed to ipcontrollerz
116 # note that remotely launched ipcontrollerz will not get the contents of
117 # the local ipcontrollerz_config.py unless it resides on the *remote host*
118 # in the location specified by the --cluster_dir argument.
119 # c.SSHControllerLauncher.program_args = ['-r', '-ip', '0.0.0.0', '--cluster_dir', '/path/to/cd']
120
121 # Set the default args passed to ipenginez for SSH launched engines
122 # c.SSHEngineSetLauncher.engine_args = ['--mpi', 'mpi4py']
123
124 # SSH engines are launched as a dict of locations/n-engines.
125 # if a value is a tuple instead of an int, it is assumed to be of the form
126 # (n, [args]), setting the arguments to passed to ipenginez on `host`.
127 # otherwise, c.SSHEngineSetLauncher.engine_args will be used as the default.
128
129 # In this case, there will be 3 engines at my.example.com, and
130 # 2 at you@ipython.scipy.org with a special json connector location.
131 # c.SSHEngineSetLauncher.engines = {'my.example.com' : 3,
132 # 'you@ipython.scipy.org' : (2, ['-f', '/path/to/ipcontroller-engine.json']}
133 # }
100
134
101 #-----------------------------------------------------------------------------
135 #-----------------------------------------------------------------------------
102 # Unix batch (PBS) schedulers launchers
136 # Unix batch (PBS) schedulers launchers
103 #-----------------------------------------------------------------------------
137 #-----------------------------------------------------------------------------
104
138
105 # The command line program to use to submit a PBS job.
139 # The command line program to use to submit a PBS job.
106 # c.PBSControllerLauncher.submit_command = 'qsub'
140 # c.PBSControllerLauncher.submit_command = ['qsub']
107
141
108 # The command line program to use to delete a PBS job.
142 # The command line program to use to delete a PBS job.
109 # c.PBSControllerLauncher.delete_command = 'qdel'
143 # c.PBSControllerLauncher.delete_command = ['qdel']
110
144
111 # A regular expression that takes the output of qsub and find the job id.
145 # A regular expression that takes the output of qsub and find the job id.
112 # c.PBSControllerLauncher.job_id_regexp = r'\d+'
146 # c.PBSControllerLauncher.job_id_regexp = r'\d+'
113
147
114 # The batch submission script used to start the controller. This is where
148 # The batch submission script used to start the controller. This is where
115 # environment variables would be setup, etc. This string is interpolated using
149 # environment variables would be setup, etc. This string is interpreted using
116 # the Itpl module in IPython.external. Basically, you can use ${n} for the
150 # the Itpl module in IPython.external. Basically, you can use ${n} for the
117 # number of engine and ${cluster_dir} for the cluster_dir.
151 # number of engine and ${cluster_dir} for the cluster_dir.
118 # c.PBSControllerLauncher.batch_template = """
152 # c.PBSControllerLauncher.batch_template = """
119 # #PBS -l nprocs=$n
153 # #PBS -N ipcontroller
120 #
154 #
121 # ipcontrollerz --cluster-dir $cluster_dir
155 # ipcontrollerz --cluster-dir $cluster_dir
122 # """
156 # """
123
157
124 # The name of the instantiated batch script that will actually be used to
158 # The name of the instantiated batch script that will actually be used to
125 # submit the job. This will be written to the cluster directory.
159 # submit the job. This will be written to the cluster directory.
126 # c.PBSControllerLauncher.batch_file_name = u'pbs_batch_script_controller'
160 # c.PBSControllerLauncher.batch_file_name = u'pbs_batch_script_controller'
127
161
128
162
129 # The command line program to use to submit a PBS job.
163 # The command line program to use to submit a PBS job.
130 # c.PBSEngineSetLauncher.submit_command = 'qsub'
164 # c.PBSEngineSetLauncher.submit_command = 'qsub'
131
165
132 # The command line program to use to delete a PBS job.
166 # The command line program to use to delete a PBS job.
133 # c.PBSEngineSetLauncher.delete_command = 'qdel'
167 # c.PBSEngineSetLauncher.delete_command = 'qdel'
134
168
135 # A regular expression that takes the output of qsub and find the job id.
169 # A regular expression that takes the output of qsub and find the job id.
136 # c.PBSEngineSetLauncher.job_id_regexp = r'\d+'
170 # c.PBSEngineSetLauncher.job_id_regexp = r'\d+'
137
171
138 # The batch submission script used to start the engines. This is where
172 # The batch submission script used to start the engines. This is where
139 # environment variables would be setup, etc. This string is interpolated using
173 # environment variables would be setup, etc. This string is interpreted using
140 # the Itpl module in IPython.external. Basically, you can use ${n} for the
174 # the Itpl module in IPython.external. Basically, you can use ${n} for the
141 # number of engine and ${cluster_dir} for the cluster_dir.
175 # number of engine and ${cluster_dir} for the cluster_dir.
142 # c.PBSEngineSetLauncher.batch_template = """
176 # c.PBSEngineSetLauncher.batch_template = """
177 # #PBS -N ipcontroller
143 # #PBS -l nprocs=$n
178 # #PBS -l nprocs=$n
144 #
179 #
145 # ipenginez --cluster-dir $cluster_dir$s
180 # ipenginez --cluster-dir $cluster_dir$s
146 # """
181 # """
147
182
148 # The name of the instantiated batch script that will actually be used to
183 # The name of the instantiated batch script that will actually be used to
149 # submit the job. This will be written to the cluster directory.
184 # submit the job. This will be written to the cluster directory.
150 # c.PBSEngineSetLauncher.batch_file_name = u'pbs_batch_script_engines'
185 # c.PBSEngineSetLauncher.batch_file_name = u'pbs_batch_script_engines'
151
186
152 #-----------------------------------------------------------------------------
187 #-----------------------------------------------------------------------------
153 # Windows HPC Server 2008 launcher configuration
188 # Windows HPC Server 2008 launcher configuration
154 #-----------------------------------------------------------------------------
189 #-----------------------------------------------------------------------------
155
190
156 # c.IPControllerJob.job_name = 'IPController'
191 # c.IPControllerJob.job_name = 'IPController'
157 # c.IPControllerJob.is_exclusive = False
192 # c.IPControllerJob.is_exclusive = False
158 # c.IPControllerJob.username = r'USERDOMAIN\USERNAME'
193 # c.IPControllerJob.username = r'USERDOMAIN\USERNAME'
159 # c.IPControllerJob.priority = 'Highest'
194 # c.IPControllerJob.priority = 'Highest'
160 # c.IPControllerJob.requested_nodes = ''
195 # c.IPControllerJob.requested_nodes = ''
161 # c.IPControllerJob.project = 'MyProject'
196 # c.IPControllerJob.project = 'MyProject'
162
197
163 # c.IPControllerTask.task_name = 'IPController'
198 # c.IPControllerTask.task_name = 'IPController'
164 # c.IPControllerTask.controller_cmd = [u'ipcontroller.exe']
199 # c.IPControllerTask.controller_cmd = [u'ipcontroller.exe']
165 # c.IPControllerTask.controller_args = ['--log-to-file', '--log-level', '40']
200 # c.IPControllerTask.controller_args = ['--log-to-file', '--log-level', '40']
166 # c.IPControllerTask.environment_variables = {}
201 # c.IPControllerTask.environment_variables = {}
167
202
168 # c.WindowsHPCControllerLauncher.scheduler = 'HEADNODE'
203 # c.WindowsHPCControllerLauncher.scheduler = 'HEADNODE'
169 # c.WindowsHPCControllerLauncher.job_file_name = u'ipcontroller_job.xml'
204 # c.WindowsHPCControllerLauncher.job_file_name = u'ipcontroller_job.xml'
170
205
171
206
172 # c.IPEngineSetJob.job_name = 'IPEngineSet'
207 # c.IPEngineSetJob.job_name = 'IPEngineSet'
173 # c.IPEngineSetJob.is_exclusive = False
208 # c.IPEngineSetJob.is_exclusive = False
174 # c.IPEngineSetJob.username = r'USERDOMAIN\USERNAME'
209 # c.IPEngineSetJob.username = r'USERDOMAIN\USERNAME'
175 # c.IPEngineSetJob.priority = 'Highest'
210 # c.IPEngineSetJob.priority = 'Highest'
176 # c.IPEngineSetJob.requested_nodes = ''
211 # c.IPEngineSetJob.requested_nodes = ''
177 # c.IPEngineSetJob.project = 'MyProject'
212 # c.IPEngineSetJob.project = 'MyProject'
178
213
179 # c.IPEngineTask.task_name = 'IPEngine'
214 # c.IPEngineTask.task_name = 'IPEngine'
180 # c.IPEngineTask.engine_cmd = [u'ipengine.exe']
215 # c.IPEngineTask.engine_cmd = [u'ipengine.exe']
181 # c.IPEngineTask.engine_args = ['--log-to-file', '--log-level', '40']
216 # c.IPEngineTask.engine_args = ['--log-to-file', '--log-level', '40']
182 # c.IPEngineTask.environment_variables = {}
217 # c.IPEngineTask.environment_variables = {}
183
218
184 # c.WindowsHPCEngineSetLauncher.scheduler = 'HEADNODE'
219 # c.WindowsHPCEngineSetLauncher.scheduler = 'HEADNODE'
185 # c.WindowsHPCEngineSetLauncher.job_file_name = u'ipengineset_job.xml'
220 # c.WindowsHPCEngineSetLauncher.job_file_name = u'ipengineset_job.xml'
186
221
187
222
188
223
189
224
190
225
191
226
192
227
@@ -1,18 +1,18 b''
1 """The IPython ZMQ-based parallel computing interface."""
1 """The IPython ZMQ-based parallel computing interface."""
2 #-----------------------------------------------------------------------------
2 #-----------------------------------------------------------------------------
3 # Copyright (C) 2011 The IPython Development Team
3 # Copyright (C) 2011 The IPython Development Team
4 #
4 #
5 # Distributed under the terms of the BSD License. The full license is in
5 # Distributed under the terms of the BSD License. The full license is in
6 # the file COPYING, distributed as part of this software.
6 # the file COPYING, distributed as part of this software.
7 #-----------------------------------------------------------------------------
7 #-----------------------------------------------------------------------------
8
8
9 #-----------------------------------------------------------------------------
9 #-----------------------------------------------------------------------------
10 # Imports
10 # Imports
11 #-----------------------------------------------------------------------------
11 #-----------------------------------------------------------------------------
12
12
13 from .asyncresult import *
13 # from .asyncresult import *
14 from .client import Client
14 # from .client import Client
15 from .dependency import *
15 # from .dependency import *
16 from .remotefunction import *
16 # from .remotefunction import *
17 from .view import *
17 # from .view import *
18
18
@@ -1,847 +1,879 b''
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 # encoding: utf-8
2 # encoding: utf-8
3 """
3 """
4 Facilities for launching IPython processes asynchronously.
4 Facilities for launching IPython processes asynchronously.
5 """
5 """
6
6
7 #-----------------------------------------------------------------------------
7 #-----------------------------------------------------------------------------
8 # Copyright (C) 2008-2009 The IPython Development Team
8 # Copyright (C) 2008-2009 The IPython Development Team
9 #
9 #
10 # Distributed under the terms of the BSD License. The full license is in
10 # Distributed under the terms of the BSD License. The full license is in
11 # the file COPYING, distributed as part of this software.
11 # the file COPYING, distributed as part of this software.
12 #-----------------------------------------------------------------------------
12 #-----------------------------------------------------------------------------
13
13
14 #-----------------------------------------------------------------------------
14 #-----------------------------------------------------------------------------
15 # Imports
15 # Imports
16 #-----------------------------------------------------------------------------
16 #-----------------------------------------------------------------------------
17
17
18 import copy
18 import logging
19 import logging
19 import os
20 import os
20 import re
21 import re
21 import sys
22
22
23 from signal import SIGINT, SIGTERM
23 from signal import SIGINT, SIGTERM
24 try:
24 try:
25 from signal import SIGKILL
25 from signal import SIGKILL
26 except ImportError:
26 except ImportError:
27 SIGKILL=SIGTERM
27 SIGKILL=SIGTERM
28
28
29 from subprocess import Popen, PIPE, STDOUT
29 from subprocess import Popen, PIPE, STDOUT
30 try:
30 try:
31 from subprocess import check_output
31 from subprocess import check_output
32 except ImportError:
32 except ImportError:
33 # pre-2.7:
33 # pre-2.7, define check_output with Popen
34 from StringIO import StringIO
35
36 def check_output(*args, **kwargs):
34 def check_output(*args, **kwargs):
37 sio = StringIO()
35 kwargs.update(dict(stdout=PIPE))
38 kwargs.update(dict(stdout=PIPE, stderr=STDOUT))
39 p = Popen(*args, **kwargs)
36 p = Popen(*args, **kwargs)
40 out,err = p.communicate()
37 out,err = p.communicate()
41 return out
38 return out
42
39
43 from zmq.eventloop import ioloop
40 from zmq.eventloop import ioloop
44
41
45 from IPython.external import Itpl
42 from IPython.external import Itpl
46 # from IPython.config.configurable import Configurable
43 # from IPython.config.configurable import Configurable
47 from IPython.utils.traitlets import Str, Int, List, Unicode, Dict, Instance
44 from IPython.utils.traitlets import Any, Str, Int, List, Unicode, Dict, Instance
48 from IPython.utils.path import get_ipython_module_path
45 from IPython.utils.path import get_ipython_module_path
49 from IPython.utils.process import find_cmd, pycmd2argv, FindCmdError
46 from IPython.utils.process import find_cmd, pycmd2argv, FindCmdError
50
47
51 from .factory import LoggingFactory
48 from .factory import LoggingFactory
52
49
53 # load winhpcjob from IPython.kernel
50 # load winhpcjob from IPython.kernel
54 try:
51 try:
55 from IPython.kernel.winhpcjob import (
52 from IPython.kernel.winhpcjob import (
56 IPControllerTask, IPEngineTask,
53 IPControllerTask, IPEngineTask,
57 IPControllerJob, IPEngineSetJob
54 IPControllerJob, IPEngineSetJob
58 )
55 )
59 except ImportError:
56 except ImportError:
60 pass
57 pass
61
58
62
59
63 #-----------------------------------------------------------------------------
60 #-----------------------------------------------------------------------------
64 # Paths to the kernel apps
61 # Paths to the kernel apps
65 #-----------------------------------------------------------------------------
62 #-----------------------------------------------------------------------------
66
63
67
64
68 ipcluster_cmd_argv = pycmd2argv(get_ipython_module_path(
65 ipcluster_cmd_argv = pycmd2argv(get_ipython_module_path(
69 'IPython.zmq.parallel.ipclusterapp'
66 'IPython.zmq.parallel.ipclusterapp'
70 ))
67 ))
71
68
72 ipengine_cmd_argv = pycmd2argv(get_ipython_module_path(
69 ipengine_cmd_argv = pycmd2argv(get_ipython_module_path(
73 'IPython.zmq.parallel.ipengineapp'
70 'IPython.zmq.parallel.ipengineapp'
74 ))
71 ))
75
72
76 ipcontroller_cmd_argv = pycmd2argv(get_ipython_module_path(
73 ipcontroller_cmd_argv = pycmd2argv(get_ipython_module_path(
77 'IPython.zmq.parallel.ipcontrollerapp'
74 'IPython.zmq.parallel.ipcontrollerapp'
78 ))
75 ))
79
76
80 #-----------------------------------------------------------------------------
77 #-----------------------------------------------------------------------------
81 # Base launchers and errors
78 # Base launchers and errors
82 #-----------------------------------------------------------------------------
79 #-----------------------------------------------------------------------------
83
80
84
81
85 class LauncherError(Exception):
82 class LauncherError(Exception):
86 pass
83 pass
87
84
88
85
89 class ProcessStateError(LauncherError):
86 class ProcessStateError(LauncherError):
90 pass
87 pass
91
88
92
89
93 class UnknownStatus(LauncherError):
90 class UnknownStatus(LauncherError):
94 pass
91 pass
95
92
96
93
97 class BaseLauncher(LoggingFactory):
94 class BaseLauncher(LoggingFactory):
98 """An asbtraction for starting, stopping and signaling a process."""
95 """An asbtraction for starting, stopping and signaling a process."""
99
96
100 # In all of the launchers, the work_dir is where child processes will be
97 # In all of the launchers, the work_dir is where child processes will be
101 # run. This will usually be the cluster_dir, but may not be. any work_dir
98 # run. This will usually be the cluster_dir, but may not be. any work_dir
102 # passed into the __init__ method will override the config value.
99 # passed into the __init__ method will override the config value.
103 # This should not be used to set the work_dir for the actual engine
100 # This should not be used to set the work_dir for the actual engine
104 # and controller. Instead, use their own config files or the
101 # and controller. Instead, use their own config files or the
105 # controller_args, engine_args attributes of the launchers to add
102 # controller_args, engine_args attributes of the launchers to add
106 # the --work-dir option.
103 # the --work-dir option.
107 work_dir = Unicode(u'.')
104 work_dir = Unicode(u'.')
108 loop = Instance('zmq.eventloop.ioloop.IOLoop')
105 loop = Instance('zmq.eventloop.ioloop.IOLoop')
106
107 start_data = Any()
108 stop_data = Any()
109
109 def _loop_default(self):
110 def _loop_default(self):
110 return ioloop.IOLoop.instance()
111 return ioloop.IOLoop.instance()
111
112
112 def __init__(self, work_dir=u'.', config=None, **kwargs):
113 def __init__(self, work_dir=u'.', config=None, **kwargs):
113 super(BaseLauncher, self).__init__(work_dir=work_dir, config=config, **kwargs)
114 super(BaseLauncher, self).__init__(work_dir=work_dir, config=config, **kwargs)
114 self.state = 'before' # can be before, running, after
115 self.state = 'before' # can be before, running, after
115 self.stop_callbacks = []
116 self.stop_callbacks = []
116 self.start_data = None
117 self.start_data = None
117 self.stop_data = None
118 self.stop_data = None
118
119
119 @property
120 @property
120 def args(self):
121 def args(self):
121 """A list of cmd and args that will be used to start the process.
122 """A list of cmd and args that will be used to start the process.
122
123
123 This is what is passed to :func:`spawnProcess` and the first element
124 This is what is passed to :func:`spawnProcess` and the first element
124 will be the process name.
125 will be the process name.
125 """
126 """
126 return self.find_args()
127 return self.find_args()
127
128
128 def find_args(self):
129 def find_args(self):
129 """The ``.args`` property calls this to find the args list.
130 """The ``.args`` property calls this to find the args list.
130
131
131 Subcommand should implement this to construct the cmd and args.
132 Subcommand should implement this to construct the cmd and args.
132 """
133 """
133 raise NotImplementedError('find_args must be implemented in a subclass')
134 raise NotImplementedError('find_args must be implemented in a subclass')
134
135
135 @property
136 @property
136 def arg_str(self):
137 def arg_str(self):
137 """The string form of the program arguments."""
138 """The string form of the program arguments."""
138 return ' '.join(self.args)
139 return ' '.join(self.args)
139
140
140 @property
141 @property
141 def running(self):
142 def running(self):
142 """Am I running."""
143 """Am I running."""
143 if self.state == 'running':
144 if self.state == 'running':
144 return True
145 return True
145 else:
146 else:
146 return False
147 return False
147
148
148 def start(self):
149 def start(self):
149 """Start the process.
150 """Start the process.
150
151
151 This must return a deferred that fires with information about the
152 This must return a deferred that fires with information about the
152 process starting (like a pid, job id, etc.).
153 process starting (like a pid, job id, etc.).
153 """
154 """
154 raise NotImplementedError('start must be implemented in a subclass')
155 raise NotImplementedError('start must be implemented in a subclass')
155
156
156 def stop(self):
157 def stop(self):
157 """Stop the process and notify observers of stopping.
158 """Stop the process and notify observers of stopping.
158
159
159 This must return a deferred that fires with information about the
160 This must return a deferred that fires with information about the
160 processing stopping, like errors that occur while the process is
161 processing stopping, like errors that occur while the process is
161 attempting to be shut down. This deferred won't fire when the process
162 attempting to be shut down. This deferred won't fire when the process
162 actually stops. To observe the actual process stopping, see
163 actually stops. To observe the actual process stopping, see
163 :func:`observe_stop`.
164 :func:`observe_stop`.
164 """
165 """
165 raise NotImplementedError('stop must be implemented in a subclass')
166 raise NotImplementedError('stop must be implemented in a subclass')
166
167
167 def on_stop(self, f):
168 def on_stop(self, f):
168 """Get a deferred that will fire when the process stops.
169 """Get a deferred that will fire when the process stops.
169
170
170 The deferred will fire with data that contains information about
171 The deferred will fire with data that contains information about
171 the exit status of the process.
172 the exit status of the process.
172 """
173 """
173 if self.state=='after':
174 if self.state=='after':
174 return f(self.stop_data)
175 return f(self.stop_data)
175 else:
176 else:
176 self.stop_callbacks.append(f)
177 self.stop_callbacks.append(f)
177
178
178 def notify_start(self, data):
179 def notify_start(self, data):
179 """Call this to trigger startup actions.
180 """Call this to trigger startup actions.
180
181
181 This logs the process startup and sets the state to 'running'. It is
182 This logs the process startup and sets the state to 'running'. It is
182 a pass-through so it can be used as a callback.
183 a pass-through so it can be used as a callback.
183 """
184 """
184
185
185 self.log.info('Process %r started: %r' % (self.args[0], data))
186 self.log.info('Process %r started: %r' % (self.args[0], data))
186 self.start_data = data
187 self.start_data = data
187 self.state = 'running'
188 self.state = 'running'
188 return data
189 return data
189
190
190 def notify_stop(self, data):
191 def notify_stop(self, data):
191 """Call this to trigger process stop actions.
192 """Call this to trigger process stop actions.
192
193
193 This logs the process stopping and sets the state to 'after'. Call
194 This logs the process stopping and sets the state to 'after'. Call
194 this to trigger all the deferreds from :func:`observe_stop`."""
195 this to trigger all the deferreds from :func:`observe_stop`."""
195
196
196 self.log.info('Process %r stopped: %r' % (self.args[0], data))
197 self.log.info('Process %r stopped: %r' % (self.args[0], data))
197 self.stop_data = data
198 self.stop_data = data
198 self.state = 'after'
199 self.state = 'after'
199 for i in range(len(self.stop_callbacks)):
200 for i in range(len(self.stop_callbacks)):
200 d = self.stop_callbacks.pop()
201 d = self.stop_callbacks.pop()
201 d(data)
202 d(data)
202 return data
203 return data
203
204
204 def signal(self, sig):
205 def signal(self, sig):
205 """Signal the process.
206 """Signal the process.
206
207
207 Return a semi-meaningless deferred after signaling the process.
208 Return a semi-meaningless deferred after signaling the process.
208
209
209 Parameters
210 Parameters
210 ----------
211 ----------
211 sig : str or int
212 sig : str or int
212 'KILL', 'INT', etc., or any signal number
213 'KILL', 'INT', etc., or any signal number
213 """
214 """
214 raise NotImplementedError('signal must be implemented in a subclass')
215 raise NotImplementedError('signal must be implemented in a subclass')
215
216
216
217
217 #-----------------------------------------------------------------------------
218 #-----------------------------------------------------------------------------
218 # Local process launchers
219 # Local process launchers
219 #-----------------------------------------------------------------------------
220 #-----------------------------------------------------------------------------
220
221
221
222
class LocalProcessLauncher(BaseLauncher):
    """Start and stop an external process in an asynchronous manner.

    The child is spawned with ``self.work_dir`` as its working directory.
    Its stdout/stderr are forwarded to the logger through ioloop handlers,
    and a periodic callback polls for process exit.
    """

    # Used to construct self.args, which is handed to Popen.
    cmd_and_args = List([])
    # How often to poll the child for exit, in ms.
    poll_frequency = Int(100)

    def __init__(self, work_dir=u'.', config=None, **kwargs):
        super(LocalProcessLauncher, self).__init__(
            work_dir=work_dir, config=config, **kwargs
        )
        self.process = None
        self.start_deferred = None
        self.poller = None

    def find_args(self):
        return self.cmd_and_args

    def start(self):
        # Guard clause: refuse to start twice.
        if self.state != 'before':
            msg = 'The process was already started and has state: %r' % self.state
            raise ProcessStateError(msg)
        self.process = Popen(self.args,
                             stdout=PIPE, stderr=PIPE, stdin=PIPE,
                             env=os.environ,
                             cwd=self.work_dir
                             )
        # Forward the child's output streams and watch for exit.
        self.loop.add_handler(self.process.stdout.fileno(), self.handle_stdout, self.loop.READ)
        self.loop.add_handler(self.process.stderr.fileno(), self.handle_stderr, self.loop.READ)
        self.poller = ioloop.PeriodicCallback(self.poll, self.poll_frequency, self.loop)
        self.poller.start()
        self.notify_start(self.process.pid)

    def stop(self):
        return self.interrupt_then_kill()

    def signal(self, sig):
        if self.state == 'running':
            self.process.send_signal(sig)

    def interrupt_then_kill(self, delay=2.0):
        """Send INT, wait a delay and then send KILL."""
        self.signal(SIGINT)
        self.killer = ioloop.DelayedCallback(lambda: self.signal(SIGKILL), delay * 1000, self.loop)
        self.killer.start()

    # callbacks, etc:

    def handle_stdout(self, fd, events):
        text = self.process.stdout.readline()
        # a stopped process will be readable but return empty strings
        if not text:
            self.poll()
        else:
            self.log.info(text[:-1])

    def handle_stderr(self, fd, events):
        text = self.process.stderr.readline()
        # a stopped process will be readable but return empty strings
        if not text:
            self.poll()
        else:
            self.log.error(text[:-1])

    def poll(self):
        exit_status = self.process.poll()
        if exit_status is not None:
            # Child exited: stop watching and fire stop notifications.
            self.poller.stop()
            self.loop.remove_handler(self.process.stdout.fileno())
            self.loop.remove_handler(self.process.stderr.fileno())
            self.notify_stop(dict(exit_code=exit_status, pid=self.process.pid))
        return exit_status
301
302
class LocalControllerLauncher(LocalProcessLauncher):
    """Launch a controller as a regular external process."""

    controller_cmd = List(ipcontroller_cmd_argv, config=True)
    # Command line arguments to ipcontroller.
    controller_args = List(['--log-to-file','--log-level', str(logging.INFO)], config=True)

    def find_args(self):
        return self.controller_cmd + self.controller_args

    def start(self, cluster_dir):
        """Start the controller by cluster_dir."""
        self.cluster_dir = unicode(cluster_dir)
        # NOTE(review): extends the trait in place, so restarting the same
        # launcher appends '--cluster-dir' again — confirm single-use intent.
        self.controller_args.extend(['--cluster-dir', cluster_dir])
        self.log.info("Starting LocalControllerLauncher: %r" % self.args)
        return super(LocalControllerLauncher, self).start()
318
319
319
320
class LocalEngineLauncher(LocalProcessLauncher):
    """Launch a single engine as a regular external process."""

    engine_cmd = List(ipengine_cmd_argv, config=True)
    # Command line arguments for ipengine.
    engine_args = List(
        ['--log-to-file','--log-level', str(logging.INFO)], config=True
    )

    def find_args(self):
        return self.engine_cmd + self.engine_args

    def start(self, cluster_dir):
        """Start the engine by cluster_dir."""
        self.cluster_dir = unicode(cluster_dir)
        self.engine_args.extend(['--cluster-dir', cluster_dir])
        return super(LocalEngineLauncher, self).start()
337
338
338
339
class LocalEngineSetLauncher(BaseLauncher):
    """Launch a set of engines as regular external processes."""

    # Command line arguments for ipengine.
    engine_args = List(
        ['--log-to-file','--log-level', str(logging.INFO)], config=True
    )
    # launcher class
    launcher_class = LocalEngineLauncher

    # Live launchers keyed by engine index; stop data keyed the same way.
    launchers = Dict()
    stop_data = Dict()

    def __init__(self, work_dir=u'.', config=None, **kwargs):
        super(LocalEngineSetLauncher, self).__init__(
            work_dir=work_dir, config=config, **kwargs
        )
        self.stop_data = {}

    def start(self, n, cluster_dir):
        """Start n engines by profile or cluster_dir."""
        import copy
        self.cluster_dir = unicode(cluster_dir)
        dlist = []
        for idx in range(n):
            launcher = self.launcher_class(work_dir=self.work_dir, config=self.config, logname=self.log.name)
            # Each engine launcher gets its own copy of the engine args.
            launcher.engine_args = copy.deepcopy(self.engine_args)
            launcher.on_stop(self._notice_engine_stopped)
            d = launcher.start(cluster_dir)
            if idx == 0:
                self.log.info("Starting LocalEngineSetLauncher: %r" % launcher.args)
            self.launchers[idx] = launcher
            dlist.append(d)
        self.notify_start(dlist)
        # The consumeErrors here could be dangerous
        # dfinal = gatherBoth(dlist, consumeErrors=True)
        # dfinal.addCallback(self.notify_start)
        return dlist

    def find_args(self):
        return ['engine set']

    def signal(self, sig):
        dlist = []
        for launcher in self.launchers.itervalues():
            dlist.append(launcher.signal(sig))
        # dfinal = gatherBoth(dlist, consumeErrors=True)
        return dlist

    def interrupt_then_kill(self, delay=1.0):
        dlist = []
        for launcher in self.launchers.itervalues():
            dlist.append(launcher.interrupt_then_kill(delay))
        # dfinal = gatherBoth(dlist, consumeErrors=True)
        return dlist

    def stop(self):
        return self.interrupt_then_kill()

    def _notice_engine_stopped(self, data):
        # Locate the launcher whose child pid matches, retire it, and
        # fire notify_stop once every engine has gone away.
        pid = data['pid']
        for idx, launcher in self.launchers.iteritems():
            if launcher.process.pid == pid:
                break
        self.launchers.pop(idx)
        self.stop_data[idx] = data
        if not self.launchers:
            self.notify_stop(self.stop_data)
409
410
410
411
411 #-----------------------------------------------------------------------------
412 #-----------------------------------------------------------------------------
412 # MPIExec launchers
413 # MPIExec launchers
413 #-----------------------------------------------------------------------------
414 #-----------------------------------------------------------------------------
414
415
415
416
class MPIExecLauncher(LocalProcessLauncher):
    """Launch an external process using mpiexec."""

    # The mpiexec command to use in starting the process.
    mpi_cmd = List(['mpiexec'], config=True)
    # The command line arguments to pass to mpiexec.
    mpi_args = List([], config=True)
    # The program to start using mpiexec.
    program = List(['date'], config=True)
    # The command line argument to the program.
    program_args = List([], config=True)
    # The number of instances of the program to start.
    n = Int(1, config=True)

    def find_args(self):
        """Build self.args using all the fields."""
        # n must be stringified: Popen argv elements are strings.
        argv = self.mpi_cmd + ['-n', str(self.n)] + self.mpi_args
        argv += self.program + self.program_args
        return argv

    def start(self, n):
        """Start n instances of the program using mpiexec."""
        self.n = n
        return super(MPIExecLauncher, self).start()
439
440
440
441
class MPIExecControllerLauncher(MPIExecLauncher):
    """Launch a controller using mpiexec."""

    controller_cmd = List(ipcontroller_cmd_argv, config=True)
    # Command line arguments to ipcontroller.
    controller_args = List(['--log-to-file','--log-level', str(logging.INFO)], config=True)
    n = Int(1, config=False)

    def start(self, cluster_dir):
        """Start the controller by cluster_dir."""
        self.controller_args.extend(['--cluster-dir', cluster_dir])
        self.cluster_dir = unicode(cluster_dir)
        self.log.info("Starting MPIExecControllerLauncher: %r" % self.args)
        return super(MPIExecControllerLauncher, self).start(1)

    def find_args(self):
        """Build the full mpiexec argv for the controller.

        FIX: stringify ``self.n`` — it is an Int trait, and every element
        of a Popen argv must be a string (the base class already does this).
        """
        return self.mpi_cmd + ['-n', str(self.n)] + self.mpi_args + \
               self.controller_cmd + self.controller_args
459
460
460
461
class MPIExecEngineSetLauncher(MPIExecLauncher):
    """Launch a set of engines using mpiexec.

    Reuses the inherited ``find_args`` by pointing ``program`` and
    ``program_args`` at ipengine.
    """

    program = List(ipengine_cmd_argv, config=True)
    # Command line arguments for ipengine.
    program_args = List(
        ['--log-to-file','--log-level', str(logging.INFO)], config=True
    )
    n = Int(1, config=True)

    def start(self, n, cluster_dir):
        """Start n engines by profile or cluster_dir."""
        self.cluster_dir = unicode(cluster_dir)
        self.program_args.extend(['--cluster-dir', cluster_dir])
        self.n = n
        self.log.info('Starting MPIExecEngineSetLauncher: %r' % self.args)
        return super(MPIExecEngineSetLauncher, self).start(n)
481
482
483 #-----------------------------------------------------------------------------
479 #-----------------------------------------------------------------------------
484 # SSH launchers
480 # SSH launchers
485 #-----------------------------------------------------------------------------
481 #-----------------------------------------------------------------------------
486
482
487 # TODO: Get SSH Launcher working again.
483 # TODO: Get SSH Launcher working again.
488
484
class SSHLauncher(LocalProcessLauncher):
    """A minimal launcher for ssh.

    To be useful this will probably have to be extended to use the ``sshx``
    idea for environment variables. There could be other things this needs
    as well.
    """

    ssh_cmd = List(['ssh'], config=True)
    ssh_args = List(['-tt'], config=True)
    program = List(['date'], config=True)
    program_args = List([], config=True)
    hostname = Str('', config=True)
    user = Str('', config=True)
    # 'user@host' (or bare 'host' when user is empty), kept in sync by the
    # trait-change handlers below.
    location = Str('')

    def _hostname_changed(self, name, old, new):
        if self.user:
            self.location = '%s@%s' % (self.user, new)
        else:
            self.location = new

    def _user_changed(self, name, old, new):
        # FIX: mirror _hostname_changed — previously an empty user still
        # produced '@hostname', which is not a valid ssh destination.
        if new:
            self.location = '%s@%s' % (new, self.hostname)
        else:
            self.location = self.hostname

    def find_args(self):
        return self.ssh_cmd + self.ssh_args + [self.location] + \
               self.program + self.program_args

    def start(self, cluster_dir, hostname=None, user=None):
        """Start the remote process over ssh.

        hostname/user, when given, override the configured traits before
        the connection is made.
        """
        self.cluster_dir = unicode(cluster_dir)
        if hostname is not None:
            self.hostname = hostname
        if user is not None:
            self.user = user

        return super(SSHLauncher, self).start()

    def signal(self, sig):
        if self.state == 'running':
            # send escaped ssh connection-closer
            self.process.stdin.write('~.')
            self.process.stdin.flush()
529
528
530
529
531
530
class SSHControllerLauncher(SSHLauncher):
    """Launch ipcontroller on a remote host over ssh."""

    program = List(ipcontroller_cmd_argv, config=True)
    # Command line arguments to ipcontroller ('-r' reuses connection files).
    program_args = List(['-r', '--log-to-file','--log-level', str(logging.INFO)], config=True)
537
536
538
537
class SSHEngineLauncher(SSHLauncher):
    """Launch a single ipengine on a remote host over ssh."""

    program = List(ipengine_cmd_argv, config=True)
    # Command line arguments for ipengine.
    program_args = List(
        ['--log-to-file','--log-level', str(logging.INFO)], config=True
    )
545
544
class SSHEngineSetLauncher(LocalEngineSetLauncher):
    launcher_class = SSHEngineLauncher
    # Maps 'user@host' (or bare 'host') to an engine count, or to a
    # (count, args) tuple for per-host engine arguments.
    engines = Dict(config=True)

    def start(self, n, cluster_dir):
        """Start engines by profile or cluster_dir.
        `n` is ignored, and the `engines` config property is used instead.
        """

        self.cluster_dir = unicode(cluster_dir)
        dlist = []
        for host, n in self.engines.iteritems():
            if isinstance(n, (tuple, list)):
                n, args = n
            else:
                args = copy.deepcopy(self.engine_args)

            if '@' in host:
                user, host = host.split('@', 1)
            else:
                user = None
            for i in range(n):
                el = self.launcher_class(work_dir=self.work_dir, config=self.config, logname=self.log.name)

                # FIX: removed a stray no-op statement (`i`) that was here,
                # and copy the args per engine so launchers on the same host
                # do not share (and cannot cross-mutate) one argument list.
                el.program_args = copy.deepcopy(args)
                el.on_stop(self._notice_engine_stopped)
                d = el.start(cluster_dir, user=user, hostname=host)
                if i == 0:
                    self.log.info("Starting SSHEngineSetLauncher: %r" % el.args)
                self.launchers[host + str(i)] = el
                dlist.append(d)
        self.notify_start(dlist)
        return dlist
580
548
581
549
582
550 #-----------------------------------------------------------------------------
583 #-----------------------------------------------------------------------------
551 # Windows HPC Server 2008 scheduler launchers
584 # Windows HPC Server 2008 scheduler launchers
552 #-----------------------------------------------------------------------------
585 #-----------------------------------------------------------------------------
553
586
554
587
555 # This is only used on Windows.
588 # This is only used on Windows.
def find_job_cmd():
    """Locate the Windows HPC ``job`` command.

    On non-Windows platforms (or when the executable cannot be found)
    the bare name 'job' is returned.
    """
    if os.name != 'nt':
        return 'job'
    try:
        return find_cmd('job')
    except FindCmdError:
        return 'job'
564
597
565
598
class WindowsHPCLauncher(BaseLauncher):
    """Base launcher that submits/cancels jobs with the Windows HPC
    Server 2008 ``job`` command-line tool."""

    # A regular expression used to get the job id from the output of the
    # submit_command.
    job_id_regexp = Str(r'\d+', config=True)
    # The filename of the instantiated job script.
    job_file_name = Unicode(u'ipython_job.xml', config=True)
    # The full path to the instantiated job script. This gets made dynamically
    # by combining the work_dir with the job_file_name.
    # NOTE(review): this trait is shadowed by the job_file property below.
    job_file = Unicode(u'')
    # The hostname of the scheduler to submit the job to
    scheduler = Str('', config=True)
    job_cmd = Str(find_job_cmd(), config=True)

    def __init__(self, work_dir=u'.', config=None, **kwargs):
        super(WindowsHPCLauncher, self).__init__(
            work_dir=work_dir, config=config, **kwargs
        )

    @property
    def job_file(self):
        return os.path.join(self.work_dir, self.job_file_name)

    def write_job_file(self, n):
        raise NotImplementedError("Implement write_job_file in a subclass.")

    def find_args(self):
        return ['job.exe']

    def parse_job_id(self, output):
        """Take the output of the submit command and return the job id."""
        m = re.search(self.job_id_regexp, output)
        if m is not None:
            job_id = m.group()
        else:
            raise LauncherError("Job id couldn't be determined: %s" % output)
        self.job_id = job_id
        self.log.info('Job started with job id: %r' % job_id)
        return job_id

    def start(self, n):
        """Start n copies of the process using the Win HPC job scheduler."""
        self.write_job_file(n)
        args = [
            'submit',
            '/jobfile:%s' % self.job_file,
            '/scheduler:%s' % self.scheduler
        ]
        self.log.info("Starting Win HPC Job: %s" % (self.job_cmd + ' ' + ' '.join(args),))
        # Twisted will raise DeprecationWarnings if we try to pass unicode to this
        output = check_output([self.job_cmd]+args,
            env=os.environ,
            cwd=self.work_dir,
            stderr=STDOUT
        )
        job_id = self.parse_job_id(output)
        self.notify_start(job_id)
        return job_id

    def stop(self):
        """Cancel the job; best-effort, succeeds even if it is already gone."""
        args = [
            'cancel',
            self.job_id,
            '/scheduler:%s' % self.scheduler
        ]
        self.log.info("Stopping Win HPC Job: %s" % (self.job_cmd + ' ' + ' '.join(args),))
        try:
            output = check_output([self.job_cmd]+args,
                env=os.environ,
                cwd=self.work_dir,
                stderr=STDOUT
            )
        except Exception:
            # FIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; also fixed "stoppped" typo.
            output = 'The job already appears to be stopped: %r' % self.job_id
        self.notify_stop(dict(job_id=self.job_id, output=output))  # Pass the output of the kill cmd
        return output
642
675
643
676
class WindowsHPCControllerLauncher(WindowsHPCLauncher):
    """Launch a single controller through the Win HPC scheduler."""

    job_file_name = Unicode(u'ipcontroller_job.xml', config=True)
    extra_args = List([], config=False)

    def write_job_file(self, n):
        """Write the XML job description for a one-task controller job."""
        job = IPControllerJob(config=self.config)

        task = IPControllerTask(config=self.config)
        # The task's work directory is *not* the actual work directory of
        # the controller; it is only the base path for the stdout/stderr
        # files that the scheduler redirects to.
        task.work_directory = self.cluster_dir
        # Append the --cluster-dir arguments recorded by self.start().
        task.controller_args.extend(self.extra_args)
        job.add_task(task)

        self.log.info("Writing job description file: %s" % self.job_file)
        job.write(self.job_file)

    @property
    def job_file(self):
        return os.path.join(self.cluster_dir, self.job_file_name)

    def start(self, cluster_dir):
        """Start the controller by cluster_dir."""
        self.extra_args = ['--cluster-dir', cluster_dir]
        self.cluster_dir = unicode(cluster_dir)
        return super(WindowsHPCControllerLauncher, self).start(1)
673
706
674
707
class WindowsHPCEngineSetLauncher(WindowsHPCLauncher):
    """Launch a set of engines through the Win HPC scheduler."""

    job_file_name = Unicode(u'ipengineset_job.xml', config=True)
    extra_args = List([], config=False)

    def write_job_file(self, n):
        """Write the XML job description containing one task per engine."""
        job = IPEngineSetJob(config=self.config)

        for i in range(n):
            t = IPEngineTask(config=self.config)
            # The tasks work directory is *not* the actual work directory of
            # the engine. It is used as the base path for the stdout/stderr
            # files that the scheduler redirects to.
            t.work_directory = self.cluster_dir
            # Add the --cluster-dir and from self.start().
            t.engine_args.extend(self.extra_args)
            job.add_task(t)

        self.log.info("Writing job description file: %s" % self.job_file)
        job.write(self.job_file)

    @property
    def job_file(self):
        return os.path.join(self.cluster_dir, self.job_file_name)

    def start(self, n, cluster_dir):
        """Start n engines by cluster_dir.

        FIX: docstring previously said "Start the controller" — this
        launcher starts engines.
        """
        self.extra_args = ['--cluster-dir', cluster_dir]
        self.cluster_dir = unicode(cluster_dir)
        return super(WindowsHPCEngineSetLauncher, self).start(n)
705
738
706
739
707 #-----------------------------------------------------------------------------
740 #-----------------------------------------------------------------------------
708 # Batch (PBS) system launchers
741 # Batch (PBS) system launchers
709 #-----------------------------------------------------------------------------
742 #-----------------------------------------------------------------------------
710
743
711 # TODO: Get PBS launcher working again.
712
713 class BatchSystemLauncher(BaseLauncher):
744 class BatchSystemLauncher(BaseLauncher):
714 """Launch an external process using a batch system.
745 """Launch an external process using a batch system.
715
746
716 This class is designed to work with UNIX batch systems like PBS, LSF,
747 This class is designed to work with UNIX batch systems like PBS, LSF,
717 GridEngine, etc. The overall model is that there are different commands
748 GridEngine, etc. The overall model is that there are different commands
718 like qsub, qdel, etc. that handle the starting and stopping of the process.
749 like qsub, qdel, etc. that handle the starting and stopping of the process.
719
750
720 This class also has the notion of a batch script. The ``batch_template``
751 This class also has the notion of a batch script. The ``batch_template``
721 attribute can be set to a string that is a template for the batch script.
752 attribute can be set to a string that is a template for the batch script.
722 This template is instantiated using Itpl. Thus the template can use
753 This template is instantiated using Itpl. Thus the template can use
723 ${n} fot the number of instances. Subclasses can add additional variables
754 ${n} fot the number of instances. Subclasses can add additional variables
724 to the template dict.
755 to the template dict.
725 """
756 """
726
757
727 # Subclasses must fill these in. See PBSEngineSet
758 # Subclasses must fill these in. See PBSEngineSet
728 # The name of the command line program used to submit jobs.
759 # The name of the command line program used to submit jobs.
729 submit_command = Str('', config=True)
760 submit_command = Str('', config=True)
730 # The name of the command line program used to delete jobs.
761 # The name of the command line program used to delete jobs.
731 delete_command = Str('', config=True)
762 delete_command = Str('', config=True)
732 # A regular expression used to get the job id from the output of the
763 # A regular expression used to get the job id from the output of the
733 # submit_command.
764 # submit_command.
734 job_id_regexp = Str('', config=True)
765 job_id_regexp = Str('', config=True)
735 # The string that is the batch script template itself.
766 # The string that is the batch script template itself.
736 batch_template = Str('', config=True)
767 batch_template = Str('', config=True)
737 # The filename of the instantiated batch script.
768 # The filename of the instantiated batch script.
738 batch_file_name = Unicode(u'batch_script', config=True)
769 batch_file_name = Unicode(u'batch_script', config=True)
739 # The full path to the instantiated batch script.
770 # The full path to the instantiated batch script.
740 batch_file = Unicode(u'')
771 batch_file = Unicode(u'')
741 # the format dict used with batch_template:
772 # the format dict used with batch_template:
742 context = Dict()
773 context = Dict()
743
774
744
775
745 def find_args(self):
776 def find_args(self):
746 return [self.submit_command]
777 return [self.submit_command, self.batch_file]
747
778
748 def __init__(self, work_dir=u'.', config=None, **kwargs):
779 def __init__(self, work_dir=u'.', config=None, **kwargs):
749 super(BatchSystemLauncher, self).__init__(
780 super(BatchSystemLauncher, self).__init__(
750 work_dir=work_dir, config=config, **kwargs
781 work_dir=work_dir, config=config, **kwargs
751 )
782 )
752 self.batch_file = os.path.join(self.work_dir, self.batch_file_name)
783 self.batch_file = os.path.join(self.work_dir, self.batch_file_name)
753
784
754 def parse_job_id(self, output):
785 def parse_job_id(self, output):
755 """Take the output of the submit command and return the job id."""
786 """Take the output of the submit command and return the job id."""
756 m = re.match(self.job_id_regexp, output)
787 m = re.search(self.job_id_regexp, output)
757 if m is not None:
788 if m is not None:
758 job_id = m.group()
789 job_id = m.group()
759 else:
790 else:
760 raise LauncherError("Job id couldn't be determined: %s" % output)
791 raise LauncherError("Job id couldn't be determined: %s" % output)
761 self.job_id = job_id
792 self.job_id = job_id
762 self.log.info('Job started with job id: %r' % job_id)
793 self.log.info('Job submitted with job id: %r' % job_id)
763 return job_id
794 return job_id
764
795
765 def write_batch_script(self, n):
796 def write_batch_script(self, n):
766 """Instantiate and write the batch script to the work_dir."""
797 """Instantiate and write the batch script to the work_dir."""
767 self.context['n'] = n
798 self.context['n'] = n
768 script_as_string = Itpl.itplns(self.batch_template, self.context)
799 script_as_string = Itpl.itplns(self.batch_template, self.context)
769 self.log.info('Writing instantiated batch script: %s' % self.batch_file)
800 self.log.info('Writing instantiated batch script: %s' % self.batch_file)
770 f = open(self.batch_file, 'w')
801 f = open(self.batch_file, 'w')
771 f.write(script_as_string)
802 f.write(script_as_string)
772 f.close()
803 f.close()
773
804
774 def start(self, n, cluster_dir):
805 def start(self, n, cluster_dir):
775 """Start n copies of the process using a batch system."""
806 """Start n copies of the process using a batch system."""
776 # Here we save profile and cluster_dir in the context so they
807 # Here we save profile and cluster_dir in the context so they
777 # can be used in the batch script template as ${profile} and
808 # can be used in the batch script template as ${profile} and
778 # ${cluster_dir}
809 # ${cluster_dir}
779 self.context['cluster_dir'] = cluster_dir
810 self.context['cluster_dir'] = cluster_dir
780 self.cluster_dir = unicode(cluster_dir)
811 self.cluster_dir = unicode(cluster_dir)
781 self.write_batch_script(n)
812 self.write_batch_script(n)
782 output = check_output([self.submit_command, self.batch_file], env=os.environ, stdout=STDOUT)
813 output = check_output(self.args, env=os.environ)
814
783 job_id = self.parse_job_id(output)
815 job_id = self.parse_job_id(output)
784 # self.notify_start(job_id)
816 self.notify_start(job_id)
785 return job_id
817 return job_id
786
818
787 def stop(self):
819 def stop(self):
788 output = check_output([self.delete_command, self.job_id], env=os.environ, stderr=STDOUT)
820 output = check_output([self.delete_command, self.job_id], env=os.environ)
789 self.notify_stop(output) # Pass the output of the kill cmd
821 self.notify_stop(dict(job_id=self.job_id, output=output)) # Pass the output of the kill cmd
790 return output
822 return output
791
823
792
824
793 class PBSLauncher(BatchSystemLauncher):
825 class PBSLauncher(BatchSystemLauncher):
794 """A BatchSystemLauncher subclass for PBS."""
826 """A BatchSystemLauncher subclass for PBS."""
795
827
796 submit_command = Str('qsub', config=True)
828 submit_command = Str('qsub', config=True)
797 delete_command = Str('qdel', config=True)
829 delete_command = Str('qdel', config=True)
798 job_id_regexp = Str(r'\d+', config=True)
830 job_id_regexp = Str(r'\d+', config=True)
799 batch_template = Str('', config=True)
831 batch_template = Str('', config=True)
800 batch_file_name = Unicode(u'pbs_batch_script', config=True)
832 batch_file_name = Unicode(u'pbs_batch_script', config=True)
801 batch_file = Unicode(u'')
833 batch_file = Unicode(u'')
802
834
803
835
804 class PBSControllerLauncher(PBSLauncher):
836 class PBSControllerLauncher(PBSLauncher):
805 """Launch a controller using PBS."""
837 """Launch a controller using PBS."""
806
838
807 batch_file_name = Unicode(u'pbs_batch_script_controller', config=True)
839 batch_file_name = Unicode(u'pbs_batch_script_controller', config=True)
808
840
809 def start(self, cluster_dir):
841 def start(self, cluster_dir):
810 """Start the controller by profile or cluster_dir."""
842 """Start the controller by profile or cluster_dir."""
811 self.log.info("Starting PBSControllerLauncher: %r" % self.args)
843 self.log.info("Starting PBSControllerLauncher: %r" % self.args)
812 return super(PBSControllerLauncher, self).start(1, cluster_dir)
844 return super(PBSControllerLauncher, self).start(1, cluster_dir)
813
845
814
846
815 class PBSEngineSetLauncher(PBSLauncher):
847 class PBSEngineSetLauncher(PBSLauncher):
816
848
817 batch_file_name = Unicode(u'pbs_batch_script_engines', config=True)
849 batch_file_name = Unicode(u'pbs_batch_script_engines', config=True)
818
850
819 def start(self, n, cluster_dir):
851 def start(self, n, cluster_dir):
820 """Start n engines by profile or cluster_dir."""
852 """Start n engines by profile or cluster_dir."""
821 self.log.info('Starting PBSEngineSetLauncher: %r' % self.args)
853 self.log.info('Starting PBSEngineSetLauncher: %r' % self.args)
822 return super(PBSEngineSetLauncher, self).start(n, cluster_dir)
854 return super(PBSEngineSetLauncher, self).start(n, cluster_dir)
823
855
824
856
825 #-----------------------------------------------------------------------------
857 #-----------------------------------------------------------------------------
826 # A launcher for ipcluster itself!
858 # A launcher for ipcluster itself!
827 #-----------------------------------------------------------------------------
859 #-----------------------------------------------------------------------------
828
860
829
861
830 class IPClusterLauncher(LocalProcessLauncher):
862 class IPClusterLauncher(LocalProcessLauncher):
831 """Launch the ipcluster program in an external process."""
863 """Launch the ipcluster program in an external process."""
832
864
833 ipcluster_cmd = List(ipcluster_cmd_argv, config=True)
865 ipcluster_cmd = List(ipcluster_cmd_argv, config=True)
834 # Command line arguments to pass to ipcluster.
866 # Command line arguments to pass to ipcluster.
835 ipcluster_args = List(
867 ipcluster_args = List(
836 ['--clean-logs', '--log-to-file', '--log-level', str(logging.INFO)], config=True)
868 ['--clean-logs', '--log-to-file', '--log-level', str(logging.INFO)], config=True)
837 ipcluster_subcommand = Str('start')
869 ipcluster_subcommand = Str('start')
838 ipcluster_n = Int(2)
870 ipcluster_n = Int(2)
839
871
840 def find_args(self):
872 def find_args(self):
841 return self.ipcluster_cmd + [self.ipcluster_subcommand] + \
873 return self.ipcluster_cmd + [self.ipcluster_subcommand] + \
842 ['-n', repr(self.ipcluster_n)] + self.ipcluster_args
874 ['-n', repr(self.ipcluster_n)] + self.ipcluster_args
843
875
844 def start(self):
876 def start(self):
845 self.log.info("Starting ipcluster: %r" % self.args)
877 self.log.info("Starting ipcluster: %r" % self.args)
846 return super(IPClusterLauncher, self).start()
878 return super(IPClusterLauncher, self).start()
847
879
@@ -1,403 +1,490 b''
1 .. _parallel_process:
1 .. _parallel_process:
2
2
3 ===========================================
3 ===========================================
4 Starting the IPython controller and engines
4 Starting the IPython controller and engines
5 ===========================================
5 ===========================================
6
6
7 To use IPython for parallel computing, you need to start one instance of
7 To use IPython for parallel computing, you need to start one instance of
8 the controller and one or more instances of the engine. The controller
8 the controller and one or more instances of the engine. The controller
9 and each engine can run on different machines or on the same machine.
9 and each engine can run on different machines or on the same machine.
10 Because of this, there are many different possibilities.
10 Because of this, there are many different possibilities.
11
11
12 Broadly speaking, there are two ways of going about starting a controller and engines:
12 Broadly speaking, there are two ways of going about starting a controller and engines:
13
13
14 * In an automated manner using the :command:`ipclusterz` command.
14 * In an automated manner using the :command:`ipclusterz` command.
15 * In a more manual way using the :command:`ipcontrollerz` and
15 * In a more manual way using the :command:`ipcontrollerz` and
16 :command:`ipenginez` commands.
16 :command:`ipenginez` commands.
17
17
18 This document describes both of these methods. We recommend that new users
18 This document describes both of these methods. We recommend that new users
19 start with the :command:`ipclusterz` command as it simplifies many common usage
19 start with the :command:`ipclusterz` command as it simplifies many common usage
20 cases.
20 cases.
21
21
22 General considerations
22 General considerations
23 ======================
23 ======================
24
24
25 Before delving into the details about how you can start a controller and
25 Before delving into the details about how you can start a controller and
26 engines using the various methods, we outline some of the general issues that
26 engines using the various methods, we outline some of the general issues that
27 come up when starting the controller and engines. These things come up no
27 come up when starting the controller and engines. These things come up no
28 matter which method you use to start your IPython cluster.
28 matter which method you use to start your IPython cluster.
29
29
30 Let's say that you want to start the controller on ``host0`` and engines on
30 Let's say that you want to start the controller on ``host0`` and engines on
31 hosts ``host1``-``hostn``. The following steps are then required:
31 hosts ``host1``-``hostn``. The following steps are then required:
32
32
33 1. Start the controller on ``host0`` by running :command:`ipcontrollerz` on
33 1. Start the controller on ``host0`` by running :command:`ipcontrollerz` on
34 ``host0``.
34 ``host0``.
35 2. Move the JSON file (:file:`ipcontroller-engine.json`) created by the
35 2. Move the JSON file (:file:`ipcontroller-engine.json`) created by the
36 controller from ``host0`` to hosts ``host1``-``hostn``.
36 controller from ``host0`` to hosts ``host1``-``hostn``.
37 3. Start the engines on hosts ``host1``-``hostn`` by running
37 3. Start the engines on hosts ``host1``-``hostn`` by running
38 :command:`ipenginez`. This command has to be told where the JSON file
38 :command:`ipenginez`. This command has to be told where the JSON file
39 (:file:`ipcontroller-engine.json`) is located.
39 (:file:`ipcontroller-engine.json`) is located.
40
40
41 At this point, the controller and engines will be connected. By default, the JSON files
41 At this point, the controller and engines will be connected. By default, the JSON files
42 created by the controller are put into the :file:`~/.ipython/clusterz_default/security`
42 created by the controller are put into the :file:`~/.ipython/clusterz_default/security`
43 directory. If the engines share a filesystem with the controller, step 2 can be skipped as
43 directory. If the engines share a filesystem with the controller, step 2 can be skipped as
44 the engines will automatically look at that location.
44 the engines will automatically look at that location.
45
45
46 The final step required to actually use the running controller from a client is to move
46 The final step required to actually use the running controller from a client is to move
47 the JSON file :file:`ipcontroller-client.json` from ``host0`` to any host where clients
47 the JSON file :file:`ipcontroller-client.json` from ``host0`` to any host where clients
48 will be run. If these file are put into the :file:`~/.ipython/clusterz_default/security`
48 will be run. If these file are put into the :file:`~/.ipython/clusterz_default/security`
49 directory of the client's host, they will be found automatically. Otherwise, the full path
49 directory of the client's host, they will be found automatically. Otherwise, the full path
50 to them has to be passed to the client's constructor.
50 to them has to be passed to the client's constructor.
51
51
52 Using :command:`ipclusterz`
52 Using :command:`ipclusterz`
53 ==========================
53 ==========================
54
54
55 The :command:`ipclusterz` command provides a simple way of starting a
55 The :command:`ipclusterz` command provides a simple way of starting a
56 controller and engines in the following situations:
56 controller and engines in the following situations:
57
57
58 1. When the controller and engines are all run on localhost. This is useful
58 1. When the controller and engines are all run on localhost. This is useful
59 for testing or running on a multicore computer.
59 for testing or running on a multicore computer.
60 2. When engines are started using the :command:`mpirun` command that comes
60 2. When engines are started using the :command:`mpirun` command that comes
61 with most MPI [MPI]_ implementations
61 with most MPI [MPI]_ implementations
62 3. When engines are started using the PBS [PBS]_ batch system.
62 3. When engines are started using the PBS [PBS]_ batch system
63 (or other `qsub` systems, such as SGE).
63 4. When the controller is started on localhost and the engines are started on
64 4. When the controller is started on localhost and the engines are started on
64 remote nodes using :command:`ssh`.
65 remote nodes using :command:`ssh`.
65
66 5. When engines are started using the Windows HPC Server batch system.
66 .. note::
67
68 It is also possible for advanced users to add support to
69 :command:`ipclusterz` for starting controllers and engines using other
70 methods (like Sun's Grid Engine for example).
71
67
72 .. note::
68 .. note::
73
69
74 Currently :command:`ipclusterz` requires that the
70 Currently :command:`ipclusterz` requires that the
75 :file:`~/.ipython/cluster_<profile>/security` directory live on a shared filesystem that is
71 :file:`~/.ipython/cluster_<profile>/security` directory live on a shared filesystem that is
76 seen by both the controller and engines. If you don't have a shared file
72 seen by both the controller and engines. If you don't have a shared file
77 system you will need to use :command:`ipcontrollerz` and
73 system you will need to use :command:`ipcontrollerz` and
78 :command:`ipenginez` directly. This constraint can be relaxed if you are
74 :command:`ipenginez` directly.
79 using the :command:`ssh` method to start the cluster.
80
75
81 Under the hood, :command:`ipclusterz` just uses :command:`ipcontrollerz`
76 Under the hood, :command:`ipclusterz` just uses :command:`ipcontrollerz`
82 and :command:`ipenginez` to perform the steps described above.
77 and :command:`ipenginez` to perform the steps described above.
83
78
84 Using :command:`ipclusterz` in local mode
79 The simplest way to use ipclusterz requires no configuration, and will
85 ----------------------------------------
80 launch a controller and a number of engines on the local machine. For instance,
86
81 to start one controller and 4 engines on localhost, just do::
87 To start one controller and 4 engines on localhost, just do::
88
82
89 $ ipclusterz start -n 4
83 $ ipclusterz start -n 4
90
84
91 To see other command line options for the local mode, do::
85 To see other command line options for the local mode, do::
92
86
93 $ ipclusterz -h
87 $ ipclusterz -h
94
88
95 .. note::
96
89
97 The remainder of this section refers to the 0.10 clusterfile model, no longer in use.
90 Configuring an IPython cluster
98 skip to
91 ==============================
99
92
100 Using :command:`ipclusterz` in mpiexec/mpirun mode
93 Cluster configurations are stored as `profiles`. You can create a new profile with::
101 -------------------------------------------------
94
95 $ ipclusterz create -p myprofile
96
97 This will create the directory :file:`IPYTHONDIR/clusterz_myprofile`, and populate it
98 with the default configuration files for the three IPython cluster commands. Once
99 you edit those files, you can continue to call ipclusterz/ipcontrollerz/ipenginez
100 with no arguments beyond ``-p myprofile``, and any configuration will be maintained.
101
102 There is no limit to the number of profiles you can have, so you can maintain a profile for each
103 of your common use cases. The default profile will be used whenever the
104 profile argument is not specified, so edit :file:`IPYTHONDIR/clusterz_default/*_config.py` to
105 represent your most common use case.
106
107 The configuration files are loaded with commented-out settings and explanations,
108 which should cover most of the available possibilities.
109
110 Using various batch systems with :command:`ipclusterz`
111 ------------------------------------------------------
112
113 :command:`ipclusterz` has a notion of Launchers that can start controllers
114 and engines with various remote execution schemes. Currently supported
115 models include `mpiexec`, PBS-style (Torque, SGE), and Windows HPC Server.
102
116
103 .. note::
117 .. note::
104
118
105 This section is out of date for IPython 0.11
119 The Launchers and configuration are designed in such a way that advanced
120 users can subclass and configure them to fit their own system that we
121 have not yet supported (such as Condor)
122
123 Using :command:`ipclusterz` in mpiexec/mpirun mode
124 --------------------------------------------------
106
125
107
126
108 The mpiexec/mpirun mode is useful if you:
127 The mpiexec/mpirun mode is useful if you:
109
128
110 1. Have MPI installed.
129 1. Have MPI installed.
111 2. Your systems are configured to use the :command:`mpiexec` or
130 2. Your systems are configured to use the :command:`mpiexec` or
112 :command:`mpirun` commands to start MPI processes.
131 :command:`mpirun` commands to start MPI processes.
113
132
114 .. note::
133 If these are satisfied, you can create a new profile::
134
135 $ ipclusterz create -p mpi
136
137 and edit the file :file:`IPYTHONDIR/clusterz_mpi/ipclusterz_config.py`.
115
138
116 The preferred command to use is :command:`mpiexec`. However, we also
139 There, instruct ipclusterz to use the MPIExec launchers by adding the lines:
117 support :command:`mpirun` for backwards compatibility. The underlying
118 logic used is exactly the same, the only difference being the name of the
119 command line program that is called.
120
140
121 If these are satisfied, you can start an IPython cluster using::
141 .. sourcecode:: python
142
143 c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.MPIExecEngineSetLauncher'
144
145 If the default MPI configuration is correct, then you can now start your cluster, with::
122
146
123 $ ipclusterz mpiexec -n 4
147 $ ipclusterz start -n 4 -p mpi
124
148
125 This does the following:
149 This does the following:
126
150
127 1. Starts the IPython controller on current host.
151 1. Starts the IPython controller on current host.
128 2. Uses :command:`mpiexec` to start 4 engines.
152 2. Uses :command:`mpiexec` to start 4 engines.
129
153
154 If you have a reason to also start the Controller with mpi, you can specify:
155
156 .. sourcecode:: python
157
158 c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.MPIExecControllerLauncher'
159
160 .. note::
161
162 The Controller *will not* be in the same MPI universe as the engines, so there is not
163 much reason to do this unless sysadmins demand it.
164
130 On newer MPI implementations (such as OpenMPI), this will work even if you
165 On newer MPI implementations (such as OpenMPI), this will work even if you
131 don't make any calls to MPI or call :func:`MPI_Init`. However, older MPI
166 don't make any calls to MPI or call :func:`MPI_Init`. However, older MPI
132 implementations actually require each process to call :func:`MPI_Init` upon
167 implementations actually require each process to call :func:`MPI_Init` upon
133 starting. The easiest way of having this done is to install the mpi4py
168 starting. The easiest way of having this done is to install the mpi4py
134 [mpi4py]_ package and then call ipclusterz with the ``--mpi`` option::
169 [mpi4py]_ package and then specify the ``c.MPI.use`` option in :file:`ipenginez_config.py`:
170
171 .. sourcecode:: python
135
172
136 $ ipclusterz mpiexec -n 4 --mpi=mpi4py
173 c.MPI.use = 'mpi4py'
137
174
138 Unfortunately, even this won't work for some MPI implementations. If you are
175 Unfortunately, even this won't work for some MPI implementations. If you are
139 having problems with this, you will likely have to use a custom Python
176 having problems with this, you will likely have to use a custom Python
140 executable that itself calls :func:`MPI_Init` at the appropriate time.
177 executable that itself calls :func:`MPI_Init` at the appropriate time.
141 Fortunately, mpi4py comes with such a custom Python executable that is easy to
178 Fortunately, mpi4py comes with such a custom Python executable that is easy to
142 install and use. However, this custom Python executable approach will not work
179 install and use. However, this custom Python executable approach will not work
143 with :command:`ipclusterz` currently.
180 with :command:`ipclusterz` currently.
144
181
145 Additional command line options for this mode can be found by doing::
146
147 $ ipclusterz mpiexec -h
148
149 More details on using MPI with IPython can be found :ref:`here <parallelmpi>`.
182 More details on using MPI with IPython can be found :ref:`here <parallelmpi>`.
150
183
151
184
152 Using :command:`ipclusterz` in PBS mode
185 Using :command:`ipclusterz` in PBS mode
153 --------------------------------------
186 ---------------------------------------
154
187
155 .. note::
188 The PBS mode uses the Portable Batch System [PBS]_ to start the engines.
189
190 As usual, we will start by creating a fresh profile::
191
192 $ ipclusterz create -p pbs
193
194 And in :file:`ipclusterz_config.py`, we will select the PBS launchers for the controller
195 and engines:
156
196
157 This section is out of date for IPython 0.11
197 .. sourcecode:: python
158
198
199 c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.PBSControllerLauncher'
200 c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.PBSEngineSetLauncher'
159
201
160 The PBS mode uses the Portable Batch System [PBS]_ to start the engines. To
202 To use this mode, you first need to create a PBS script template that will be
161 use this mode, you first need to create a PBS script template that will be
162 used to start the engines. Here is a sample PBS script template:
203 used to start the engines. Here is a sample PBS script template:
163
204
164 .. sourcecode:: bash
205 .. sourcecode:: bash
165
206
166 #PBS -N ipython
207 #PBS -N ipython
167 #PBS -j oe
208 #PBS -j oe
168 #PBS -l walltime=00:10:00
209 #PBS -l walltime=00:10:00
169 #PBS -l nodes=${n/4}:ppn=4
210 #PBS -l nodes=${n/4}:ppn=4
170 #PBS -q parallel
211 #PBS -q parallel
171
212
172 cd $$PBS_O_WORKDIR
213 cd $$PBS_O_WORKDIR
173 export PATH=$$HOME/usr/local/bin
214 export PATH=$$HOME/usr/local/bin
174 export PYTHONPATH=$$HOME/usr/local/lib/python2.4/site-packages
215 export PYTHONPATH=$$HOME/usr/local/lib/python2.7/site-packages
175 /usr/local/bin/mpiexec -n ${n} ipengine --logfile=$$PBS_O_WORKDIR/ipengine
216 /usr/local/bin/mpiexec -n ${n} ipenginez --cluster_dir=${cluster_dir}
176
217
177 There are a few important points about this template:
218 There are a few important points about this template:
178
219
179 1. This template will be rendered at runtime using IPython's :mod:`Itpl`
220 1. This template will be rendered at runtime using IPython's :mod:`Itpl`
180 template engine.
221 template engine.
181
222
182 2. Instead of putting in the actual number of engines, use the notation
223 2. Instead of putting in the actual number of engines, use the notation
183 ``${n}`` to indicate the number of engines to be started. You can also uses
224 ``${n}`` to indicate the number of engines to be started. You can also uses
184 expressions like ``${n/4}`` in the template to indicate the number of
225 expressions like ``${n/4}`` in the template to indicate the number of
185 nodes.
226 nodes. There will always be a ${n} and ${cluster_dir} variable passed to the template.
227 These allow the batch system to know how many engines, and where the configuration
228 files reside.
186
229
187 3. Because ``$`` is a special character used by the template engine, you must
230 3. Because ``$`` is a special character used by the template engine, you must
188 escape any ``$`` by using ``$$``. This is important when referring to
231 escape any ``$`` by using ``$$``. This is important when referring to
189 environment variables in the template.
232 environment variables in the template.
190
233
191 4. Any options to :command:`ipenginez` should be given in the batch script
234 4. Any options to :command:`ipenginez` can be given in the batch script
192 template.
235 template, or in :file:`ipenginez_config.py`.
193
236
194 5. Depending on the configuration of you system, you may have to set
237 5. Depending on the configuration of you system, you may have to set
195 environment variables in the script template.
238 environment variables in the script template.
196
239
197 Once you have created such a script, save it with a name like
240 The controller template should be similar, but simpler:
198 :file:`pbs.template`. Now you are ready to start your job::
241
242 .. sourcecode:: bash
243
244 #PBS -N ipython
245 #PBS -j oe
246 #PBS -l walltime=00:10:00
247 #PBS -l nodes=1:ppn=4
248 #PBS -q parallel
199
249
200 $ ipclusterz pbs -n 128 --pbs-script=pbs.template
250 cd $$PBS_O_WORKDIR
251 export PATH=$$HOME/usr/local/bin
252 export PYTHONPATH=$$HOME/usr/local/lib/python2.7/site-packages
253 ipcontrollerz --cluster_dir=${cluster_dir}
201
254
202 Additional command line options for this mode can be found by doing::
203
255
204 $ ipclusterz pbs -h
256 Once you have created these scripts, save them with names like
257 :file:`pbs.engine.template`. Now you can load them into the :file:`ipclusterz_config` with:
205
258
206 Using :command:`ipclusterz` in SSH mode
259 .. sourcecode:: python
207 --------------------------------------
260
261 with open("pbs.engine.template") as f:
262 c.PBSEngineSetLauncher.batch_template = f.read()
263
264 with open("pbs.controller.template") as f:
265 c.PBSControllerLauncher.batch_template = f.read()
266
267
268 Alternately, you can just define the templates as strings inside :file:`ipclusterz_config`.
269
270 Note that assuming you are running PBS on a multi-node cluster, the Controller's default behavior
271 of listening only on localhost is likely too restrictive. In this case, also assuming the
272 nodes are safely behind a firewall, you can simply instruct the Controller to listen for
273 connections on all its interfaces, by adding in :file:`ipcontrollerz_config`:
274
275 .. sourcecode:: python
276
277 c.HubFactory.client_ip = '*'
278 c.HubFactory.engine_ip = '*'
279
280 You can now run the cluster with::
281
282 $ ipclusterz start -p pbs -n 128
283
284 Additional configuration options can be found in the PBS section of :file:`ipclusterz_config`.
208
285
209 .. note::
286 .. note::
210
287
211 This section is out of date for IPython 0.11
288 Due to the flexibility of configuration, the PBS launchers work with simple changes
289 to the template for other :command:`qsub`-using systems, such as Sun Grid Engine,
290 and with further configuration in similar batch systems like Condor.
291
292
293 Using :command:`ipclusterz` in SSH mode
294 ---------------------------------------
212
295
213
296
214 The SSH mode uses :command:`ssh` to execute :command:`ipenginez` on remote
297 The SSH mode uses :command:`ssh` to execute :command:`ipenginez` on remote
215 nodes and the :command:`ipcontrollerz` on localhost.
298 nodes and :command:`ipcontrollerz` can be run remotely as well, or on localhost.
216
299
217 When using this mode it is highly recommended that you have set up SSH keys
300 .. note::
218 and are using ssh-agent [SSH]_ for password-less logins.
219
301
220 To use this mode you need a python file describing the cluster, here is an
302 When using this mode it is highly recommended that you have set up SSH keys
221 example of such a "clusterfile":
303 and are using ssh-agent [SSH]_ for password-less logins.
222
304
223 .. sourcecode:: python
305 As usual, we start by creating a clean profile::
224
225 send_furl = True
226 engines = { 'host1.example.com' : 2,
227 'host2.example.com' : 5,
228 'host3.example.com' : 1,
229 'host4.example.com' : 8 }
230
306
231 Since this is a regular python file usual python syntax applies. Things to
307 $ ipclusterz create -p ssh
232 note:
233
308
234 * The `engines` dict, where the keys is the host we want to run engines on and
309 To use this mode, select the SSH launchers in :file:`ipclusterz_config.py`:
235 the value is the number of engines to run on that host.
236 * send_furl can either be `True` or `False`, if `True` it will copy over the
237 furl needed for :command:`ipenginez` to each host.
238
310
239 The ``--clusterfile`` command line option lets you specify the file to use for
311 .. sourcecode:: python
240 the cluster definition. Once you have your cluster file and you can
241 :command:`ssh` into the remote hosts with out an password you are ready to
242 start your cluster like so:
243
312
244 .. sourcecode:: bash
313 c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.SSHEngineSetLauncher'
314 # and if the Controller is also to be remote:
315 c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.SSHControllerLauncher'
316
245
317
246 $ ipclusterz ssh --clusterfile /path/to/my/clusterfile.py
318 The controller's remote location and configuration can be specified:
247
319
320 .. sourcecode:: python
248
321
249 Two helper shell scripts are used to start and stop :command:`ipenginez` on
322 # Set the user and hostname for the controller
250 remote hosts:
323 # c.SSHControllerLauncher.hostname = 'controller.example.com'
324 # c.SSHControllerLauncher.user = os.environ.get('USER','username')
251
325
252 * sshx.sh
326 # Set the arguments to be passed to ipcontrollerz
253 * engine_killer.sh
327 # note that remotely launched ipcontrollerz will not get the contents of
328 # the local ipcontrollerz_config.py unless it resides on the *remote host*
329 # in the location specified by the --cluster_dir argument.
330 # c.SSHControllerLauncher.program_args = ['-r', '-ip', '0.0.0.0', '--cluster_dir', '/path/to/cd']
254
331
255 Defaults for both of these are contained in the source code for
332 .. note::
256 :command:`ipclusterz`. The default scripts are written to a local file in a
257 tmep directory and then copied to a temp directory on the remote host and
258 executed from there. On most Unix, Linux and OS X systems this is /tmp.
259
333
260 The default sshx.sh is the following:
334 SSH mode does not do any file movement, so you will need to distribute configuration
335 files manually. To aid in this, the `reuse_files` flag defaults to True for ssh-launched
336 Controllers, so you will only need to do this once, unless you override this flag back
337 to False.
261
338
262 .. sourcecode:: bash
339 Engines are specified in a dictionary, by hostname and the number of engines to be run
340 on that host.
263
341
264 #!/bin/sh
342 .. sourcecode:: python
265 "$@" &> /dev/null &
343
266 echo $!
344 c.SSHEngineSetLauncher.engines = { 'host1.example.com' : 2,
345 'host2.example.com' : 5,
346 'host3.example.com' : (1, ['--cluster_dir', '/home/different/location']),
347 'host4.example.com' : 8 }
267
348
268 If you want to use a custom sshx.sh script you need to use the ``--sshx``
349 * The `engines` dict, where the keys are the host we want to run engines on and
269 option and specify the file to use. Using a custom sshx.sh file could be
350 the value is the number of engines to run on that host.
270 helpful when you need to setup the environment on the remote host before
351 * on host3, the value is a tuple, where the number of engines is first, and the arguments
271 executing :command:`ipenginez`.
352 to be passed to :command:`ipenginez` are the second element.
272
353
273 For a detailed options list:
354 For engines without explicitly specified arguments, the default arguments are set in
355 a single location:
274
356
275 .. sourcecode:: bash
357 .. sourcecode:: python
276
358
277 $ ipclusterz ssh -h
359 c.SSHEngineSetLauncher.engine_args = ['--cluster_dir', '/path/to/clusterz_ssh']
278
360
279 Current limitations of the SSH mode of :command:`ipclusterz` are:
361 Current limitations of the SSH mode of :command:`ipclusterz` are:
280
362
281 * Untested on Windows. Would require a working :command:`ssh` on Windows.
363 * Untested on Windows. Would require a working :command:`ssh` on Windows.
282 Also, we are using shell scripts to setup and execute commands on remote
364 Also, we are using shell scripts to setup and execute commands on remote
283 hosts.
365 hosts.
284 * :command:`ipcontrollerz` is started on localhost, with no option to start it
366 * No file movement - configuration files must be distributed to the remote hosts manually.
285 on a remote node.
286
367
287 Using the :command:`ipcontrollerz` and :command:`ipenginez` commands
368 Using the :command:`ipcontrollerz` and :command:`ipenginez` commands
288 ====================================================================
369 ====================================================================
289
370
290 It is also possible to use the :command:`ipcontrollerz` and :command:`ipenginez`
371 It is also possible to use the :command:`ipcontrollerz` and :command:`ipenginez`
291 commands to start your controller and engines. This approach gives you full
372 commands to start your controller and engines. This approach gives you full
292 control over all aspects of the startup process.
373 control over all aspects of the startup process.
293
374
294 Starting the controller and engine on your local machine
375 Starting the controller and engine on your local machine
295 --------------------------------------------------------
376 --------------------------------------------------------
296
377
297 To use :command:`ipcontrollerz` and :command:`ipenginez` to start things on your
378 To use :command:`ipcontrollerz` and :command:`ipenginez` to start things on your
298 local machine, do the following.
379 local machine, do the following.
299
380
300 First start the controller::
381 First start the controller::
301
382
302 $ ipcontrollerz
383 $ ipcontrollerz
303
384
304 Next, start however many instances of the engine you want using (repeatedly)
385 Next, start however many instances of the engine you want using (repeatedly)
305 the command::
386 the command::
306
387
307 $ ipenginez
388 $ ipenginez
308
389
309 The engines should start and automatically connect to the controller using the
390 The engines should start and automatically connect to the controller using the
310 JSON files in :file:`~/.ipython/cluster_<profile>/security`. You are now ready to use the
391 JSON files in :file:`~/.ipython/clusterz_default/security`. You are now ready to use the
311 controller and engines from IPython.
392 controller and engines from IPython.
312
393
313 .. warning::
394 .. warning::
314
395
315 The order of the above operations may be important. You *must*
396 The order of the above operations may be important. You *must*
316 start the controller before the engines, unless you are manually specifying
397 start the controller before the engines, unless you are reusing connection
317 the ports on which to connect, in which case ordering is not important.
398 information (via `-r`), in which case ordering is not important.
318
399
319 .. note::
400 .. note::
320
401
321 On some platforms (OS X), to put the controller and engine into the
402 On some platforms (OS X), to put the controller and engine into the
322 background you may need to give these commands in the form ``(ipcontroller
403 background you may need to give these commands in the form ``(ipcontroller
323 &)`` and ``(ipengine &)`` (with the parentheses) for them to work
404 &)`` and ``(ipengine &)`` (with the parentheses) for them to work
324 properly.
405 properly.
325
406
326 Starting the controller and engines on different hosts
407 Starting the controller and engines on different hosts
327 ------------------------------------------------------
408 ------------------------------------------------------
328
409
329 When the controller and engines are running on different hosts, things are
410 When the controller and engines are running on different hosts, things are
330 slightly more complicated, but the underlying ideas are the same:
411 slightly more complicated, but the underlying ideas are the same:
331
412
332 1. Start the controller on a host using :command:`ipcontrollerz`.
413 1. Start the controller on a host using :command:`ipcontrollerz`.
333 2. Copy :file:`ipcontroller-engine.json` from :file:`~/.ipython/cluster_<profile>/security` on
414 2. Copy :file:`ipcontroller-engine.json` from :file:`~/.ipython/cluster_<profile>/security` on
334 the controller's host to the host where the engines will run.
415 the controller's host to the host where the engines will run.
335 3. Use :command:`ipenginez` on the engine's hosts to start the engines.
416 3. Use :command:`ipenginez` on the engine's hosts to start the engines.
336
417
337 The only thing you have to be careful of is to tell :command:`ipenginez` where
418 The only thing you have to be careful of is to tell :command:`ipenginez` where
338 the :file:`ipcontroller-engine.json` file is located. There are two ways you
419 the :file:`ipcontroller-engine.json` file is located. There are two ways you
339 can do this:
420 can do this:
340
421
341 * Put :file:`ipcontroller-engine.json` in the :file:`~/.ipython/cluster_<profile>/security`
422 * Put :file:`ipcontroller-engine.json` in the :file:`~/.ipython/cluster_<profile>/security`
342 directory on the engine's host, where it will be found automatically.
423 directory on the engine's host, where it will be found automatically.
343 * Call :command:`ipenginez` with the ``--file=full_path_to_the_file``
424 * Call :command:`ipenginez` with the ``--file=full_path_to_the_file``
344 flag.
425 flag.
345
426
346 The ``--file`` flag works like this::
427 The ``--file`` flag works like this::
347
428
348 $ ipengine --file=/path/to/my/ipcontroller-engine.json
429 $ ipengine --file=/path/to/my/ipcontroller-engine.json
349
430
350 .. note::
431 .. note::
351
432
352 If the controller's and engine's hosts all have a shared file system
433 If the controller's and engine's hosts all have a shared file system
353 (:file:`~/.ipython/cluster_<profile>/security` is the same on all of them), then things
434 (:file:`~/.ipython/cluster_<profile>/security` is the same on all of them), then things
354 will just work!
435 will just work!
355
436
356 Make JSON files persistent
437 Make JSON files persistent
357 ---------------------------
438 --------------------------
358
439
359 At first glance it may seem that managing the JSON files is a bit
440 At first glance it may seem that managing the JSON files is a bit
360 annoying. Going back to the house and key analogy, copying the JSON around
441 annoying. Going back to the house and key analogy, copying the JSON around
361 each time you start the controller is like having to make a new key every time
442 each time you start the controller is like having to make a new key every time
362 you want to unlock the door and enter your house. As with your house, you want
443 you want to unlock the door and enter your house. As with your house, you want
363 to be able to create the key (or JSON file) once, and then simply use it at
444 to be able to create the key (or JSON file) once, and then simply use it at
364 any point in the future.
445 any point in the future.
365
446
366 This is possible, but before you do this, you **must** remove any old JSON
447 To do this, the only thing you have to do is specify the `-r` flag, so that
367 files in the :file:`~/.ipython/cluster_<profile>/security` directory.
368
369 .. warning::
370
371 You **must** remove old JSON files before using persistent JSON files.
372
373 Then, the only thing you have to do is specify the registration port, so that
374 the connection information in the JSON files remains accurate::
448 the connection information in the JSON files remains accurate::
375
449
376 $ ipcontrollerz -r --regport 12345
450 $ ipcontrollerz -r --regport 12345
377
451
378
379 Then, just copy the JSON files over the first time and you are set. You can
452 Then, just copy the JSON files over the first time and you are set. You can
380 start and stop the controller and engines as many times as you want in the
453 start and stop the controller and engines as many times as you want in the
381 future, just make sure to tell the controller to use the *same* ports.
454 future, just make sure to tell the controller to reuse the file.
382
455
383 .. note::
456 .. note::
384
457
385 You may ask the question: what ports does the controller listen on if you
458 You may ask the question: what ports does the controller listen on if you
386 don't tell it to use specific ones? The default is to use high random port
459 don't tell it to use specific ones? The default is to use high random port
387 numbers. We do this for two reasons: i) to increase security through
460 numbers. We do this for two reasons: i) to increase security through
388 obscurity and ii) to allow multiple controllers on a given host to start and
461 obscurity and ii) to allow multiple controllers on a given host to start and
389 automatically use different ports.
462 automatically use different ports.
390
463
391 Log files
464 Log files
392 ---------
465 ---------
393
466
394 All of the components of IPython have log files associated with them.
467 All of the components of IPython have log files associated with them.
395 These log files can be extremely useful in debugging problems with
468 These log files can be extremely useful in debugging problems with
396 IPython and can be found in the directory :file:`~/.ipython/cluster_<profile>/log`.
469 IPython and can be found in the directory :file:`~/.ipython/cluster_<profile>/log`.
397 Sending the log files to us will often help us to debug any problems.
470 Sending the log files to us will often help us to debug any problems.
398
471
399
472
400 .. [PBS] Portable Batch System. http://www.openpbs.org/
473 .. [PBS] Portable Batch System. http://www.openpbs.org/
401 .. [SSH] SSH-Agent http://en.wikipedia.org/wiki/Ssh-agent
474 .. [SSH] SSH-Agent http://en.wikipedia.org/wiki/Ssh-agent
402
475
476 Configuring `ipcontrollerz`
477 ---------------------------
478
479 .. note::
480
481 TODO
482
483 Configuring `ipenginez`
484 -----------------------
485
486 .. note::
487
488 TODO
489
403
490
General Comments 0
You need to be logged in to leave comments. Login now