adjustments to PBS/SGE, SSH Launchers + docs update
MinRK -
@@ -1,192 +1,227 b''
1 1 import os
2 2
3 3 c = get_config()
4 4
5 5 #-----------------------------------------------------------------------------
6 6 # Select which launchers to use
7 7 #-----------------------------------------------------------------------------
8 8
9 9 # This allows you to control what method is used to start the controller
10 10 # and engines. The following methods are currently supported:
11 11 # - Start as a regular process on localhost.
12 12 # - Start using mpiexec.
13 13 # - Start using the Windows HPC Server 2008 scheduler
14 14 # - Start using PBS
15 15 # - Start using SSH
16 16
17 17
18 18 # The selected launchers can be configured below.
19 19
20 20 # Options are:
21 21 # - LocalControllerLauncher
22 22 # - MPIExecControllerLauncher
23 23 # - PBSControllerLauncher
24 24 # - WindowsHPCControllerLauncher
25 25 # c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.LocalControllerLauncher'
26 26
27 27 # Options are:
28 28 # - LocalEngineSetLauncher
29 29 # - MPIExecEngineSetLauncher
30 30 # - PBSEngineSetLauncher
31 31 # - WindowsHPCEngineSetLauncher
32 32 # c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.LocalEngineSetLauncher'
33 33
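# For example, a common combination (a sketch, not a requirement) is to keep the
# controller local and submit the engines through PBS:
# c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.LocalControllerLauncher'
# c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.PBSEngineSetLauncher'
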
34 34 #-----------------------------------------------------------------------------
35 35 # Global configuration
36 36 #-----------------------------------------------------------------------------
37 37
38 38 # The default number of engines that will be started. This is overridden by
39 39 # the -n command line option: "ipclusterz start -n 4"
40 40 # c.Global.n = 2
41 41
42 42 # Log to a file in cluster_dir/log, otherwise just log to sys.stdout.
43 43 # c.Global.log_to_file = False
44 44
45 45 # Remove old logs from cluster_dir/log before starting.
46 46 # c.Global.clean_logs = True
47 47
48 48 # The working directory for the process. The application will use os.chdir
49 49 # to change to this directory before starting.
50 50 # c.Global.work_dir = os.getcwd()
51 51
52 52
53 53 #-----------------------------------------------------------------------------
54 54 # Local process launchers
55 55 #-----------------------------------------------------------------------------
56 56
57 57 # The command line arguments to call the controller with.
58 58 # c.LocalControllerLauncher.controller_args = \
59 59 # ['--log-to-file','--log-level', '40']
60 60
61 61 # The working directory for the engines
62 62 # c.LocalEngineSetLauncher.work_dir = u''
63 63
64 64 # Command line arguments passed to the engines.
65 65 # c.LocalEngineSetLauncher.engine_args = ['--log-to-file','--log-level', '40']
66 66
67 67 #-----------------------------------------------------------------------------
68 68 # MPIExec launchers
69 69 #-----------------------------------------------------------------------------
70 70
71 # The mpiexec/mpirun command to use in started the controller.
72 # c.MPIExecControllerLauncher.mpi_cmd = ['mpiexec']
71 # The mpiexec/mpirun command to use in both the controller and engines.
72 # c.MPIExecLauncher.mpi_cmd = ['mpiexec']
73 73
74 74 # Additional arguments to pass to the actual mpiexec command.
75 # c.MPIExecLauncher.mpi_args = []
76
77 # The mpiexec/mpirun command and args can be overridden if they should be different
78 # for controller and engines.
79 # c.MPIExecControllerLauncher.mpi_cmd = ['mpiexec']
75 80 # c.MPIExecControllerLauncher.mpi_args = []
81 # c.MPIExecEngineSetLauncher.mpi_cmd = ['mpiexec']
82 # c.MPIExecEngineSetLauncher.mpi_args = []
76 83
77 84 # The command line arguments to call the controller with.
78 85 # c.MPIExecControllerLauncher.controller_args = \
79 86 # ['--log-to-file','--log-level', '40']
80 87
81
82 # The mpiexec/mpirun command to use in started the controller.
83 # c.MPIExecEngineSetLauncher.mpi_cmd = ['mpiexec']
84
85 # Additional arguments to pass to the actual mpiexec command.
86 # c.MPIExecEngineSetLauncher.mpi_args = []
87
88 88 # Command line arguments passed to the engines.
89 89 # c.MPIExecEngineSetLauncher.engine_args = ['--log-to-file','--log-level', '40']
90 90
91 91 # The default number of engines to start if not given elsewhere.
92 92 # c.MPIExecEngineSetLauncher.n = 1
93 93
94 94 #-----------------------------------------------------------------------------
95 95 # SSH launchers
96 96 #-----------------------------------------------------------------------------
97 97
98 # Todo
98 # ipclusterz can be used to launch controller and engines remotely via ssh.
99 # Note that currently ipclusterz does not do any file distribution, so if
100 # machines are not on a shared filesystem, config and json files must be
101 # distributed. For this reason, reuse_files defaults to True on an
102 # ssh-launched Controller. This flag can be overridden via the program_args
103 # attribute of c.SSHControllerLauncher.
104
105 # set the ssh cmd for launching remote commands. The default is ['ssh']
106 # c.SSHLauncher.ssh_cmd = ['ssh']
107
108 # set the args passed to the ssh cmd. The default is ['-tt']
109 # c.SSHLauncher.ssh_args = ['-tt']
110
111 # Set the user and hostname for the controller
112 # c.SSHControllerLauncher.hostname = 'controller.example.com'
113 # c.SSHControllerLauncher.user = os.environ.get('USER','username')
99 114
115 # Set the arguments to be passed to ipcontrollerz
116 # note that remotely launched ipcontrollerz will not get the contents of
117 # the local ipcontrollerz_config.py unless it resides on the *remote host*
118 # in the location specified by the --cluster_dir argument.
119 # c.SSHControllerLauncher.program_args = ['-r', '-ip', '0.0.0.0', '--cluster_dir', '/path/to/cd']
120
121 # Set the default args passed to ipenginez for SSH launched engines
122 # c.SSHEngineSetLauncher.engine_args = ['--mpi', 'mpi4py']
123
124 # SSH engines are launched as a dict of locations/n-engines.
125 # if a value is a tuple instead of an int, it is assumed to be of the form
126 # (n, [args]), setting the arguments passed to ipenginez on `host`.
127 # otherwise, c.SSHEngineSetLauncher.engine_args will be used as the default.
128
129 # In this case, there will be 3 engines at my.example.com, and
130 # 2 at you@ipython.scipy.org with a special json connector location.
131 # c.SSHEngineSetLauncher.engines = {'my.example.com' : 3,
132 #                'you@ipython.scipy.org' : (2, ['-f', '/path/to/ipcontroller-engine.json'])
133 # }
100 134
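# Putting the SSH options together, a minimal setup might look like the following
# (a sketch reusing the example hostnames above; adjust them to your machines):
# c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.SSHControllerLauncher'
# c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.SSHEngineSetLauncher'
# c.SSHControllerLauncher.hostname = 'controller.example.com'
# c.SSHEngineSetLauncher.engines = {'my.example.com' : 3, 'you@ipython.scipy.org' : 2}
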
101 135 #-----------------------------------------------------------------------------
102 136 # Unix batch (PBS) schedulers launchers
103 137 #-----------------------------------------------------------------------------
104 138
105 139 # The command line program to use to submit a PBS job.
106 # c.PBSControllerLauncher.submit_command = 'qsub'
140 # c.PBSControllerLauncher.submit_command = ['qsub']
107 141
108 142 # The command line program to use to delete a PBS job.
109 # c.PBSControllerLauncher.delete_command = 'qdel'
143 # c.PBSControllerLauncher.delete_command = ['qdel']
110 144
111 145 # A regular expression that takes the output of qsub and find the job id.
112 146 # c.PBSControllerLauncher.job_id_regexp = r'\d+'
113 147
114 148 # The batch submission script used to start the controller. This is where
115 # environment variables would be setup, etc. This string is interpolated using
149 # environment variables would be set up, etc. This string is interpolated using
116 150 # the Itpl module in IPython.external. Basically, you can use ${n} for the
117 151 # number of engines and ${cluster_dir} for the cluster_dir.
118 152 # c.PBSControllerLauncher.batch_template = """
119 # #PBS -l nprocs=$n
153 # #PBS -N ipcontroller
120 154 #
121 155 # ipcontrollerz --cluster-dir $cluster_dir
122 156 # """
123 157
124 158 # The name of the instantiated batch script that will actually be used to
125 159 # submit the job. This will be written to the cluster directory.
126 160 # c.PBSControllerLauncher.batch_file_name = u'pbs_batch_script_controller'
127 161
128 162
129 163 # The command line program to use to submit a PBS job.
130 164 # c.PBSEngineSetLauncher.submit_command = 'qsub'
131 165
132 166 # The command line program to use to delete a PBS job.
133 167 # c.PBSEngineSetLauncher.delete_command = 'qdel'
134 168
135 169 # A regular expression that takes the output of qsub and find the job id.
136 170 # c.PBSEngineSetLauncher.job_id_regexp = r'\d+'
137 171
138 172 # The batch submission script used to start the engines. This is where
139 # environment variables would be setup, etc. This string is interpolated using
173 # environment variables would be set up, etc. This string is interpolated using
140 174 # the Itpl module in IPython.external. Basically, you can use ${n} for the
141 175 # number of engines and ${cluster_dir} for the cluster_dir.
142 176 # c.PBSEngineSetLauncher.batch_template = """
177 # #PBS -N ipengine
143 178 # #PBS -l nprocs=$n
144 179 #
145 180 # ipenginez --cluster-dir $cluster_dir
146 181 # """
147 182
148 183 # The name of the instantiated batch script that will actually be used to
149 184 # submit the job. This will be written to the cluster directory.
150 185 # c.PBSEngineSetLauncher.batch_file_name = u'pbs_batch_script_engines'
151 186
152 187 #-----------------------------------------------------------------------------
153 188 # Windows HPC Server 2008 launcher configuration
154 189 #-----------------------------------------------------------------------------
155 190
156 191 # c.IPControllerJob.job_name = 'IPController'
157 192 # c.IPControllerJob.is_exclusive = False
158 193 # c.IPControllerJob.username = r'USERDOMAIN\USERNAME'
159 194 # c.IPControllerJob.priority = 'Highest'
160 195 # c.IPControllerJob.requested_nodes = ''
161 196 # c.IPControllerJob.project = 'MyProject'
162 197
163 198 # c.IPControllerTask.task_name = 'IPController'
164 199 # c.IPControllerTask.controller_cmd = [u'ipcontroller.exe']
165 200 # c.IPControllerTask.controller_args = ['--log-to-file', '--log-level', '40']
166 201 # c.IPControllerTask.environment_variables = {}
167 202
168 203 # c.WindowsHPCControllerLauncher.scheduler = 'HEADNODE'
169 204 # c.WindowsHPCControllerLauncher.job_file_name = u'ipcontroller_job.xml'
170 205
171 206
172 207 # c.IPEngineSetJob.job_name = 'IPEngineSet'
173 208 # c.IPEngineSetJob.is_exclusive = False
174 209 # c.IPEngineSetJob.username = r'USERDOMAIN\USERNAME'
175 210 # c.IPEngineSetJob.priority = 'Highest'
176 211 # c.IPEngineSetJob.requested_nodes = ''
177 212 # c.IPEngineSetJob.project = 'MyProject'
178 213
179 214 # c.IPEngineTask.task_name = 'IPEngine'
180 215 # c.IPEngineTask.engine_cmd = [u'ipengine.exe']
181 216 # c.IPEngineTask.engine_args = ['--log-to-file', '--log-level', '40']
182 217 # c.IPEngineTask.environment_variables = {}
183 218
184 219 # c.WindowsHPCEngineSetLauncher.scheduler = 'HEADNODE'
185 220 # c.WindowsHPCEngineSetLauncher.job_file_name = u'ipengineset_job.xml'
186 221
187 222
188 223
189 224
190 225
191 226
192 227
@@ -1,18 +1,18 b''
1 1 """The IPython ZMQ-based parallel computing interface."""
2 2 #-----------------------------------------------------------------------------
3 3 # Copyright (C) 2011 The IPython Development Team
4 4 #
5 5 # Distributed under the terms of the BSD License. The full license is in
6 6 # the file COPYING, distributed as part of this software.
7 7 #-----------------------------------------------------------------------------
8 8
9 9 #-----------------------------------------------------------------------------
10 10 # Imports
11 11 #-----------------------------------------------------------------------------
12 12
13 from .asyncresult import *
14 from .client import Client
15 from .dependency import *
16 from .remotefunction import *
17 from .view import *
13 # from .asyncresult import *
14 # from .client import Client
15 # from .dependency import *
16 # from .remotefunction import *
17 # from .view import *
18 18
@@ -1,847 +1,879 b''
1 1 #!/usr/bin/env python
2 2 # encoding: utf-8
3 3 """
4 4 Facilities for launching IPython processes asynchronously.
5 5 """
6 6
7 7 #-----------------------------------------------------------------------------
8 8 # Copyright (C) 2008-2009 The IPython Development Team
9 9 #
10 10 # Distributed under the terms of the BSD License. The full license is in
11 11 # the file COPYING, distributed as part of this software.
12 12 #-----------------------------------------------------------------------------
13 13
14 14 #-----------------------------------------------------------------------------
15 15 # Imports
16 16 #-----------------------------------------------------------------------------
17 17
18 import copy
18 19 import logging
19 20 import os
20 21 import re
21 import sys
22 22
23 23 from signal import SIGINT, SIGTERM
24 24 try:
25 25 from signal import SIGKILL
26 26 except ImportError:
27 27 SIGKILL=SIGTERM
28 28
29 29 from subprocess import Popen, PIPE, STDOUT
30 30 try:
31 31 from subprocess import check_output
32 32 except ImportError:
33 # pre-2.7:
34 from StringIO import StringIO
35
33 # pre-2.7, define check_output with Popen
36 34 def check_output(*args, **kwargs):
37 sio = StringIO()
38 kwargs.update(dict(stdout=PIPE, stderr=STDOUT))
35 kwargs.update(dict(stdout=PIPE))
39 36 p = Popen(*args, **kwargs)
40 37 out,err = p.communicate()
41 38 return out
42 39
43 40 from zmq.eventloop import ioloop
44 41
45 42 from IPython.external import Itpl
46 43 # from IPython.config.configurable import Configurable
47 from IPython.utils.traitlets import Str, Int, List, Unicode, Dict, Instance
44 from IPython.utils.traitlets import Any, Str, Int, List, Unicode, Dict, Instance
48 45 from IPython.utils.path import get_ipython_module_path
49 46 from IPython.utils.process import find_cmd, pycmd2argv, FindCmdError
50 47
51 48 from .factory import LoggingFactory
52 49
53 50 # load winhpcjob from IPython.kernel
54 51 try:
55 52 from IPython.kernel.winhpcjob import (
56 53 IPControllerTask, IPEngineTask,
57 54 IPControllerJob, IPEngineSetJob
58 55 )
59 56 except ImportError:
60 57 pass
61 58
62 59
63 60 #-----------------------------------------------------------------------------
64 61 # Paths to the kernel apps
65 62 #-----------------------------------------------------------------------------
66 63
67 64
68 65 ipcluster_cmd_argv = pycmd2argv(get_ipython_module_path(
69 66 'IPython.zmq.parallel.ipclusterapp'
70 67 ))
71 68
72 69 ipengine_cmd_argv = pycmd2argv(get_ipython_module_path(
73 70 'IPython.zmq.parallel.ipengineapp'
74 71 ))
75 72
76 73 ipcontroller_cmd_argv = pycmd2argv(get_ipython_module_path(
77 74 'IPython.zmq.parallel.ipcontrollerapp'
78 75 ))
79 76
80 77 #-----------------------------------------------------------------------------
81 78 # Base launchers and errors
82 79 #-----------------------------------------------------------------------------
83 80
84 81
85 82 class LauncherError(Exception):
86 83 pass
87 84
88 85
89 86 class ProcessStateError(LauncherError):
90 87 pass
91 88
92 89
93 90 class UnknownStatus(LauncherError):
94 91 pass
95 92
96 93
97 94 class BaseLauncher(LoggingFactory):
98 95 """An asbtraction for starting, stopping and signaling a process."""
99 96
100 97 # In all of the launchers, the work_dir is where child processes will be
101 98 # run. This will usually be the cluster_dir, but may not be. Any work_dir
102 99 # passed into the __init__ method will override the config value.
103 100 # This should not be used to set the work_dir for the actual engine
104 101 # and controller. Instead, use their own config files or the
105 102 # controller_args, engine_args attributes of the launchers to add
106 103 # the --work-dir option.
107 104 work_dir = Unicode(u'.')
108 105 loop = Instance('zmq.eventloop.ioloop.IOLoop')
106
107 start_data = Any()
108 stop_data = Any()
109
109 110 def _loop_default(self):
110 111 return ioloop.IOLoop.instance()
111 112
112 113 def __init__(self, work_dir=u'.', config=None, **kwargs):
113 114 super(BaseLauncher, self).__init__(work_dir=work_dir, config=config, **kwargs)
114 115 self.state = 'before' # can be before, running, after
115 116 self.stop_callbacks = []
116 117 self.start_data = None
117 118 self.stop_data = None
118 119
119 120 @property
120 121 def args(self):
121 122 """A list of cmd and args that will be used to start the process.
122 123
123 124 This is what is passed to :func:`spawnProcess` and the first element
124 125 will be the process name.
125 126 """
126 127 return self.find_args()
127 128
128 129 def find_args(self):
129 130 """The ``.args`` property calls this to find the args list.
130 131
131 132 Subclasses should implement this to construct the cmd and args.
132 133 """
133 134 raise NotImplementedError('find_args must be implemented in a subclass')
134 135
135 136 @property
136 137 def arg_str(self):
137 138 """The string form of the program arguments."""
138 139 return ' '.join(self.args)
139 140
140 141 @property
141 142 def running(self):
142 143 """Am I running."""
143 144 if self.state == 'running':
144 145 return True
145 146 else:
146 147 return False
147 148
148 149 def start(self):
149 150 """Start the process.
150 151
151 152 This must return a deferred that fires with information about the
152 153 process starting (like a pid, job id, etc.).
153 154 """
154 155 raise NotImplementedError('start must be implemented in a subclass')
155 156
156 157 def stop(self):
157 158 """Stop the process and notify observers of stopping.
158 159
159 160 This must return a deferred that fires with information about the
160 161 processing stopping, like errors that occur while the process is
161 162 attempting to be shut down. This deferred won't fire when the process
162 163 actually stops. To observe the actual process stopping, see
163 164 :func:`observe_stop`.
164 165 """
165 166 raise NotImplementedError('stop must be implemented in a subclass')
166 167
167 168 def on_stop(self, f):
168 169 """Get a deferred that will fire when the process stops.
169 170
170 171 The deferred will fire with data that contains information about
171 172 the exit status of the process.
172 173 """
173 174 if self.state=='after':
174 175 return f(self.stop_data)
175 176 else:
176 177 self.stop_callbacks.append(f)
177 178
178 179 def notify_start(self, data):
179 180 """Call this to trigger startup actions.
180 181
181 182 This logs the process startup and sets the state to 'running'. It is
182 183 a pass-through so it can be used as a callback.
183 184 """
184 185
185 186 self.log.info('Process %r started: %r' % (self.args[0], data))
186 187 self.start_data = data
187 188 self.state = 'running'
188 189 return data
189 190
190 191 def notify_stop(self, data):
191 192 """Call this to trigger process stop actions.
192 193
193 194 This logs the process stopping and sets the state to 'after'. Call
194 195 this to trigger all the deferreds from :func:`observe_stop`."""
195 196
196 197 self.log.info('Process %r stopped: %r' % (self.args[0], data))
197 198 self.stop_data = data
198 199 self.state = 'after'
199 200 for i in range(len(self.stop_callbacks)):
200 201 d = self.stop_callbacks.pop()
201 202 d(data)
202 203 return data
203 204
204 205 def signal(self, sig):
205 206 """Signal the process.
206 207
207 208 Return a semi-meaningless deferred after signaling the process.
208 209
209 210 Parameters
210 211 ----------
211 212 sig : str or int
212 213 'KILL', 'INT', etc., or any signal number
213 214 """
214 215 raise NotImplementedError('signal must be implemented in a subclass')
215 216
216 217
217 218 #-----------------------------------------------------------------------------
218 219 # Local process launchers
219 220 #-----------------------------------------------------------------------------
220 221
221 222
222 223 class LocalProcessLauncher(BaseLauncher):
223 224 """Start and stop an external process in an asynchronous manner.
224 225
225 226 This will launch the external process with a working directory of
226 227 ``self.work_dir``.
227 228 """
228 229
229 230 # This is used to construct self.args, which is passed to
230 231 # spawnProcess.
231 232 cmd_and_args = List([])
232 233 poll_frequency = Int(100) # in ms
233 234
234 235 def __init__(self, work_dir=u'.', config=None, **kwargs):
235 236 super(LocalProcessLauncher, self).__init__(
236 237 work_dir=work_dir, config=config, **kwargs
237 238 )
238 239 self.process = None
239 240 self.start_deferred = None
240 241 self.poller = None
241 242
242 243 def find_args(self):
243 244 return self.cmd_and_args
244 245
245 246 def start(self):
246 247 if self.state == 'before':
247 248 self.process = Popen(self.args,
248 249 stdout=PIPE,stderr=PIPE,stdin=PIPE,
249 250 env=os.environ,
250 251 cwd=self.work_dir
251 252 )
252 253
253 254 self.loop.add_handler(self.process.stdout.fileno(), self.handle_stdout, self.loop.READ)
254 255 self.loop.add_handler(self.process.stderr.fileno(), self.handle_stderr, self.loop.READ)
255 256 self.poller = ioloop.PeriodicCallback(self.poll, self.poll_frequency, self.loop)
256 257 self.poller.start()
257 258 self.notify_start(self.process.pid)
258 259 else:
259 260 s = 'The process was already started and has state: %r' % self.state
260 261 raise ProcessStateError(s)
261 262
262 263 def stop(self):
263 264 return self.interrupt_then_kill()
264 265
265 266 def signal(self, sig):
266 267 if self.state == 'running':
267 268 self.process.send_signal(sig)
268 269
269 270 def interrupt_then_kill(self, delay=2.0):
270 271 """Send INT, wait a delay and then send KILL."""
271 272 self.signal(SIGINT)
272 273 self.killer = ioloop.DelayedCallback(lambda : self.signal(SIGKILL), delay*1000, self.loop)
273 274 self.killer.start()
274 275
275 276 # callbacks, etc:
276 277
277 278 def handle_stdout(self, fd, events):
278 279 line = self.process.stdout.readline()
279 280 # a stopped process will be readable but return empty strings
280 281 if line:
281 282 self.log.info(line[:-1])
282 283 else:
283 284 self.poll()
284 285
285 286 def handle_stderr(self, fd, events):
286 287 line = self.process.stderr.readline()
287 288 # a stopped process will be readable but return empty strings
288 289 if line:
289 290 self.log.error(line[:-1])
290 291 else:
291 292 self.poll()
292 293
293 294 def poll(self):
294 295 status = self.process.poll()
295 296 if status is not None:
296 297 self.poller.stop()
297 298 self.loop.remove_handler(self.process.stdout.fileno())
298 299 self.loop.remove_handler(self.process.stderr.fileno())
299 300 self.notify_stop(dict(exit_code=status, pid=self.process.pid))
300 301 return status
301 302
302 303 class LocalControllerLauncher(LocalProcessLauncher):
303 304 """Launch a controller as a regular external process."""
304 305
305 306 controller_cmd = List(ipcontroller_cmd_argv, config=True)
306 307 # Command line arguments to ipcontroller.
307 308 controller_args = List(['--log-to-file','--log-level', str(logging.INFO)], config=True)
308 309
309 310 def find_args(self):
310 311 return self.controller_cmd + self.controller_args
311 312
312 313 def start(self, cluster_dir):
313 314 """Start the controller by cluster_dir."""
314 315 self.controller_args.extend(['--cluster-dir', cluster_dir])
315 316 self.cluster_dir = unicode(cluster_dir)
316 317 self.log.info("Starting LocalControllerLauncher: %r" % self.args)
317 318 return super(LocalControllerLauncher, self).start()
318 319
319 320
320 321 class LocalEngineLauncher(LocalProcessLauncher):
321 322 """Launch a single engine as a regular externall process."""
322 323
323 324 engine_cmd = List(ipengine_cmd_argv, config=True)
324 325 # Command line arguments for ipengine.
325 326 engine_args = List(
326 327 ['--log-to-file','--log-level', str(logging.INFO)], config=True
327 328 )
328 329
329 330 def find_args(self):
330 331 return self.engine_cmd + self.engine_args
331 332
332 333 def start(self, cluster_dir):
333 334 """Start the engine by cluster_dir."""
334 335 self.engine_args.extend(['--cluster-dir', cluster_dir])
335 336 self.cluster_dir = unicode(cluster_dir)
336 337 return super(LocalEngineLauncher, self).start()
337 338
338 339
339 340 class LocalEngineSetLauncher(BaseLauncher):
340 341 """Launch a set of engines as regular external processes."""
341 342
342 343 # Command line arguments for ipengine.
343 344 engine_args = List(
344 345 ['--log-to-file','--log-level', str(logging.INFO)], config=True
345 346 )
346 347 # launcher class
347 348 launcher_class = LocalEngineLauncher
348 349
350 launchers = Dict()
351 stop_data = Dict()
352
349 353 def __init__(self, work_dir=u'.', config=None, **kwargs):
350 354 super(LocalEngineSetLauncher, self).__init__(
351 355 work_dir=work_dir, config=config, **kwargs
352 356 )
353 self.launchers = {}
354 357 self.stop_data = {}
355 358
356 359 def start(self, n, cluster_dir):
357 360 """Start n engines by profile or cluster_dir."""
358 361 self.cluster_dir = unicode(cluster_dir)
359 362 dlist = []
360 363 for i in range(n):
361 364 el = self.launcher_class(work_dir=self.work_dir, config=self.config, logname=self.log.name)
362 365 # Copy the engine args over to each engine launcher.
363 import copy
364 366 el.engine_args = copy.deepcopy(self.engine_args)
365 367 el.on_stop(self._notice_engine_stopped)
366 368 d = el.start(cluster_dir)
367 369 if i==0:
368 370 self.log.info("Starting LocalEngineSetLauncher: %r" % el.args)
369 371 self.launchers[i] = el
370 372 dlist.append(d)
371 373 self.notify_start(dlist)
372 374 # The consumeErrors here could be dangerous
373 375 # dfinal = gatherBoth(dlist, consumeErrors=True)
374 376 # dfinal.addCallback(self.notify_start)
375 377 return dlist
376 378
377 379 def find_args(self):
378 380 return ['engine set']
379 381
380 382 def signal(self, sig):
381 383 dlist = []
382 384 for el in self.launchers.itervalues():
383 385 d = el.signal(sig)
384 386 dlist.append(d)
385 387 # dfinal = gatherBoth(dlist, consumeErrors=True)
386 388 return dlist
387 389
388 390 def interrupt_then_kill(self, delay=1.0):
389 391 dlist = []
390 392 for el in self.launchers.itervalues():
391 393 d = el.interrupt_then_kill(delay)
392 394 dlist.append(d)
393 395 # dfinal = gatherBoth(dlist, consumeErrors=True)
394 396 return dlist
395 397
396 398 def stop(self):
397 399 return self.interrupt_then_kill()
398 400
399 401 def _notice_engine_stopped(self, data):
400 print "notice", data
401 402 pid = data['pid']
402 403 for idx,el in self.launchers.iteritems():
403 404 if el.process.pid == pid:
404 405 break
405 406 self.launchers.pop(idx)
406 407 self.stop_data[idx] = data
407 408 if not self.launchers:
408 409 self.notify_stop(self.stop_data)
409 410
410 411
411 412 #-----------------------------------------------------------------------------
412 413 # MPIExec launchers
413 414 #-----------------------------------------------------------------------------
414 415
415 416
416 417 class MPIExecLauncher(LocalProcessLauncher):
417 418 """Launch an external process using mpiexec."""
418 419
419 420 # The mpiexec command to use in starting the process.
420 421 mpi_cmd = List(['mpiexec'], config=True)
421 422 # The command line arguments to pass to mpiexec.
422 423 mpi_args = List([], config=True)
423 424 # The program to start using mpiexec.
424 425 program = List(['date'], config=True)
425 426 # The command line argument to the program.
426 427 program_args = List([], config=True)
427 428 # The number of instances of the program to start.
428 429 n = Int(1, config=True)
429 430
430 431 def find_args(self):
431 432 """Build self.args using all the fields."""
432 return self.mpi_cmd + ['-n', self.n] + self.mpi_args + \
433 return self.mpi_cmd + ['-n', str(self.n)] + self.mpi_args + \
433 434 self.program + self.program_args
434 435
435 436 def start(self, n):
436 437 """Start n instances of the program using mpiexec."""
437 438 self.n = n
438 439 return super(MPIExecLauncher, self).start()
439 440
440 441
441 442 class MPIExecControllerLauncher(MPIExecLauncher):
442 443 """Launch a controller using mpiexec."""
443 444
444 445 controller_cmd = List(ipcontroller_cmd_argv, config=True)
445 446 # Command line arguments to ipcontroller.
446 447 controller_args = List(['--log-to-file','--log-level', str(logging.INFO)], config=True)
447 448 n = Int(1, config=False)
448 449
449 450 def start(self, cluster_dir):
450 451 """Start the controller by cluster_dir."""
451 452 self.controller_args.extend(['--cluster-dir', cluster_dir])
452 453 self.cluster_dir = unicode(cluster_dir)
453 454 self.log.info("Starting MPIExecControllerLauncher: %r" % self.args)
454 455 return super(MPIExecControllerLauncher, self).start(1)
455 456
456 457 def find_args(self):
457 458 return self.mpi_cmd + ['-n', self.n] + self.mpi_args + \
458 459 self.controller_cmd + self.controller_args
459 460
460 461
461 462 class MPIExecEngineSetLauncher(MPIExecLauncher):
462 463
463 engine_cmd = List(ipengine_cmd_argv, config=True)
464 program = List(ipengine_cmd_argv, config=True)
464 465 # Command line arguments for ipengine.
465 engine_args = List(
466 program_args = List(
466 467 ['--log-to-file','--log-level', str(logging.INFO)], config=True
467 468 )
468 469 n = Int(1, config=True)
469 470
470 471 def start(self, n, cluster_dir):
471 472 """Start n engines by profile or cluster_dir."""
472 self.engine_args.extend(['--cluster-dir', cluster_dir])
473 self.program_args.extend(['--cluster-dir', cluster_dir])
473 474 self.cluster_dir = unicode(cluster_dir)
474 475 self.n = n
475 476 self.log.info('Starting MPIExecEngineSetLauncher: %r' % self.args)
476 477 return super(MPIExecEngineSetLauncher, self).start(n)
477 478
478 def find_args(self):
479 return self.mpi_cmd + ['-n', self.n] + self.mpi_args + \
480 self.engine_cmd + self.engine_args
481
482
483 479 #-----------------------------------------------------------------------------
484 480 # SSH launchers
485 481 #-----------------------------------------------------------------------------
486 482
487 483 # TODO: Get SSH Launcher working again.
488 484
489 485 class SSHLauncher(LocalProcessLauncher):
490 486 """A minimal launcher for ssh.
491 487
492 488 To be useful this will probably have to be extended to use the ``sshx``
493 489 idea for environment variables. There could be other things this needs
494 490 as well.
495 491 """
496 492
497 493 ssh_cmd = List(['ssh'], config=True)
498 494 ssh_args = List(['-tt'], config=True)
499 495 program = List(['date'], config=True)
500 496 program_args = List([], config=True)
501 497 hostname = Str('', config=True)
502 user = Str(os.environ.get('USER','username'), config=True)
498 user = Str('', config=True)
503 499 location = Str('')
504 500
505 501 def _hostname_changed(self, name, old, new):
506 self.location = '%s@%s' % (self.user, new)
502 if self.user:
503 self.location = '%s@%s' % (self.user, new)
504 else:
505 self.location = new
507 506
508 507 def _user_changed(self, name, old, new):
509 508 self.location = '%s@%s' % (new, self.hostname)
510 509
511 510 def find_args(self):
512 511 return self.ssh_cmd + self.ssh_args + [self.location] + \
513 512 self.program + self.program_args
514 513
515 514 def start(self, cluster_dir, hostname=None, user=None):
516 print self.config
515 self.cluster_dir = unicode(cluster_dir)
517 516 if hostname is not None:
518 517 self.hostname = hostname
519 518 if user is not None:
520 519 self.user = user
521 print (self.location, hostname, user)
520
522 521 return super(SSHLauncher, self).start()
523 522
524 523 def signal(self, sig):
525 524 if self.state == 'running':
526 525 # send escaped ssh connection-closer
527 526 self.process.stdin.write('~.')
528 527 self.process.stdin.flush()
529 528
530 529
531 530
532 531 class SSHControllerLauncher(SSHLauncher):
533 532
534 533 program = List(ipcontroller_cmd_argv, config=True)
535 534 # Command line arguments to ipcontroller.
536 program_args = List(['--log-to-file','--log-level', str(logging.INFO)], config=True)
535 program_args = List(['-r', '--log-to-file','--log-level', str(logging.INFO)], config=True)
537 536
538 537
539 538 class SSHEngineLauncher(SSHLauncher):
540 539 program = List(ipengine_cmd_argv, config=True)
541 540 # Command line arguments for ipengine.
542 541 program_args = List(
543 542 ['--log-to-file','--log-level', str(logging.INFO)], config=True
544 543 )
545 544
546 545 class SSHEngineSetLauncher(LocalEngineSetLauncher):
547 546 launcher_class = SSHEngineLauncher
547 engines = Dict(config=True)
548
549 def start(self, n, cluster_dir):
550 """Start engines by profile or cluster_dir.
551 `n` is ignored, and the `engines` config property is used instead.
552 """
553
554 self.cluster_dir = unicode(cluster_dir)
555 dlist = []
556 for host, n in self.engines.iteritems():
557 if isinstance(n, (tuple, list)):
558 n, args = n
559 else:
560 args = copy.deepcopy(self.engine_args)
561
562 if '@' in host:
563 user,host = host.split('@',1)
564 else:
565 user=None
566 for i in range(n):
567 el = self.launcher_class(work_dir=self.work_dir, config=self.config, logname=self.log.name)
568
569 # Copy the engine args over to each engine launcher.
570
571 el.program_args = args
572 el.on_stop(self._notice_engine_stopped)
573 d = el.start(cluster_dir, user=user, hostname=host)
574 if i==0:
575 self.log.info("Starting SSHEngineSetLauncher: %r" % el.args)
576 self.launchers[host+str(i)] = el
577 dlist.append(d)
578 self.notify_start(dlist)
579 return dlist
580
548 581
549 582
550 583 #-----------------------------------------------------------------------------
551 584 # Windows HPC Server 2008 scheduler launchers
552 585 #-----------------------------------------------------------------------------
553 586
554 587
555 588 # This is only used on Windows.
556 589 def find_job_cmd():
557 590 if os.name=='nt':
558 591 try:
559 592 return find_cmd('job')
560 593 except FindCmdError:
561 594 return 'job'
562 595 else:
563 596 return 'job'
564 597
565 598
566 599 class WindowsHPCLauncher(BaseLauncher):
567 600
568 601 # A regular expression used to get the job id from the output of the
569 602 # submit_command.
570 603 job_id_regexp = Str(r'\d+', config=True)
571 604 # The filename of the instantiated job script.
572 605 job_file_name = Unicode(u'ipython_job.xml', config=True)
573 606 # The full path to the instantiated job script. This gets made dynamically
574 607 # by combining the work_dir with the job_file_name.
575 608 job_file = Unicode(u'')
576 609 # The hostname of the scheduler to submit the job to
577 610 scheduler = Str('', config=True)
578 611 job_cmd = Str(find_job_cmd(), config=True)
579 612
580 613 def __init__(self, work_dir=u'.', config=None, **kwargs):
581 614 super(WindowsHPCLauncher, self).__init__(
582 615 work_dir=work_dir, config=config, **kwargs
583 616 )
584 617
585 618 @property
586 619 def job_file(self):
587 620 return os.path.join(self.work_dir, self.job_file_name)
588 621
589 622 def write_job_file(self, n):
590 623 raise NotImplementedError("Implement write_job_file in a subclass.")
591 624
592 625 def find_args(self):
593 626 return ['job.exe']
594 627
595 628 def parse_job_id(self, output):
596 629 """Take the output of the submit command and return the job id."""
597 630 m = re.search(self.job_id_regexp, output)
598 631 if m is not None:
599 632 job_id = m.group()
600 633 else:
601 634 raise LauncherError("Job id couldn't be determined: %s" % output)
602 635 self.job_id = job_id
603 636 self.log.info('Job started with job id: %r' % job_id)
604 637 return job_id
605 638
606 639 def start(self, n):
607 640 """Start n copies of the process using the Win HPC job scheduler."""
608 641 self.write_job_file(n)
609 642 args = [
610 643 'submit',
611 644 '/jobfile:%s' % self.job_file,
612 645 '/scheduler:%s' % self.scheduler
613 646 ]
614 647 self.log.info("Starting Win HPC Job: %s" % (self.job_cmd + ' ' + ' '.join(args),))
615 648 # Twisted will raise DeprecationWarnings if we try to pass unicode to this
616 649 output = check_output([self.job_cmd]+args,
617 650 env=os.environ,
618 651 cwd=self.work_dir,
619 652 stderr=STDOUT
620 653 )
621 654 job_id = self.parse_job_id(output)
622 # self.notify_start(job_id)
655 self.notify_start(job_id)
623 656 return job_id
624 657
625 658 def stop(self):
626 659 args = [
627 660 'cancel',
628 661 self.job_id,
629 662 '/scheduler:%s' % self.scheduler
630 663 ]
631 664 self.log.info("Stopping Win HPC Job: %s" % (self.job_cmd + ' ' + ' '.join(args),))
632 665 try:
633 666 output = check_output([self.job_cmd]+args,
634 667 env=os.environ,
635 668 cwd=self.work_dir,
636 669 stderr=STDOUT
637 670 )
638 671 except:
639 672 output = 'The job already appears to be stopped: %r' % self.job_id
640 self.notify_stop(output) # Pass the output of the kill cmd
673 self.notify_stop(dict(job_id=self.job_id, output=output)) # Pass the output of the kill cmd
641 674 return output
642 675
643 676
644 677 class WindowsHPCControllerLauncher(WindowsHPCLauncher):
645 678
646 679 job_file_name = Unicode(u'ipcontroller_job.xml', config=True)
647 680 extra_args = List([], config=False)
648 681
649 682 def write_job_file(self, n):
650 683 job = IPControllerJob(config=self.config)
651 684
652 685 t = IPControllerTask(config=self.config)
653 686 # The task's work directory is *not* the actual work directory of
654 687 # the controller. It is used as the base path for the stdout/stderr
655 688 # files that the scheduler redirects to.
656 689 t.work_directory = self.cluster_dir
657 690 # Add the --cluster-dir args from self.start().
658 691 t.controller_args.extend(self.extra_args)
659 692 job.add_task(t)
660 693
661 694 self.log.info("Writing job description file: %s" % self.job_file)
662 695 job.write(self.job_file)
663 696
664 697 @property
665 698 def job_file(self):
666 699 return os.path.join(self.cluster_dir, self.job_file_name)
667 700
668 701 def start(self, cluster_dir):
669 702 """Start the controller by cluster_dir."""
670 703 self.extra_args = ['--cluster-dir', cluster_dir]
671 704 self.cluster_dir = unicode(cluster_dir)
672 705 return super(WindowsHPCControllerLauncher, self).start(1)
673 706
674 707
675 708 class WindowsHPCEngineSetLauncher(WindowsHPCLauncher):
676 709
677 710 job_file_name = Unicode(u'ipengineset_job.xml', config=True)
678 711 extra_args = List([], config=False)
679 712
680 713 def write_job_file(self, n):
681 714 job = IPEngineSetJob(config=self.config)
682 715
683 716 for i in range(n):
684 717 t = IPEngineTask(config=self.config)
685 718 # The task's work directory is *not* the actual work directory of
686 719 # the engine. It is used as the base path for the stdout/stderr
687 720 # files that the scheduler redirects to.
688 721 t.work_directory = self.cluster_dir
689 722 # Add the --cluster-dir args from self.start().
690 723 t.engine_args.extend(self.extra_args)
691 724 job.add_task(t)
692 725
693 726 self.log.info("Writing job description file: %s" % self.job_file)
694 727 job.write(self.job_file)
695 728
696 729 @property
697 730 def job_file(self):
698 731 return os.path.join(self.cluster_dir, self.job_file_name)
699 732
700 733 def start(self, n, cluster_dir):
701 734 """Start the controller by cluster_dir."""
702 735 self.extra_args = ['--cluster-dir', cluster_dir]
703 736 self.cluster_dir = unicode(cluster_dir)
704 737 return super(WindowsHPCEngineSetLauncher, self).start(n)
705 738
706 739
707 740 #-----------------------------------------------------------------------------
708 741 # Batch (PBS) system launchers
709 742 #-----------------------------------------------------------------------------
710 743
711 # TODO: Get PBS launcher working again.
712
713 744 class BatchSystemLauncher(BaseLauncher):
714 745 """Launch an external process using a batch system.
715 746
716 747 This class is designed to work with UNIX batch systems like PBS, LSF,
717 748 GridEngine, etc. The overall model is that there are different commands
718 749 like qsub, qdel, etc. that handle the starting and stopping of the process.
719 750
720 751 This class also has the notion of a batch script. The ``batch_template``
721 752 attribute can be set to a string that is a template for the batch script.
722 753 This template is instantiated using Itpl. Thus the template can use
723 754 ${n} for the number of instances. Subclasses can add additional variables
724 755 to the template dict.
725 756 """
726 757
727 758 # Subclasses must fill these in. See PBSEngineSet
728 759 # The name of the command line program used to submit jobs.
729 760 submit_command = Str('', config=True)
730 761 # The name of the command line program used to delete jobs.
731 762 delete_command = Str('', config=True)
732 763 # A regular expression used to get the job id from the output of the
733 764 # submit_command.
734 765 job_id_regexp = Str('', config=True)
735 766 # The string that is the batch script template itself.
736 767 batch_template = Str('', config=True)
737 768 # The filename of the instantiated batch script.
738 769 batch_file_name = Unicode(u'batch_script', config=True)
739 770 # The full path to the instantiated batch script.
740 771 batch_file = Unicode(u'')
741 772 # the format dict used with batch_template:
742 773 context = Dict()
743 774
744 775
745 776 def find_args(self):
746 return [self.submit_command]
777 return [self.submit_command, self.batch_file]
747 778
748 779 def __init__(self, work_dir=u'.', config=None, **kwargs):
749 780 super(BatchSystemLauncher, self).__init__(
750 781 work_dir=work_dir, config=config, **kwargs
751 782 )
752 783 self.batch_file = os.path.join(self.work_dir, self.batch_file_name)
753 784
754 785 def parse_job_id(self, output):
755 786 """Take the output of the submit command and return the job id."""
756 m = re.match(self.job_id_regexp, output)
787 m = re.search(self.job_id_regexp, output)
757 788 if m is not None:
758 789 job_id = m.group()
759 790 else:
760 791 raise LauncherError("Job id couldn't be determined: %s" % output)
761 792 self.job_id = job_id
762 self.log.info('Job started with job id: %r' % job_id)
793 self.log.info('Job submitted with job id: %r' % job_id)
763 794 return job_id
764 795
765 796 def write_batch_script(self, n):
766 797 """Instantiate and write the batch script to the work_dir."""
767 798 self.context['n'] = n
768 799 script_as_string = Itpl.itplns(self.batch_template, self.context)
769 800 self.log.info('Writing instantiated batch script: %s' % self.batch_file)
770 801 f = open(self.batch_file, 'w')
771 802 f.write(script_as_string)
772 803 f.close()
773 804
774 805 def start(self, n, cluster_dir):
775 806 """Start n copies of the process using a batch system."""
776 807 # Here we save cluster_dir in the context so it
777 808 # can be used in the batch script template as
778 809 # ${cluster_dir}
779 810 self.context['cluster_dir'] = cluster_dir
780 811 self.cluster_dir = unicode(cluster_dir)
781 812 self.write_batch_script(n)
782 output = check_output([self.submit_command, self.batch_file], env=os.environ, stdout=STDOUT)
813 output = check_output(self.args, env=os.environ)
814
783 815 job_id = self.parse_job_id(output)
784 # self.notify_start(job_id)
816 self.notify_start(job_id)
785 817 return job_id
786 818
787 819 def stop(self):
788 output = check_output([self.delete_command, self.job_id], env=os.environ, stderr=STDOUT)
789 self.notify_stop(output) # Pass the output of the kill cmd
820 output = check_output([self.delete_command, self.job_id], env=os.environ)
821 self.notify_stop(dict(job_id=self.job_id, output=output)) # Pass the output of the kill cmd
790 822 return output
791 823
792 824
793 825 class PBSLauncher(BatchSystemLauncher):
794 826 """A BatchSystemLauncher subclass for PBS."""
795 827
796 828 submit_command = Str('qsub', config=True)
797 829 delete_command = Str('qdel', config=True)
798 830 job_id_regexp = Str(r'\d+', config=True)
799 831 batch_template = Str('', config=True)
800 832 batch_file_name = Unicode(u'pbs_batch_script', config=True)
801 833 batch_file = Unicode(u'')
802 834
803 835
804 836 class PBSControllerLauncher(PBSLauncher):
805 837 """Launch a controller using PBS."""
806 838
807 839 batch_file_name = Unicode(u'pbs_batch_script_controller', config=True)
808 840
809 841 def start(self, cluster_dir):
810 842 """Start the controller by profile or cluster_dir."""
811 843 self.log.info("Starting PBSControllerLauncher: %r" % self.args)
812 844 return super(PBSControllerLauncher, self).start(1, cluster_dir)
813 845
814 846
815 847 class PBSEngineSetLauncher(PBSLauncher):
816 848
817 849 batch_file_name = Unicode(u'pbs_batch_script_engines', config=True)
818 850
819 851 def start(self, n, cluster_dir):
820 852 """Start n engines by profile or cluster_dir."""
821 853 self.log.info('Starting PBSEngineSetLauncher: %r' % self.args)
822 854 return super(PBSEngineSetLauncher, self).start(n, cluster_dir)
823 855
824 856
825 857 #-----------------------------------------------------------------------------
826 858 # A launcher for ipcluster itself!
827 859 #-----------------------------------------------------------------------------
828 860
829 861
830 862 class IPClusterLauncher(LocalProcessLauncher):
831 863 """Launch the ipcluster program in an external process."""
832 864
833 865 ipcluster_cmd = List(ipcluster_cmd_argv, config=True)
834 866 # Command line arguments to pass to ipcluster.
835 867 ipcluster_args = List(
836 868 ['--clean-logs', '--log-to-file', '--log-level', str(logging.INFO)], config=True)
837 869 ipcluster_subcommand = Str('start')
838 870 ipcluster_n = Int(2)
839 871
840 872 def find_args(self):
841 873 return self.ipcluster_cmd + [self.ipcluster_subcommand] + \
842 874 ['-n', repr(self.ipcluster_n)] + self.ipcluster_args
843 875
844 876 def start(self):
845 877 self.log.info("Starting ipcluster: %r" % self.args)
846 878 return super(IPClusterLauncher, self).start()
847 879
@@ -1,403 +1,490 b''
1 1 .. _parallel_process:
2 2
3 3 ===========================================
4 4 Starting the IPython controller and engines
5 5 ===========================================
6 6
7 7 To use IPython for parallel computing, you need to start one instance of
8 8 the controller and one or more instances of the engine. The controller
9 9 and each engine can run on different machines or on the same machine.
10 10 Because of this, there are many different possibilities.
11 11
12 12 Broadly speaking, there are two ways of going about starting a controller and engines:
13 13
14 14 * In an automated manner using the :command:`ipclusterz` command.
15 15 * In a more manual way using the :command:`ipcontrollerz` and
16 16 :command:`ipenginez` commands.
17 17
18 18 This document describes both of these methods. We recommend that new users
19 19 start with the :command:`ipclusterz` command as it simplifies many common usage
20 20 cases.
21 21
22 22 General considerations
23 23 ======================
24 24
25 25 Before delving into the details about how you can start a controller and
26 26 engines using the various methods, we outline some of the general issues that
27 27 come up when starting the controller and engines. These things come up no
28 28 matter which method you use to start your IPython cluster.
29 29
30 30 Let's say that you want to start the controller on ``host0`` and engines on
31 31 hosts ``host1``-``hostn``. The following steps are then required:
32 32
33 33 1. Start the controller on ``host0`` by running :command:`ipcontrollerz` on
34 34 ``host0``.
35 35 2. Move the JSON file (:file:`ipcontroller-engine.json`) created by the
36 36 controller from ``host0`` to hosts ``host1``-``hostn``.
37 37 3. Start the engines on hosts ``host1``-``hostn`` by running
38 38 :command:`ipenginez`. This command has to be told where the JSON file
39 39 (:file:`ipcontroller-engine.json`) is located.
40 40
41 41 At this point, the controller and engines will be connected. By default, the JSON files
42 42 created by the controller are put into the :file:`~/.ipython/clusterz_default/security`
43 43 directory. If the engines share a filesystem with the controller, step 2 can be skipped as
44 44 the engines will automatically look at that location.
45 45
46 46 The final step required to actually use the running controller from a client is to move
47 47 the JSON file :file:`ipcontroller-client.json` from ``host0`` to any host where clients
48 48 will be run. If this file is put into the :file:`~/.ipython/clusterz_default/security`
49 49 directory of the client's host, it will be found automatically. Otherwise, the full path
50 50 to it has to be passed to the client's constructor.
51 51
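For example, a client on another machine can be pointed at the copied file explicitly
(a minimal sketch; it assumes the :class:`Client` constructor accepts the path to the
JSON connection file, and the path shown is hypothetical):

.. sourcecode:: python

    from IPython.zmq.parallel.client import Client

    # use the copied connection file instead of the default security directory
    c = Client('/path/to/ipcontroller-client.json')
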
52 52 Using :command:`ipclusterz`
53 53 ==========================
54 54
55 55 The :command:`ipclusterz` command provides a simple way of starting a
56 56 controller and engines in the following situations:
57 57
58 58 1. When the controller and engines are all run on localhost. This is useful
59 59 for testing or running on a multicore computer.
60 60 2. When engines are started using the :command:`mpirun` command that comes
61 61 with most MPI [MPI]_ implementations.
62 3. When engines are started using the PBS [PBS]_ batch system.
62 3. When engines are started using the PBS [PBS]_ batch system
63 (or other `qsub` systems, such as SGE).
63 64 4. When the controller is started on localhost and the engines are started on
64 65 remote nodes using :command:`ssh`.
65
66 .. note::
67
68 It is also possible for advanced users to add support to
69 :command:`ipclusterz` for starting controllers and engines using other
70 methods (like Sun's Grid Engine for example).
66 5. When engines are started using the Windows HPC Server batch system.
71 67
72 68 .. note::
73 69
74 70 Currently :command:`ipclusterz` requires that the
75 71 :file:`~/.ipython/clusterz_<profile>/security` directory live on a shared filesystem that is
76 72 seen by both the controller and engines. If you don't have a shared file
77 73 system you will need to use :command:`ipcontrollerz` and
78 :command:`ipenginez` directly. This constraint can be relaxed if you are
79 using the :command:`ssh` method to start the cluster.
74 :command:`ipenginez` directly.
80 75
81 76 Under the hood, :command:`ipclusterz` just uses :command:`ipcontrollerz`
82 77 and :command:`ipenginez` to perform the steps described above.
83 78
84 Using :command:`ipclusterz` in local mode
85 ----------------------------------------
86
87 To start one controller and 4 engines on localhost, just do::
79 The simplest way to use ipclusterz requires no configuration, and will
80 launch a controller and a number of engines on the local machine. For instance,
81 to start one controller and 4 engines on localhost, just do::
88 82
89 83 $ ipclusterz start -n 4
90 84
91 85 To see other command line options for the local mode, do::
92 86
93 87 $ ipclusterz -h
94 88
95 .. note::
96 89
97 The remainder of this section refers to the 0.10 clusterfile model, no longer in use.
98 skip to
90 Configuring an IPython cluster
91 ==============================
99 92
100 Using :command:`ipclusterz` in mpiexec/mpirun mode
101 -------------------------------------------------
93 Cluster configurations are stored as `profiles`. You can create a new profile with::
94
95 $ ipclusterz create -p myprofile
96
97 This will create the directory :file:`IPYTHONDIR/clusterz_myprofile`, and populate it
98 with the default configuration files for the three IPython cluster commands. Once
99 you edit those files, you can continue to call ipclusterz/ipcontrollerz/ipenginez
100 with no arguments beyond ``-p myprofile``, and any configuration will be maintained.
101
102 There is no limit to the number of profiles you can have, so you can maintain a profile for each
103 of your common use cases. The default profile will be used whenever the
104 profile argument is not specified, so edit :file:`IPYTHONDIR/clusterz_default/*_config.py` to
105 represent your most common use case.
106
107 The configuration files are loaded with commented-out settings and explanations,
108 which should cover most of the available possibilities.
109
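For instance, a first edit to :file:`IPYTHONDIR/clusterz_myprofile/ipclusterz_config.py`
might just uncomment a few of the ``Global`` options (a sketch using options shown in the
generated file):

.. sourcecode:: python

    c = get_config()

    # default number of engines, overridden by ``ipclusterz start -n N``
    c.Global.n = 8
    # keep logs in cluster_dir/log rather than printing to stdout
    c.Global.log_to_file = True
    c.Global.clean_logs = True
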
110 Using various batch systems with :command:`ipclusterz`
111 ------------------------------------------------------
112
113 :command:`ipclusterz` has a notion of Launchers that can start controllers
114 and engines with various remote execution schemes. Currently supported
115 models include `mpiexec`, PBS-style (Torque, SGE), and Windows HPC Server.
102 116
103 117 .. note::
104 118
105 This section is out of date for IPython 0.11
119 The Launchers and configuration are designed in such a way that advanced
120 users can subclass and configure them to fit their own system that we
121 have not yet supported (such as Condor)
122
123 Using :command:`ipclusterz` in mpiexec/mpirun mode
124 --------------------------------------------------
106 125
107 126
108 127 The mpiexec/mpirun mode is useful if you:
109 128
110 129 1. Have MPI installed.
111 130 2. Your systems are configured to use the :command:`mpiexec` or
112 131 :command:`mpirun` commands to start MPI processes.
113 132
114 .. note::
133 If these are satisfied, you can create a new profile::
134
135 $ ipclusterz create -p mpi
136
137 and edit the file :file:`IPYTHONDIR/clusterz_mpi/ipclusterz_config.py`.
115 138
116 The preferred command to use is :command:`mpiexec`. However, we also
117 support :command:`mpirun` for backwards compatibility. The underlying
118 logic used is exactly the same, the only difference being the name of the
119 command line program that is called.
139 There, instruct ipclusterz to use the MPIExec launchers by adding the line:
120 140
121 If these are satisfied, you can start an IPython cluster using::
141 .. sourcecode:: python
142
143 c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.MPIExecEngineSetLauncher'
144
145 If the default MPI configuration is correct, then you can now start your cluster, with::
122 146
123 $ ipclusterz mpiexec -n 4
147 $ ipclusterz start -n 4 -p mpi
124 148
125 149 This does the following:
126 150
127 151 1. Starts the IPython controller on current host.
128 152 2. Uses :command:`mpiexec` to start 4 engines.
129 153
154 If you have a reason to also start the Controller with MPI, you can specify:
155
156 .. sourcecode:: python
157
158 c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.MPIExecControllerLauncher'
159
160 .. note::
161
162 The Controller *will not* be in the same MPI universe as the engines, so there is not
163 much reason to do this unless sysadmins demand it.
164
130 165 On newer MPI implementations (such as OpenMPI), this will work even if you
131 166 don't make any calls to MPI or call :func:`MPI_Init`. However, older MPI
132 167 implementations actually require each process to call :func:`MPI_Init` upon
133 168 starting. The easiest way of having this done is to install the mpi4py
134 [mpi4py]_ package and then call ipclusterz with the ``--mpi`` option::
169 [mpi4py]_ package and then specify the ``c.MPI.use`` option in :file:`ipenginez_config.py`:
170
171 .. sourcecode:: python
135 172
136 $ ipclusterz mpiexec -n 4 --mpi=mpi4py
173 c.MPI.use = 'mpi4py'
137 174
138 175 Unfortunately, even this won't work for some MPI implementations. If you are
139 176 having problems with this, you will likely have to use a custom Python
140 177 executable that itself calls :func:`MPI_Init` at the appropriate time.
141 178 Fortunately, mpi4py comes with such a custom Python executable that is easy to
142 179 install and use. However, this custom Python executable approach will not work
143 180 with :command:`ipclusterz` currently.
144 181
145 Additional command line options for this mode can be found by doing::
146
147 $ ipclusterz mpiexec -h
148
149 182 More details on using MPI with IPython can be found :ref:`here <parallelmpi>`.
150 183
151 184
152 185 Using :command:`ipclusterz` in PBS mode
153 --------------------------------------
186 ---------------------------------------
154 187
155 .. note::
188 The PBS mode uses the Portable Batch System [PBS]_ to start the engines.
189
190 As usual, we will start by creating a fresh profile::
191
192 $ ipclusterz create -p pbs
193
194 And in :file:`ipclusterz_config.py`, we will select the PBS launchers for the controller
195 and engines:
156 196
157 This section is out of date for IPython 0.11
197 .. sourcecode:: python
158 198
199 c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.PBSControllerLauncher'
200 c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.PBSEngineSetLauncher'
159 201
160 The PBS mode uses the Portable Batch System [PBS]_ to start the engines. To
161 use this mode, you first need to create a PBS script template that will be
202 To use this mode, you first need to create a PBS script template that will be
162 203 used to start the engines. Here is a sample PBS script template:
163 204
164 205 .. sourcecode:: bash
165 206
166 207 #PBS -N ipython
167 208 #PBS -j oe
168 209 #PBS -l walltime=00:10:00
169 210 #PBS -l nodes=${n/4}:ppn=4
170 211 #PBS -q parallel
171 212
172 213 cd $$PBS_O_WORKDIR
173 214 export PATH=$$HOME/usr/local/bin
174 export PYTHONPATH=$$HOME/usr/local/lib/python2.4/site-packages
175 /usr/local/bin/mpiexec -n ${n} ipengine --logfile=$$PBS_O_WORKDIR/ipengine
215 export PYTHONPATH=$$HOME/usr/local/lib/python2.7/site-packages
216 /usr/local/bin/mpiexec -n ${n} ipenginez --cluster_dir=${cluster_dir}
176 217
177 218 There are a few important points about this template:
178 219
179 220 1. This template will be rendered at runtime using IPython's :mod:`Itpl`
180 221 template engine.
181 222
182 223 2. Instead of putting in the actual number of engines, use the notation
183 224 ``${n}`` to indicate the number of engines to be started. You can also use
184 225 expressions like ``${n/4}`` in the template to indicate the number of
185 nodes.
226 nodes. The ``${n}`` and ``${cluster_dir}`` variables are always passed to the template;
227 they tell the batch script how many engines to start and where the configuration
228 files reside (see the rendering sketch after this list).
186 229
187 230 3. Because ``$`` is a special character used by the template engine, you must
188 231 escape any ``$`` by using ``$$``. This is important when referring to
189 232 environment variables in the template.
190 233
191 4. Any options to :command:`ipenginez` should be given in the batch script
192 template.
234 4. Any options to :command:`ipenginez` can be given in the batch script
235 template, or in :file:`ipenginez_config.py`.
193 236
194 237 5. Depending on the configuration of your system, you may have to set
195 238 environment variables in the script template.
196 239
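As a rough illustration of the substitution described in points 1-3, here is a sketch using :class:`string.Template` as a stand-in for :mod:`Itpl` (the values for ``n`` and ``cluster_dir`` are made up, and Itpl additionally supports expressions such as ``${n/4}``):

.. sourcecode:: python

    from string import Template

    # hypothetical values that would be supplied at launch time
    ns = {'n': 16, 'cluster_dir': '/home/you/.ipython/clusterz_pbs'}

    line = Template('/usr/local/bin/mpiexec -n ${n} ipenginez --cluster_dir=${cluster_dir}')
    print(line.substitute(ns))
    # -> /usr/local/bin/mpiexec -n 16 ipenginez --cluster_dir=/home/you/.ipython/clusterz_pbs

    # the escaped '$$PBS_O_WORKDIR' renders as a literal '$PBS_O_WORKDIR'
    print(Template('cd $$PBS_O_WORKDIR').substitute(ns))
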
197 Once you have created such a script, save it with a name like
198 :file:`pbs.template`. Now you are ready to start your job::
240 The controller template should be similar, but simpler:
241
242 .. sourcecode:: bash
243
244 #PBS -N ipython
245 #PBS -j oe
246 #PBS -l walltime=00:10:00
247 #PBS -l nodes=1:ppn=4
248 #PBS -q parallel
199 249
200 $ ipclusterz pbs -n 128 --pbs-script=pbs.template
250 cd $$PBS_O_WORKDIR
251 export PATH=$$HOME/usr/local/bin
252 export PYTHONPATH=$$HOME/usr/local/lib/python2.7/site-packages
253 ipcontrollerz --cluster_dir=${cluster_dir}
201 254
202 Additional command line options for this mode can be found by doing::
203 255
204 $ ipclusterz pbs -h
256 Once you have created these scripts, save them with names like
257 :file:`pbs.engine.template` and :file:`pbs.controller.template`. Now you can load them into :file:`ipclusterz_config` with:
205 258
206 Using :command:`ipclusterz` in SSH mode
207 --------------------------------------
259 .. sourcecode:: python
260
261 with open("pbs.engine.template") as f:
262     c.PBSEngineSetLauncher.batch_template = f.read()
263 
264 with open("pbs.controller.template") as f:
265     c.PBSControllerLauncher.batch_template = f.read()
266
267
268 Alternatively, you can just define the templates as strings inside :file:`ipclusterz_config`.
269
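For example, the engine template shown above can be inlined directly; this is just a sketch of the same template embedded as a string, to be trimmed or extended to match your site:

.. sourcecode:: python

    # inline equivalent of loading pbs.engine.template from a file
    c.PBSEngineSetLauncher.batch_template = """#PBS -N ipython
    #PBS -l walltime=00:10:00
    #PBS -l nodes=${n/4}:ppn=4
    #PBS -q parallel

    cd $$PBS_O_WORKDIR
    /usr/local/bin/mpiexec -n ${n} ipenginez --cluster_dir=${cluster_dir}
    """
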
270 Note that if you are running PBS on a multi-node cluster, the Controller's default behavior
271 of listening only on localhost is likely too restrictive. In this case, assuming the
272 nodes are safely behind a firewall, you can instruct the Controller to listen for
273 connections on all its interfaces by adding the following to :file:`ipcontrollerz_config`:
274
275 .. sourcecode:: python
276
277 c.HubFactory.client_ip = '*'
278 c.HubFactory.engine_ip = '*'
279
280 You can now run the cluster with::
281
282 $ ipclusterz start -p pbs -n 128
283
284 Additional configuration options can be found in the PBS section of :file:`ipclusterz_config`.
208 285
209 286 .. note::
210 287
211 This section is out of date for IPython 0.11
288 Due to the flexibility of configuration, the PBS launchers work, with simple changes
289 to the template, for other :command:`qsub`-using systems such as Sun Grid Engine
290 (see the sketch below), and with further configuration for similar batch systems like Condor.
291
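For instance, an SGE-flavoured engine template might look roughly like the sketch below. The directives shown are typical SGE options and are assumptions here, since queue and parallel-environment names vary by site. Note that the ``#$`` directive prefix must be written ``#$$`` so that the template engine's ``$``-escaping (point 3 above) leaves a literal ``$`` behind.

.. sourcecode:: python

    # a rough SGE variant, reusing the PBS launcher with a different template
    c.PBSEngineSetLauncher.batch_template = """#$$ -N ipython
    #$$ -j y
    #$$ -l h_rt=00:10:00
    #$$ -q parallel
    #$$ -cwd

    mpiexec -n ${n} ipenginez --cluster_dir=${cluster_dir}
    """
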
292
293 Using :command:`ipclusterz` in SSH mode
294 ---------------------------------------
212 295
213 296
214 297 The SSH mode uses :command:`ssh` to execute :command:`ipenginez` on remote
215 nodes and the :command:`ipcontrollerz` on localhost.
298 nodes; :command:`ipcontrollerz` can be run remotely as well, or on localhost.
216 299
217 When using using this mode it highly recommended that you have set up SSH keys
218 and are using ssh-agent [SSH]_ for password-less logins.
300 .. note::
219 301
220 To use this mode you need a python file describing the cluster, here is an
221 example of such a "clusterfile":
302 When using this mode it is highly recommended that you have set up SSH keys
303 and are using ssh-agent [SSH]_ for password-less logins.
222 304
223 .. sourcecode:: python
224
225 send_furl = True
226 engines = { 'host1.example.com' : 2,
227 'host2.example.com' : 5,
228 'host3.example.com' : 1,
229 'host4.example.com' : 8 }
305 As usual, we start by creating a clean profile::
230 306
231 Since this is a regular python file usual python syntax applies. Things to
232 note:
307 $ ipclusterz create -p ssh
233 308
234 * The `engines` dict, where the keys is the host we want to run engines on and
235 the value is the number of engines to run on that host.
236 * send_furl can either be `True` or `False`, if `True` it will copy over the
237 furl needed for :command:`ipenginez` to each host.
309 To use this mode, select the SSH launchers in :file:`ipclusterz_config.py`:
238 310
239 The ``--clusterfile`` command line option lets you specify the file to use for
240 the cluster definition. Once you have your cluster file and you can
241 :command:`ssh` into the remote hosts with out an password you are ready to
242 start your cluster like so:
311 .. sourcecode:: python
243 312
244 .. sourcecode:: bash
313 c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.SSHEngineSetLauncher'
314 # and if the Controller is also to be remote:
315 c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.SSHControllerLauncher'
316
245 317
246 $ ipclusterz ssh --clusterfile /path/to/my/clusterfile.py
318 The controller's remote location and configuration can be specified:
247 319
320 .. sourcecode:: python
248 321
249 Two helper shell scripts are used to start and stop :command:`ipenginez` on
250 remote hosts:
322 # Set the user and hostname for the controller
323 # c.SSHControllerLauncher.hostname = 'controller.example.com'
324 # c.SSHControllerLauncher.user = os.environ.get('USER','username')
251 325
252 * sshx.sh
253 * engine_killer.sh
326 # Set the arguments to be passed to ipcontrollerz
327 # note that remotely launched ipcontrollerz will not get the contents of
328 # the local ipcontrollerz_config.py unless it resides on the *remote host*
329 # in the location specified by the --cluster_dir argument.
330 # c.SSHControllerLauncher.program_args = ['-r', '-ip', '0.0.0.0', '--cluster_dir', '/path/to/cd']
254 331
255 Defaults for both of these are contained in the source code for
256 :command:`ipclusterz`. The default scripts are written to a local file in a
257 tmep directory and then copied to a temp directory on the remote host and
258 executed from there. On most Unix, Linux and OS X systems this is /tmp.
332 .. note::
259 333
260 The default sshx.sh is the following:
334 SSH mode does not do any file movement, so you will need to distribute configuration
335 files manually. To aid in this, the `reuse_files` flag defaults to True for ssh-launched
336 Controllers, so you will only need to do this once, unless you override this flag back
337 to False.
261 338
262 .. sourcecode:: bash
339 Engines are specified in a dictionary, by hostname and the number of engines to be run
340 on that host.
263 341
264 #!/bin/sh
265 "$@" &> /dev/null &
266 echo $!
342 .. sourcecode:: python
343
344 c.SSHEngineSetLauncher.engines = { 'host1.example.com' : 2,
345 'host2.example.com' : 5,
346 'host3.example.com' : (1, ['--cluster_dir', '/home/different/location']),
347 'host4.example.com' : 8 }
267 348
268 If you want to use a custom sshx.sh script you need to use the ``--sshx``
269 option and specify the file to use. Using a custom sshx.sh file could be
270 helpful when you need to setup the environment on the remote host before
271 executing :command:`ipenginez`.
349 * In the `engines` dict, the keys are the hosts on which we want to run engines, and
350 the values are the number of engines to run on each host.
351 * On host3, the value is a tuple: the number of engines comes first, and the arguments
352 to be passed to :command:`ipenginez` are the second element.
272 353
273 For a detailed options list:
354 For engines without explicitly specified arguments, the default arguments are set in
355 a single location:
274 356
275 .. sourcecode:: bash
357 .. sourcecode:: python
276 358
277 $ ipclusterz ssh -h
359 c.SSHEngineSetLauncher.engine_args = ['--cluster_dir', '/path/to/clusterz_ssh']
278 360
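To make the meaning of these settings concrete, the commands implied by such a configuration are roughly sketched below. This only illustrates the dict semantics (hosts and paths are the example values from above); it is not the launcher's actual implementation.

.. sourcecode:: python

    # rough sketch: one ssh-launched ipenginez per engine, per host
    engines = {'host1.example.com': 2,
               'host3.example.com': (1, ['--cluster_dir', '/home/different/location'])}
    default_args = ['--cluster_dir', '/path/to/clusterz_ssh']

    for host, spec in engines.items():
        n, args = spec if isinstance(spec, tuple) else (spec, default_args)
        for i in range(n):
            print(' '.join(['ssh', host, 'ipenginez'] + args))
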
279 361 Current limitations of the SSH mode of :command:`ipclusterz` are:
280 362
281 363 * Untested on Windows. Would require a working :command:`ssh` on Windows.
282 364 Also, we are using shell scripts to set up and execute commands on remote
283 365 hosts.
284 * :command:`ipcontrollerz` is started on localhost, with no option to start it
285 on a remote node.
366 * No file movement - configuration and connection files must be distributed to remote hosts manually, as noted above.
286 367
287 368 Using the :command:`ipcontrollerz` and :command:`ipenginez` commands
288 369 ====================================================================
289 370
290 371 It is also possible to use the :command:`ipcontrollerz` and :command:`ipenginez`
291 372 commands to start your controller and engines. This approach gives you full
292 373 control over all aspects of the startup process.
293 374
294 375 Starting the controller and engine on your local machine
295 376 --------------------------------------------------------
296 377
297 378 To use :command:`ipcontrollerz` and :command:`ipenginez` to start things on your
298 379 local machine, do the following.
299 380
300 381 First start the controller::
301 382
302 $ ipcontrollerz
303
383 $ ipcontrollerz
384
304 385 Next, start however many instances of the engine you want using (repeatedly)
305 386 the command::
306 387
307 $ ipenginez
388 $ ipenginez
308 389
309 390 The engines should start and automatically connect to the controller using the
310 JSON files in :file:`~/.ipython/cluster_<profile>/security`. You are now ready to use the
391 JSON files in :file:`~/.ipython/clusterz_default/security`. You are now ready to use the
311 392 controller and engines from IPython.
312 393
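If the engines do not connect, a quick sanity check is to verify that the controller actually wrote its connection files. A small sketch, using the default location mentioned above:

.. sourcecode:: python

    import os

    # default security directory for the 'default' cluster profile
    security_dir = os.path.expanduser('~/.ipython/clusterz_default/security')
    # expect ipcontroller-engine.json here once the controller is running
    print(os.listdir(security_dir))
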
313 394 .. warning::
314
315 The order of the above operations may be important. You *must*
316 start the controller before the engines, unless you are manually specifying
317 the ports on which to connect, in which case ordering is not important.
395
396 The order of the above operations may be important. You *must*
397 start the controller before the engines, unless you are reusing connection
398 information (via `-r`), in which case ordering is not important.
318 399
319 400 .. note::
320 401
321 402 On some platforms (OS X), to put the controller and engine into the
322 403 background you may need to give these commands in the form ``(ipcontrollerz
323 404 &)`` and ``(ipenginez &)`` (with the parentheses) for them to work
324 405 properly.
325 406
326 407 Starting the controller and engines on different hosts
327 408 ------------------------------------------------------
328 409
329 410 When the controller and engines are running on different hosts, things are
330 411 slightly more complicated, but the underlying ideas are the same:
331 412
332 413 1. Start the controller on a host using :command:`ipcontrollerz`.
333 414 2. Copy :file:`ipcontroller-engine.json` from :file:`~/.ipython/cluster_<profile>/security` on
334 415 the controller's host to the host where the engines will run.
335 416 3. Use :command:`ipenginez` on the engine's hosts to start the engines.
336 417
337 418 The only thing you have to be careful of is to tell :command:`ipenginez` where
338 419 the :file:`ipcontroller-engine.json` file is located. There are two ways you
339 420 can do this:
340 421
341 422 * Put :file:`ipcontroller-engine.json` in the :file:`~/.ipython/cluster_<profile>/security`
342 423 directory on the engine's host, where it will be found automatically.
343 424 * Call :command:`ipenginez` with the ``--file=full_path_to_the_file``
344 425 flag.
345 426
346 427 The ``--file`` flag works like this::
347 428
348 429 $ ipenginez --file=/path/to/my/ipcontroller-engine.json
349 430
350 431 .. note::
351 432
352 433 If the controller's and engine's hosts all have a shared file system
353 434 (:file:`~/.ipython/cluster_<profile>/security` is the same on all of them), then things
354 435 will just work!
355 436
356 437 Make JSON files persistent
357 ---------------------------
438 --------------------------
358 439
359 440 At first glance it may seem that managing the JSON files is a bit
360 441 annoying. Going back to the house and key analogy, copying the JSON around
361 442 each time you start the controller is like having to make a new key every time
362 443 you want to unlock the door and enter your house. As with your house, you want
363 444 to be able to create the key (or JSON file) once, and then simply use it at
364 445 any point in the future.
365 446
366 This is possible, but before you do this, you **must** remove any old JSON
367 files in the :file:`~/.ipython/cluster_<profile>/security` directory.
368
369 .. warning::
370
371 You **must** remove old JSON files before using persistent JSON files.
372
373 Then, the only thing you have to do is specify the registration port, so that
447 To do this, all you have to do is specify the `-r` flag, so that
374 448 the connection information in the JSON files remains accurate::
375 449
376 450 $ ipcontrollerz -r --regport 12345
377 451
378
379 452 Then, just copy the JSON files over the first time and you are set. You can
380 453 start and stop the controller and engines as many times as you want in the
381 future, just make sure to tell the controller to use the *same* ports.
454 future; just make sure to tell the controller to reuse the files.
382 455
383 456 .. note::
384 457
385 458 You may ask the question: what ports does the controller listen on if you
385 458 don't tell it to use specific ones? The default is to use high random port
387 460 numbers. We do this for two reasons: i) to increase security through
387 460 obscurity and ii) to allow multiple controllers on a given host to start and
389 462 automatically use different ports.
390 463
391 464 Log files
392 465 ---------
393 466
394 467 All of the components of IPython have log files associated with them.
395 468 These log files can be extremely useful in debugging problems with
396 469 IPython and can be found in the directory :file:`~/.ipython/cluster_<profile>/log`.
397 470 Sending the log files to us will often help us to debug any problems.
398 471
399 472
400 473 .. [PBS] Portable Batch System. http://www.openpbs.org/
401 474 .. [SSH] SSH-Agent http://en.wikipedia.org/wiki/Ssh-agent
402 475
476 Configuring `ipcontrollerz`
477 ---------------------------
478
479 .. note::
480
481 TODO
482
483 Configuring `ipenginez`
484 -----------------------
485
486 .. note::
487
488 TODO
489
403 490