##// END OF EJS Templates
Initial merge of ssh cluster from ~vvatsa's ipcluster-dev.
Brian Granger -
Show More
@@ -1,521 +1,658 b''
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 # encoding: utf-8
2 # encoding: utf-8
3
3
4 """Start an IPython cluster = (controller + engines)."""
4 """Start an IPython cluster = (controller + engines)."""
5
5
6 #-----------------------------------------------------------------------------
6 #-----------------------------------------------------------------------------
7 # Copyright (C) 2008 The IPython Development Team
7 # Copyright (C) 2008 The IPython Development Team
8 #
8 #
9 # Distributed under the terms of the BSD License. The full license is in
9 # Distributed under the terms of the BSD License. The full license is in
10 # the file COPYING, distributed as part of this software.
10 # the file COPYING, distributed as part of this software.
11 #-----------------------------------------------------------------------------
11 #-----------------------------------------------------------------------------
12
12
13 #-----------------------------------------------------------------------------
13 #-----------------------------------------------------------------------------
14 # Imports
14 # Imports
15 #-----------------------------------------------------------------------------
15 #-----------------------------------------------------------------------------
16
16
import os
import re
import sys
import signal
import tempfile
pjoin = os.path.join

from twisted.internet import reactor, defer
from twisted.internet.protocol import ProcessProtocol
from twisted.internet.error import ProcessDone, ProcessTerminated
from twisted.internet.utils import getProcessOutput
from twisted.python import failure, log

from IPython.external import argparse
from IPython.external import Itpl
from IPython.genutils import get_ipython_dir, num_cpus
from IPython.kernel.fcutil import have_crypto
from IPython.kernel.error import SecurityError
from IPython.kernel.twistedutil import gatherBoth
from IPython.kernel.util import printer
37
38
38
39
39 #-----------------------------------------------------------------------------
40 #-----------------------------------------------------------------------------
40 # General process handling code
41 # General process handling code
41 #-----------------------------------------------------------------------------
42 #-----------------------------------------------------------------------------
42
43
def find_exe(cmd):
    """Return the full path of *cmd* on win32, trying .exe then .bat.

    Raises ImportError when pywin32 is not available; lets the underlying
    win32api error propagate when neither a .exe nor a .bat is on PATH.
    """
    try:
        import win32api
    except ImportError:
        raise ImportError('you need to have pywin32 installed for this to work')
    else:
        try:
            # Fixed typo: was 'offest'.
            (path, offset) = win32api.SearchPath(os.environ['PATH'], cmd + '.exe')
        except win32api.error:
            # No .exe found on PATH; fall back to a batch file of the same
            # name.  (Was a bare except, which hid unrelated errors.)
            (path, offset) = win32api.SearchPath(os.environ['PATH'], cmd + '.bat')
        return path
54
55
class ProcessStateError(Exception):
    """Raised when an operation is invalid for the process's current state."""
57
58
class UnknownStatus(Exception):
    """Raised when a child's exit status is neither done nor terminated."""
60
61
class LauncherProcessProtocol(ProcessProtocol):
    """
    A ProcessProtocol to go with the ProcessLauncher.
    """

    def __init__(self, process_launcher):
        # The launcher whose deferreds we fire as the child changes state.
        self.process_launcher = process_launcher

    def connectionMade(self):
        # The child is alive; report its pid back to the launcher.
        self.process_launcher.fire_start_deferred(self.transport.pid)

    def processEnded(self, status):
        reason = status.value
        if isinstance(reason, ProcessDone):
            # Clean exit: report a zero exit code.
            self.process_launcher.fire_stop_deferred(0)
        elif isinstance(reason, ProcessTerminated):
            # Abnormal exit: pass along the details.
            self.process_launcher.fire_stop_deferred(
                {'exit_code': reason.exitCode,
                 'signal': reason.signal,
                 'status': reason.status}
            )
        else:
            raise UnknownStatus("unknown exit status, this is probably a bug in Twisted")

    def outReceived(self, data):
        # Forward child stdout to the twisted log.
        log.msg(data)

    def errReceived(self, data):
        # Forward child stderr to the twisted error log.
        log.err(data)
90
91
class ProcessLauncher(object):
    """
    Start and stop an external process in an asynchronous manner.

    Currently this uses deferreds to notify other parties of process state
    changes. This is an awkward design and should be moved to using
    a formal NotificationCenter.
    """

    def __init__(self, cmd_and_args):
        # The executable is the first element; the full list (including the
        # executable itself) is used as argv.
        self.cmd = cmd_and_args[0]
        self.args = cmd_and_args
        self._reset()

    def _reset(self):
        # Fresh bookkeeping for a process that has not been spawned yet.
        self.process_protocol = None
        self.pid = None
        self.start_deferred = None
        self.stop_deferreds = []
        self.state = 'before'  # before, running, or after

    @property
    def running(self):
        # True exactly while the child process is alive.
        return self.state == 'running'

    def fire_start_deferred(self, pid):
        # Called by the protocol once the child is connected.
        self.pid = pid
        self.state = 'running'
        log.msg('Process %r has started with pid=%i' % (self.args, pid))
        self.start_deferred.callback(pid)

    def start(self):
        """Spawn the process; the returned deferred fires with its pid."""
        if self.state != 'before':
            s = 'the process has already been started and has state: %r' % \
                self.state
            return defer.fail(ProcessStateError(s))
        self.process_protocol = LauncherProcessProtocol(self)
        self.start_deferred = defer.Deferred()
        self.process_transport = reactor.spawnProcess(
            self.process_protocol,
            self.cmd,
            self.args,
            env=os.environ
        )
        return self.start_deferred

    def get_stop_deferred(self):
        """Return a deferred that fires when the process exits."""
        if self.state not in ('running', 'before'):
            s = 'this process is already complete'
            return defer.fail(ProcessStateError(s))
        d = defer.Deferred()
        self.stop_deferreds.append(d)
        return d

    def fire_stop_deferred(self, exit_code):
        # Called by the protocol when the child exits; notify all waiters.
        log.msg('Process %r has stopped with %r' % (self.args, exit_code))
        self.state = 'after'
        for d in self.stop_deferreds:
            d.callback(exit_code)

    def signal(self, sig):
        """
        Send a signal to the process.

        The argument sig can be ('KILL','INT', etc.) or any signal number.
        """
        if self.state == 'running':
            self.process_transport.signalProcess(sig)

    def interrupt_then_kill(self, delay=1.0):
        # Ask nicely first, then force-kill after *delay* seconds.
        self.signal('INT')
        reactor.callLater(delay, self.signal, 'KILL')
170
171
171
172
172 #-----------------------------------------------------------------------------
173 #-----------------------------------------------------------------------------
173 # Code for launching controller and engines
174 # Code for launching controller and engines
174 #-----------------------------------------------------------------------------
175 #-----------------------------------------------------------------------------
175
176
176
177
class ControllerLauncher(ProcessLauncher):
    """ProcessLauncher preconfigured to run the ipcontroller script."""

    def __init__(self, extra_args=None):
        self.extra_args = extra_args
        if sys.platform == 'win32':
            # The ipcontroller script doesn't always get installed in the
            # same way or location, so locate it through its module.
            from IPython.kernel.scripts import ipcontroller
            script_location = ipcontroller.__file__.replace('.pyc', '.py')
            # -u turns on unbuffered output, which is required on Win32 to
            # prevent wierd conflict and problems with Twisted
            args = [find_exe('python'), '-u', script_location]
        else:
            args = ['ipcontroller']
        if extra_args is not None:
            args.extend(extra_args)
        ProcessLauncher.__init__(self, args)
195
196
196
197
class EngineLauncher(ProcessLauncher):
    """ProcessLauncher preconfigured to run the ipengine script."""

    def __init__(self, extra_args=None):
        self.extra_args = extra_args
        if sys.platform == 'win32':
            # The ipengine script doesn't always get installed in the same
            # way or location, so locate it through its module.
            from IPython.kernel.scripts import ipengine
            script_location = ipengine.__file__.replace('.pyc', '.py')
            # -u turns on unbuffered output, which is required on Win32 to
            # prevent wierd conflict and problems with Twisted
            args = [find_exe('python'), '-u', script_location]
        else:
            args = ['ipengine']
        if extra_args is not None:
            args.extend(extra_args)
        ProcessLauncher.__init__(self, args)
215
216
216
217
class LocalEngineSet(object):
    """Manage a set of engines started as local child processes."""

    def __init__(self, extra_args=None):
        self.extra_args = extra_args
        self.launchers = []

    def start(self, n):
        """Launch *n* engines; fires with the list of their pids."""
        starts = []
        for _ in range(n):
            launcher = EngineLauncher(extra_args=self.extra_args)
            d = launcher.start()
            self.launchers.append(launcher)
            starts.append(d)
        dfinal = gatherBoth(starts, consumeErrors=True)
        dfinal.addCallback(self._handle_start)
        return dfinal

    def _handle_start(self, r):
        log.msg('Engines started with pids: %r' % r)
        return r

    def _handle_stop(self, r):
        log.msg('Engines received signal: %r' % r)
        return r

    def _stop_all(self, action):
        # Collect each launcher's stop deferred, then apply *action* to it;
        # fires once every engine has exited.
        stops = []
        for launcher in self.launchers:
            stops.append(launcher.get_stop_deferred())
            action(launcher)
        dfinal = gatherBoth(stops, consumeErrors=True)
        dfinal.addCallback(self._handle_stop)
        return dfinal

    def signal(self, sig):
        """Send *sig* to every engine; fires when all have exited."""
        return self._stop_all(lambda launcher: launcher.signal(sig))

    def interrupt_then_kill(self, delay=1.0):
        """SIGINT every engine, then SIGKILL each after *delay* seconds."""
        return self._stop_all(
            lambda launcher: launcher.interrupt_then_kill(delay))
261
262
262
263
class BatchEngineSet(object):
    """Base class for engine sets submitted through a batch queue system.

    Subclasses must fill in the three command/regexp class attributes.
    See PBSEngineSet for an example.
    """

    # Subclasses must fill these in. See PBSEngineSet
    submit_command = ''
    delete_command = ''
    job_id_regexp = ''

    def __init__(self, template_file, **kwargs):
        self.template_file = template_file
        # Extra keyword arguments become template interpolation context.
        self.context = {}
        self.context.update(kwargs)
        self.batch_file = self.template_file+'-run'

    def parse_job_id(self, output):
        """Extract and record the job id from the submit command's output.

        Raises Exception when job_id_regexp does not match *output*.
        """
        m = re.match(self.job_id_regexp, output)
        if m is not None:
            job_id = m.group()
        else:
            raise Exception("job id couldn't be determined: %s" % output)
        self.job_id = job_id
        log.msg('Job started with job id: %r' % job_id)
        return job_id

    def write_batch_script(self, n):
        """Instantiate the batch template for *n* engines and write it out."""
        self.context['n'] = n
        # Close the template handle deterministically (the original relied
        # on the garbage collector to do it).
        f = open(self.template_file, 'r')
        try:
            template = f.read()
        finally:
            f.close()
        log.msg('Using template for batch script: %s' % self.template_file)
        script_as_string = Itpl.itplns(template, self.context)
        log.msg('Writing instantiated batch script: %s' % self.batch_file)
        f = open(self.batch_file,'w')
        try:
            f.write(script_as_string)
        finally:
            f.close()

    def handle_error(self, f):
        # Log the traceback, then re-raise so callers see the failure.
        f.printTraceback()
        f.raiseException()

    def start(self, n):
        """Write the batch script and submit it; fires with the job id."""
        self.write_batch_script(n)
        d = getProcessOutput(self.submit_command,
                             [self.batch_file],env=os.environ)
        d.addCallback(self.parse_job_id)
        d.addErrback(self.handle_error)
        return d

    def kill(self):
        """Ask the batch system to delete the previously submitted job."""
        d = getProcessOutput(self.delete_command,
                             [self.job_id],env=os.environ)
        return d
312
313
class PBSEngineSet(BatchEngineSet):
    """BatchEngineSet that submits engines through PBS (qsub/qdel)."""

    submit_command = 'qsub'
    delete_command = 'qdel'
    # Raw string so the backslash is a regex digit class, not an escape.
    job_id_regexp = r'\d+'

    def __init__(self, template_file, **kwargs):
        BatchEngineSet.__init__(self, template_file, **kwargs)
321
322
323 class SSHEngineSet(object):
324 sshx_template="""#!/bin/sh
325 "$@" &> /dev/null &
326 echo $!"""
327
328 engine_killer_template="""#!/bin/sh
329
330 ps -fu `whoami` | grep ipengine | awk '{print $2}' | xargs kill -TERM"""
331
332 def __init__(self, engine_hosts, sshx=None, ipengine="ipengine"):
333 self.temp_dir = tempfile.gettempdir()
334 if sshx != None:
335 self.sshx = sshx
336 else:
337 self.sshx = os.path.join(self.temp_dir, '%s-main-sshx.sh'%os.environ['USER'])
338 f = open(self.sshx, 'w')
339 f.writelines(self.sshx_template)
340 f.close()
341 self.engine_command = ipengine
342 self.engine_hosts = engine_hosts
343 self.engine_killer = os.path.join(self.temp_dir, '%s-main-engine_killer.sh'%os.environ['USER'])
344 f = open(self.engine_killer, 'w')
345 f.writelines(self.engine_killer_template)
346 f.close()
347
348 def start(self, send_furl=False):
349 for host in self.engine_hosts.keys():
350 count = self.engine_hosts[host]
351 self._start(host, count, send_furl)
352
353 def killall(self):
354 for host in self.engine_hosts.keys():
355 self._killall(host)
356
357 def _start(self, host_name, count=1, send_furl=False):
358
359 def _scp_sshx(d):
360 scp_cmd = "scp %s %s:%s/%s-sshx.sh"%(self.sshx, host_name, self.temp_dir, os.environ['USER'])
361 sshx_scp = scp_cmd.split()
362 print sshx_scp
363 d = getProcessOutput(sshx_scp[0], sshx_scp[1:], env=os.environ)
364 d.addCallback(_exec_engine)
365
366 def _exec_engine(d):
367 exec_engine = "ssh %s sh %s/%s-sshx.sh %s"%(host_name, self.temp_dir, os.environ['USER'], self.engine_command)
368 cmds = exec_engine.split()
369 print cmds
370 for i in range(count):
371 d = getProcessOutput(cmds[0], cmds[1:], env=os.environ)
372
373 if send_furl:
374 scp_cmd = "scp ~/.ipython/security/ipcontroller-engine.furl %s:.ipython/security/"%(host_name)
375 cmd_list = scp_cmd.split()
376 cmd_list[1] = os.path.expanduser(cmd_list[1])
377 print cmd_list
378 d = getProcessOutput(cmd_list[0], cmd_list[1:], env=os.environ)
379 d.addCallback(_scp_sshx)
380 else:
381 _scp_sshx(d=None)
382
383 def _killall(self, host_name):
384 def _exec_err(d):
385 if d.getErrorMessage()[-18:] != "No such process\\n\'":
386 raise d
387
388 def _exec_kill(d):
389 kill_cmd = "ssh %s sh %s/%s-engine_killer.sh"%( host_name, self.temp_dir, os.environ['USER'])
390 kill_cmd = kill_cmd.split()
391 print kill_cmd
392 d = getProcessOutput(kill_cmd[0], kill_cmd[1:], env=os.environ)
393 d.addErrback(_exec_err)
394 scp_cmd = "scp %s %s:%s/%s-engine_killer.sh"%( self.engine_killer, host_name, self.temp_dir, os.environ['USER'])
395 cmds = scp_cmd.split()
396 d = getProcessOutput(cmds[0], cmds[1:], env=os.environ)
397 d.addCallback(_exec_kill)
398 d.addErrback(_exec_err)
399
322
400
323 #-----------------------------------------------------------------------------
401 #-----------------------------------------------------------------------------
324 # Main functions for the different types of clusters
402 # Main functions for the different types of clusters
325 #-----------------------------------------------------------------------------
403 #-----------------------------------------------------------------------------
326
404
327 # TODO:
405 # TODO:
328 # The logic in these codes should be moved into classes like LocalCluster
406 # The logic in these codes should be moved into classes like LocalCluster
329 # MpirunCluster, PBSCluster, etc. This would remove alot of the duplications.
407 # MpirunCluster, PBSCluster, etc. This would remove alot of the duplications.
330 # The main functions should then just parse the command line arguments, create
408 # The main functions should then just parse the command line arguments, create
331 # the appropriate class and call a 'start' method.
409 # the appropriate class and call a 'start' method.
332
410
def check_security(args, cont_args):
    """Validate the requested security mode before starting a cluster.

    Appends '-x'/'-y' to *cont_args* as requested and returns True; if
    security is left on but the crypto stack is missing, logs an error,
    stops the reactor and returns False.
    """
    security_wanted = not (args.x and args.y)
    if security_wanted and not have_crypto:
        log.err("""
OpenSSL/pyOpenSSL is not available, so we can't run in secure mode.
Try running ipcluster with the -xy flags: ipcluster local -xy -n 4""")
        reactor.stop()
        return False
    if args.x:
        cont_args.append('-x')
    if args.y:
        cont_args.append('-y')
    return True
345
423
424
def main_local(args):
    """Start an all-local cluster: one controller plus args.n engines."""
    cont_args = ['--logfile=%s' % pjoin(args.logdir, 'ipcontroller')]

    # Bail out early if the requested security mode cannot be honored.
    if not check_security(args, cont_args):
        return

    controller = ControllerLauncher(extra_args=cont_args)
    dstart = controller.start()

    def start_engines(cont_pid):
        engine_args = ['--logfile=%s' %
                       pjoin(args.logdir, 'ipengine%s-' % cont_pid)]
        eset = LocalEngineSet(extra_args=engine_args)

        def shutdown(signum, frame):
            log.msg('Stopping local cluster')
            # We are still playing with the times here, but these seem
            # to be reliable in allowing everything to exit cleanly.
            eset.interrupt_then_kill(0.5)
            controller.interrupt_then_kill(0.5)
            reactor.callLater(1.0, reactor.stop)

        signal.signal(signal.SIGINT, shutdown)
        return eset.start(args.n)

    def delay_start(cont_pid):
        # The controller doesn't start listening right away and must write
        # the furl files for the engines to pick up.
        reactor.callLater(1.0, start_engines, cont_pid)

    dstart.addCallback(delay_start)
    dstart.addErrback(lambda f: f.raiseException())
378
457
458
def main_mpirun(args):
    """Start a local controller and launch the engines via one mpirun."""
    cont_args = ['--logfile=%s' % pjoin(args.logdir, 'ipcontroller')]

    # Bail out early if the requested security mode cannot be honored.
    if not check_security(args, cont_args):
        return

    controller = ControllerLauncher(extra_args=cont_args)
    dstart = controller.start()

    def start_engines(cont_pid):
        mpi_cmd = ['mpirun', '-n', str(args.n), 'ipengine', '-l',
                   pjoin(args.logdir, 'ipengine%s-' % cont_pid)]
        if args.mpi:
            mpi_cmd.append('--mpi=%s' % args.mpi)
        eset = ProcessLauncher(mpi_cmd)

        def shutdown(signum, frame):
            log.msg('Stopping local cluster')
            # We are still playing with the times here, but these seem
            # to be reliable in allowing everything to exit cleanly.
            eset.interrupt_then_kill(1.0)
            controller.interrupt_then_kill(1.0)
            reactor.callLater(2.0, reactor.stop)

        signal.signal(signal.SIGINT, shutdown)
        return eset.start()

    def delay_start(cont_pid):
        # The controller doesn't start listening right away and must write
        # the furl files for the engines to pick up.
        reactor.callLater(1.0, start_engines, cont_pid)

    dstart.addCallback(delay_start)
    dstart.addErrback(lambda f: f.raiseException())
415
495
496
def main_pbs(args):
    """Start a local controller and submit the engines through PBS."""
    cont_args = ['--logfile=%s' % pjoin(args.logdir, 'ipcontroller')]

    # Bail out early if the requested security mode cannot be honored.
    if not check_security(args, cont_args):
        return

    controller = ControllerLauncher(extra_args=cont_args)
    dstart = controller.start()

    def start_engines(r):
        pbs_set = PBSEngineSet(args.pbsscript)

        def shutdown(signum, frame):
            log.msg('Stopping pbs cluster')
            d = pbs_set.kill()
            d.addBoth(lambda _: controller.interrupt_then_kill(1.0))
            d.addBoth(lambda _: reactor.callLater(2.0, reactor.stop))

        signal.signal(signal.SIGINT, shutdown)
        return pbs_set.start(args.n)

    dstart.addCallback(start_engines)
    dstart.addErrback(lambda f: f.raiseException())
438
519
439
520
# currently the ssh launcher only launches the controller on localhost.
def main_ssh(args):
    """Start a local controller and engines on remote hosts via ssh.

    The clusterfile is a Python file that should define::

        send_furl = False # True, if you want
        engines = {'engine_host1' : engine_count, 'engine_host2' : engine_count2}
    """
    clusterfile = {}
    # NOTE(review): executes arbitrary user-supplied code from the path.
    execfile(args.clusterfile, clusterfile)
    # setdefault replaces the deprecated dict.has_key() check.
    clusterfile.setdefault('send_furl', False)

    cont_args = []
    cont_args.append('--logfile=%s' % pjoin(args.logdir,'ipcontroller'))

    # Use check_security like the other launchers: it appends -x/-y as
    # requested AND refuses to run secure mode without a crypto stack
    # (the previous code appended the flags without that guard).
    if not check_security(args, cont_args):
        return

    cl = ControllerLauncher(extra_args=cont_args)
    dstart = cl.start()
    def start_engines(cont_pid):
        est = SSHEngineSet(clusterfile['engines'], sshx=args.sshx)
        est.start(clusterfile['send_furl'])
        def shutdown(signum, frame):
            est.killall()
            cl.interrupt_then_kill(0.5)
            reactor.callLater(2.0, reactor.stop)
        signal.signal(signal.SIGINT, shutdown)

    def delay_start(cont_pid):
        # Give the controller a moment to begin listening and to write
        # the furl files the engines pick up.
        reactor.callLater(1.0, start_engines, cont_pid)

    dstart.addCallback(delay_start)
    dstart.addErrback(lambda f: f.raiseException())
553
554
440 def get_args():
555 def get_args():
441 base_parser = argparse.ArgumentParser(add_help=False)
556 base_parser = argparse.ArgumentParser(add_help=False)
442 base_parser.add_argument(
557 base_parser.add_argument(
443 '-x',
558 '-x',
444 action='store_true',
559 action='store_true',
445 dest='x',
560 dest='x',
446 help='turn off client security'
561 help='turn off client security'
447 )
562 )
448 base_parser.add_argument(
563 base_parser.add_argument(
449 '-y',
564 '-y',
450 action='store_true',
565 action='store_true',
451 dest='y',
566 dest='y',
452 help='turn off engine security'
567 help='turn off engine security'
453 )
568 )
454 base_parser.add_argument(
569 base_parser.add_argument(
455 "--logdir",
570 "--logdir",
456 type=str,
571 type=str,
457 dest="logdir",
572 dest="logdir",
458 help="directory to put log files (default=$IPYTHONDIR/log)",
573 help="directory to put log files (default=$IPYTHONDIR/log)",
459 default=pjoin(get_ipython_dir(),'log')
574 default=pjoin(get_ipython_dir(),'log')
460 )
575 )
461 base_parser.add_argument(
576 base_parser.add_argument(
462 "-n",
577 "-n",
463 "--num",
578 "--num",
464 type=int,
579 type=int,
465 dest="n",
580 dest="n",
466 default=2,
581 default=2,
467 help="the number of engines to start"
582 help="the number of engines to start"
468 )
583 )
469
584
470 parser = argparse.ArgumentParser(
585 parser = argparse.ArgumentParser(
471 description='IPython cluster startup. This starts a controller and\
586 description='IPython cluster startup. This starts a controller and\
472 engines using various approaches. THIS IS A TECHNOLOGY PREVIEW AND\
587 engines using various approaches. THIS IS A TECHNOLOGY PREVIEW AND\
473 THE API WILL CHANGE SIGNIFICANTLY BEFORE THE FINAL RELEASE.'
588 THE API WILL CHANGE SIGNIFICANTLY BEFORE THE FINAL RELEASE.'
474 )
589 )
475 subparsers = parser.add_subparsers(
590 subparsers = parser.add_subparsers(
476 help='available cluster types. For help, do "ipcluster TYPE --help"')
591 help='available cluster types. For help, do "ipcluster TYPE --help"')
477
592
478 parser_local = subparsers.add_parser(
593 parser_local = subparsers.add_parser(
479 'local',
594 'local',
480 help='run a local cluster',
595 help='run a local cluster',
481 parents=[base_parser]
596 parents=[base_parser]
482 )
597 )
483 parser_local.set_defaults(func=main_local)
598 parser_local.set_defaults(func=main_local)
484
599
485 parser_mpirun = subparsers.add_parser(
600 parser_mpirun = subparsers.add_parser(
486 'mpirun',
601 'mpirun',
487 help='run a cluster using mpirun',
602 help='run a cluster using mpirun',
488 parents=[base_parser]
603 parents=[base_parser]
489 )
604 )
490 parser_mpirun.add_argument(
605 parser_mpirun.add_argument(
491 "--mpi",
606 "--mpi",
492 type=str,
607 type=str,
493 dest="mpi", # Don't put a default here to allow no MPI support
608 dest="mpi", # Don't put a default here to allow no MPI support
494 help="how to call MPI_Init (default=mpi4py)"
609 help="how to call MPI_Init (default=mpi4py)"
495 )
610 )
496 parser_mpirun.set_defaults(func=main_mpirun)
611 parser_mpirun.set_defaults(func=main_mpirun)
497
612
498 parser_pbs = subparsers.add_parser(
613 parser_pbs = subparsers.add_parser(
499 'pbs',
614 'pbs',
500 help='run a pbs cluster',
615 help='run a pbs cluster',
501 parents=[base_parser]
616 parents=[base_parser]
502 )
617 )
503 parser_pbs.add_argument(
618 parser_pbs.add_argument(
504 '--pbs-script',
619 '--pbs-script',
505 type=str,
620 type=str,
506 dest='pbsscript',
621 dest='pbsscript',
507 help='PBS script template',
622 help='PBS script template',
508 default='pbs.template'
623 default='pbs.template'
509 )
624 )
510 parser_pbs.set_defaults(func=main_pbs)
625 parser_pbs.set_defaults(func=main_pbs)
626
627 parser_ssh = subparsers.add_parser(
628 'ssh',
629 help='run a cluster using ssh, should have ssh-keys setup',
630 parents=[base_parser]
631 )
632 parser_ssh.add_argument(
633 '--clusterfile',
634 type=str,
635 dest='clusterfile',
636 help='python file describing the cluster',
637 default='clusterfile.py',
638 )
639 parser_ssh.add_argument(
640 '--sshx',
641 type=str,
642 dest='sshx',
643 help='sshx launcher helper',
644 default='sshx.sh',
645 )
646 parser_ssh.set_defaults(func=main_ssh)
647
511 args = parser.parse_args()
648 args = parser.parse_args()
512 return args
649 return args
513
650
514 def main():
651 def main():
515 args = get_args()
652 args = get_args()
516 reactor.callWhenRunning(args.func, args)
653 reactor.callWhenRunning(args.func, args)
517 log.startLogging(sys.stdout)
654 log.startLogging(sys.stdout)
518 reactor.run()
655 reactor.run()
519
656
520 if __name__ == '__main__':
657 if __name__ == '__main__':
521 main()
658 main()
@@ -1,251 +1,326 b''
1 .. _parallel_process:
1 .. _parallel_process:
2
2
3 ===========================================
3 ===========================================
4 Starting the IPython controller and engines
4 Starting the IPython controller and engines
5 ===========================================
5 ===========================================
6
6
7 To use IPython for parallel computing, you need to start one instance of
7 To use IPython for parallel computing, you need to start one instance of
8 the controller and one or more instances of the engine. The controller
8 the controller and one or more instances of the engine. The controller
9 and each engine can run on different machines or on the same machine.
9 and each engine can run on different machines or on the same machine.
10 Because of this, there are many different possibilities.
10 Because of this, there are many different possibilities.
11
11
12 Broadly speaking, there are two ways of going about starting a controller and engines:
12 Broadly speaking, there are two ways of going about starting a controller and engines:
13
13
14 * In an automated manner using the :command:`ipcluster` command.
14 * In an automated manner using the :command:`ipcluster` command.
15 * In a more manual way using the :command:`ipcontroller` and
15 * In a more manual way using the :command:`ipcontroller` and
16 :command:`ipengine` commands.
16 :command:`ipengine` commands.
17
17
18 This document describes both of these methods. We recommend that new users start with the :command:`ipcluster` command as it simplifies many common usage cases.
18 This document describes both of these methods. We recommend that new users start with the :command:`ipcluster` command as it simplifies many common usage cases.
19
19
20 General considerations
20 General considerations
21 ======================
21 ======================
22
22
23 Before delving into the details about how you can start a controller and engines using the various methods, we outline some of the general issues that come up when starting the controller and engines. These things come up no matter which method you use to start your IPython cluster.
23 Before delving into the details about how you can start a controller and engines using the various methods, we outline some of the general issues that come up when starting the controller and engines. These things come up no matter which method you use to start your IPython cluster.
24
24
25 Let's say that you want to start the controller on ``host0`` and engines on hosts ``host1``-``hostn``. The following steps are then required:
25 Let's say that you want to start the controller on ``host0`` and engines on hosts ``host1``-``hostn``. The following steps are then required:
26
26
27 1. Start the controller on ``host0`` by running :command:`ipcontroller` on
27 1. Start the controller on ``host0`` by running :command:`ipcontroller` on
28 ``host0``.
28 ``host0``.
29 2. Move the FURL file (:file:`ipcontroller-engine.furl`) created by the
29 2. Move the FURL file (:file:`ipcontroller-engine.furl`) created by the
30 controller from ``host0`` to hosts ``host1``-``hostn``.
30 controller from ``host0`` to hosts ``host1``-``hostn``.
31 3. Start the engines on hosts ``host1``-``hostn`` by running
31 3. Start the engines on hosts ``host1``-``hostn`` by running
32 :command:`ipengine`. This command has to be told where the FURL file
32 :command:`ipengine`. This command has to be told where the FURL file
33 (:file:`ipcontroller-engine.furl`) is located.
33 (:file:`ipcontroller-engine.furl`) is located.
34
34
35 At this point, the controller and engines will be connected. By default, the
35 At this point, the controller and engines will be connected. By default, the
36 FURL files created by the controller are put into the
36 FURL files created by the controller are put into the
37 :file:`~/.ipython/security` directory. If the engines share a filesystem with
37 :file:`~/.ipython/security` directory. If the engines share a filesystem with
38 the controller, step 2 can be skipped as the engines will automatically look
38 the controller, step 2 can be skipped as the engines will automatically look
39 at that location.
39 at that location.
40
40
41 The final step required to actually use the running controller from a
41 The final step required to actually use the running controller from a
42 client is to move the FURL files :file:`ipcontroller-mec.furl` and
42 client is to move the FURL files :file:`ipcontroller-mec.furl` and
43 :file:`ipcontroller-tc.furl` from ``host0`` to the host where the clients will
43 :file:`ipcontroller-tc.furl` from ``host0`` to the host where the clients will
44 be run. If these files are put into the :file:`~/.ipython/security` directory of the client's host, they will be found automatically. Otherwise, the full path to them has to be passed to the client's constructor.
44 be run. If these files are put into the :file:`~/.ipython/security` directory of the client's host, they will be found automatically. Otherwise, the full path to them has to be passed to the client's constructor.
45
45
46 Using :command:`ipcluster`
46 Using :command:`ipcluster`
47 ==========================
47 ==========================
48
48
49 The :command:`ipcluster` command provides a simple way of starting a controller and engines in the following situations:
49 The :command:`ipcluster` command provides a simple way of starting a controller and engines in the following situations:
50
50
51 1. When the controller and engines are all run on localhost. This is useful
51 1. When the controller and engines are all run on localhost. This is useful
52 for testing or running on a multicore computer.
52 for testing or running on a multicore computer.
53 2. When engines are started using the :command:`mpirun` command that comes
53 2. When engines are started using the :command:`mpirun` command that comes
54 with most MPI [MPI]_ implementations
54 with most MPI [MPI]_ implementations
55 3. When engines are started using the PBS [PBS]_ batch system.
55 3. When engines are started using the PBS [PBS]_ batch system.
56 4. When the controller is started on localhost and the engines are started on
57 remote nodes using :command:`ssh`.
56
58
57 .. note::
59 .. note::
58
60
59 It is also possible for advanced users to add support to
61 It is also possible for advanced users to add support to
60 :command:`ipcluster` for starting controllers and engines using other
62 :command:`ipcluster` for starting controllers and engines using other
61 methods (like Sun's Grid Engine for example).
63 methods (like Sun's Grid Engine for example).
62
64
63 .. note::
65 .. note::
64
66
65 Currently :command:`ipcluster` requires that the
67 Currently :command:`ipcluster` requires that the
66 :file:`~/.ipython/security` directory live on a shared filesystem that is
68 :file:`~/.ipython/security` directory live on a shared filesystem that is
67 seen by both the controller and engines. If you don't have a shared file
69 seen by both the controller and engines. If you don't have a shared file
68 system you will need to use :command:`ipcontroller` and
70 system you will need to use :command:`ipcontroller` and
69 :command:`ipengine` directly.
71 :command:`ipengine` directly. This constraint can be relaxed if you are
72 using the :command:`ssh` method to start the cluster.
70
73
71 Underneath the hood, :command:`ipcluster` just uses :command:`ipcontroller`
74 Underneath the hood, :command:`ipcluster` just uses :command:`ipcontroller`
72 and :command:`ipengine` to perform the steps described above.
75 and :command:`ipengine` to perform the steps described above.
73
76
74 Using :command:`ipcluster` in local mode
77 Using :command:`ipcluster` in local mode
75 ----------------------------------------
78 ----------------------------------------
76
79
77 To start one controller and 4 engines on localhost, just do::
80 To start one controller and 4 engines on localhost, just do::
78
81
79 $ ipcluster local -n 4
82 $ ipcluster local -n 4
80
83
81 To see other command line options for the local mode, do::
84 To see other command line options for the local mode, do::
82
85
83 $ ipcluster local -h
86 $ ipcluster local -h
84
87
85 Using :command:`ipcluster` in mpirun mode
88 Using :command:`ipcluster` in mpirun mode
86 -----------------------------------------
89 -----------------------------------------
87
90
88 The mpirun mode is useful if you:
91 The mpirun mode is useful if you:
89
92
90 1. Have MPI installed.
93 1. Have MPI installed.
91 2. Your systems are configured to use the :command:`mpirun` command to start
94 2. Your systems are configured to use the :command:`mpirun` command to start
92 processes.
95 processes.
93
96
94 If these are satisfied, you can start an IPython cluster using::
97 If these are satisfied, you can start an IPython cluster using::
95
98
96 $ ipcluster mpirun -n 4
99 $ ipcluster mpirun -n 4
97
100
98 This does the following:
101 This does the following:
99
102
100 1. Starts the IPython controller on current host.
103 1. Starts the IPython controller on current host.
101 2. Uses :command:`mpirun` to start 4 engines.
104 2. Uses :command:`mpirun` to start 4 engines.
102
105
103 On newer MPI implementations (such as OpenMPI), this will work even if you don't make any calls to MPI or call :func:`MPI_Init`. However, older MPI implementations actually require each process to call :func:`MPI_Init` upon starting. The easiest way of having this done is to install the mpi4py [mpi4py]_ package and then call ipcluster with the ``--mpi`` option::
106 On newer MPI implementations (such as OpenMPI), this will work even if you don't make any calls to MPI or call :func:`MPI_Init`. However, older MPI implementations actually require each process to call :func:`MPI_Init` upon starting. The easiest way of having this done is to install the mpi4py [mpi4py]_ package and then call ipcluster with the ``--mpi`` option::
104
107
105 $ ipcluster mpirun -n 4 --mpi=mpi4py
108 $ ipcluster mpirun -n 4 --mpi=mpi4py
106
109
107 Unfortunately, even this won't work for some MPI implementations. If you are having problems with this, you will likely have to use a custom Python executable that itself calls :func:`MPI_Init` at the appropriate time. Fortunately, mpi4py comes with such a custom Python executable that is easy to install and use. However, this custom Python executable approach will not work with :command:`ipcluster` currently.
110 Unfortunately, even this won't work for some MPI implementations. If you are having problems with this, you will likely have to use a custom Python executable that itself calls :func:`MPI_Init` at the appropriate time. Fortunately, mpi4py comes with such a custom Python executable that is easy to install and use. However, this custom Python executable approach will not work with :command:`ipcluster` currently.
108
111
109 Additional command line options for this mode can be found by doing::
112 Additional command line options for this mode can be found by doing::
110
113
111 $ ipcluster mpirun -h
114 $ ipcluster mpirun -h
112
115
113 More details on using MPI with IPython can be found :ref:`here <parallelmpi>`.
116 More details on using MPI with IPython can be found :ref:`here <parallelmpi>`.
114
117
115
118
116 Using :command:`ipcluster` in PBS mode
119 Using :command:`ipcluster` in PBS mode
117 --------------------------------------
120 --------------------------------------
118
121
119 The PBS mode uses the Portable Batch System [PBS]_ to start the engines. To use this mode, you first need to create a PBS script template that will be used to start the engines. Here is a sample PBS script template:
122 The PBS mode uses the Portable Batch System [PBS]_ to start the engines. To use this mode, you first need to create a PBS script template that will be used to start the engines. Here is a sample PBS script template:
120
123
121 .. sourcecode:: bash
124 .. sourcecode:: bash
122
125
123 #PBS -N ipython
126 #PBS -N ipython
124 #PBS -j oe
127 #PBS -j oe
125 #PBS -l walltime=00:10:00
128 #PBS -l walltime=00:10:00
126 #PBS -l nodes=${n/4}:ppn=4
129 #PBS -l nodes=${n/4}:ppn=4
127 #PBS -q parallel
130 #PBS -q parallel
128
131
129 cd $$PBS_O_WORKDIR
132 cd $$PBS_O_WORKDIR
130 export PATH=$$HOME/usr/local/bin
133 export PATH=$$HOME/usr/local/bin
131 export PYTHONPATH=$$HOME/usr/local/lib/python2.4/site-packages
134 export PYTHONPATH=$$HOME/usr/local/lib/python2.4/site-packages
132 /usr/local/bin/mpiexec -n ${n} ipengine --logfile=$$PBS_O_WORKDIR/ipengine
135 /usr/local/bin/mpiexec -n ${n} ipengine --logfile=$$PBS_O_WORKDIR/ipengine
133
136
134 There are a few important points about this template:
137 There are a few important points about this template:
135
138
136 1. This template will be rendered at runtime using IPython's :mod:`Itpl`
139 1. This template will be rendered at runtime using IPython's :mod:`Itpl`
137 template engine.
140 template engine.
138
141
139 2. Instead of putting in the actual number of engines, use the notation
142 2. Instead of putting in the actual number of engines, use the notation
140 ``${n}`` to indicate the number of engines to be started. You can also use
143 ``${n}`` to indicate the number of engines to be started. You can also use
141 expressions like ``${n/4}`` in the template to indicate the number of
144 expressions like ``${n/4}`` in the template to indicate the number of
142 nodes.
145 nodes.
143
146
144 3. Because ``$`` is a special character used by the template engine, you must
147 3. Because ``$`` is a special character used by the template engine, you must
145 escape any ``$`` by using ``$$``. This is important when referring to
148 escape any ``$`` by using ``$$``. This is important when referring to
146 environment variables in the template.
149 environment variables in the template.
147
150
148 4. Any options to :command:`ipengine` should be given in the batch script
151 4. Any options to :command:`ipengine` should be given in the batch script
149 template.
152 template.
150
153
151 5. Depending on the configuration of your system, you may have to set
154 5. Depending on the configuration of your system, you may have to set
152 environment variables in the script template.
155 environment variables in the script template.
153
156
154 Once you have created such a script, save it with a name like :file:`pbs.template`. Now you are ready to start your job::
157 Once you have created such a script, save it with a name like :file:`pbs.template`. Now you are ready to start your job::
155
158
156 $ ipcluster pbs -n 128 --pbs-script=pbs.template
159 $ ipcluster pbs -n 128 --pbs-script=pbs.template
157
160
158 Additional command line options for this mode can be found by doing::
161 Additional command line options for this mode can be found by doing::
159
162
160 $ ipcluster pbs -h
163 $ ipcluster pbs -h
161
164
165 Using :command:`ipcluster` in SSH mode
166 --------------------------------------
167
168 The SSH mode uses :command:`ssh` to execute :command:`ipengine` on remote
169 nodes and the :command:`ipcontroller` on localhost.
170
171 When using this mode it is highly recommended that you have set up SSH keys and are using ssh-agent [SSH]_ for password-less logins.
172
173 To use this mode you need a python file describing the cluster, here is an example of such a "clusterfile":
174
175 .. sourcecode:: python
176
177 send_furl = True
178 engines = { 'host1.example.com' : 2,
179 'host2.example.com' : 5,
180 'host3.example.com' : 1,
181 'host4.example.com' : 8 }
182
183 Since this is a regular python file usual python syntax applies. Things to note:
184
185 * The `engines` dict, where the key is the host we want to run engines on and
186 the value is the number of engines to run on that host.
187 * send_furl can either be `True` or `False`, if `True` it will copy over the
188 furl needed for :command:`ipengine` to each host.
189
190 The ``--clusterfile`` command line option lets you specify the file to use for
191 the cluster definition. Once you have your cluster file and can
192 :command:`ssh` into the remote hosts without a password you are ready to
193 start your cluster like so:
194
195 .. sourcecode:: bash
196
197 $ ipcluster ssh --clusterfile /path/to/my/clusterfile.py
198
199
200 Two helper shell scripts are used to start and stop :command:`ipengine` on remote hosts:
201
202 * sshx.sh
203 * engine_killer.sh
204
205 Both are provided in the :dir:`IPython.kernel.scripts` directory. They are copied to a
206 temp directory on the remote host and executed from there; on most Unix, Linux
207 and OS X systems this is :file:`/tmp`.
208
209 The sshx.sh is as simple as:
210
211 .. sourcecode:: bash
212
213 #!/bin/sh
214 "$@" &> /dev/null &
215 echo $!
216
217 If you want to use a custom sshx.sh script you need to use the ``--sshx``
218 option and specify the file to use. Using a custom sshx.sh file could be
219 helpful when you need to setup the environment on the remote host before
220 executing :command:`ipengine`.
221
222 For a detailed options list:
223
224 .. sourcecode:: bash
225
226 $ ipcluster ssh -h
227
228 Current limitations of the SSH mode of :command:`ipcluster` are:
229
230 * Untested on Windows. Would require a working :command:`ssh` on Windows.
231 Also, we are using shell scripts to setup and execute commands on remote
232 hosts.
233 * :command:`ipcontroller` is started on localhost, with no option to start it
234 on a remote node also.
235
162 Using the :command:`ipcontroller` and :command:`ipengine` commands
236 Using the :command:`ipcontroller` and :command:`ipengine` commands
163 ==================================================================
237 ==================================================================
164
238
165 It is also possible to use the :command:`ipcontroller` and :command:`ipengine` commands to start your controller and engines. This approach gives you full control over all aspects of the startup process.
239 It is also possible to use the :command:`ipcontroller` and :command:`ipengine` commands to start your controller and engines. This approach gives you full control over all aspects of the startup process.
166
240
167 Starting the controller and engine on your local machine
241 Starting the controller and engine on your local machine
168 --------------------------------------------------------
242 --------------------------------------------------------
169
243
170 To use :command:`ipcontroller` and :command:`ipengine` to start things on your
244 To use :command:`ipcontroller` and :command:`ipengine` to start things on your
171 local machine, do the following.
245 local machine, do the following.
172
246
173 First start the controller::
247 First start the controller::
174
248
175 $ ipcontroller
249 $ ipcontroller
176
250
177 Next, start however many instances of the engine you want using (repeatedly) the command::
251 Next, start however many instances of the engine you want using (repeatedly) the command::
178
252
179 $ ipengine
253 $ ipengine
180
254
181 The engines should start and automatically connect to the controller using the FURL files in :file:`~/.ipython/security`. You are now ready to use the controller and engines from IPython.
255 The engines should start and automatically connect to the controller using the FURL files in :file:`~/.ipython/security`. You are now ready to use the controller and engines from IPython.
182
256
183 .. warning::
257 .. warning::
184
258
185 The order of the above operations is very important. You *must*
259 The order of the above operations is very important. You *must*
186 start the controller before the engines, since the engines connect
260 start the controller before the engines, since the engines connect
187 to the controller as they get started.
261 to the controller as they get started.
188
262
189 .. note::
263 .. note::
190
264
191 On some platforms (OS X), to put the controller and engine into the
265 On some platforms (OS X), to put the controller and engine into the
192 background you may need to give these commands in the form ``(ipcontroller
266 background you may need to give these commands in the form ``(ipcontroller
193 &)`` and ``(ipengine &)`` (with the parentheses) for them to work
267 &)`` and ``(ipengine &)`` (with the parentheses) for them to work
194 properly.
268 properly.
195
269
196 Starting the controller and engines on different hosts
270 Starting the controller and engines on different hosts
197 ------------------------------------------------------
271 ------------------------------------------------------
198
272
199 When the controller and engines are running on different hosts, things are
273 When the controller and engines are running on different hosts, things are
200 slightly more complicated, but the underlying ideas are the same:
274 slightly more complicated, but the underlying ideas are the same:
201
275
202 1. Start the controller on a host using :command:`ipcontroller`.
276 1. Start the controller on a host using :command:`ipcontroller`.
203 2. Copy :file:`ipcontroller-engine.furl` from :file:`~/.ipython/security` on the controller's host to the host where the engines will run.
277 2. Copy :file:`ipcontroller-engine.furl` from :file:`~/.ipython/security` on the controller's host to the host where the engines will run.
204 3. Use :command:`ipengine` on the engine's hosts to start the engines.
278 3. Use :command:`ipengine` on the engine's hosts to start the engines.
205
279
206 The only thing you have to be careful of is to tell :command:`ipengine` where the :file:`ipcontroller-engine.furl` file is located. There are two ways you can do this:
280 The only thing you have to be careful of is to tell :command:`ipengine` where the :file:`ipcontroller-engine.furl` file is located. There are two ways you can do this:
207
281
208 * Put :file:`ipcontroller-engine.furl` in the :file:`~/.ipython/security`
282 * Put :file:`ipcontroller-engine.furl` in the :file:`~/.ipython/security`
209 directory on the engine's host, where it will be found automatically.
283 directory on the engine's host, where it will be found automatically.
210 * Call :command:`ipengine` with the ``--furl-file=full_path_to_the_file``
284 * Call :command:`ipengine` with the ``--furl-file=full_path_to_the_file``
211 flag.
285 flag.
212
286
213 The ``--furl-file`` flag works like this::
287 The ``--furl-file`` flag works like this::
214
288
215 $ ipengine --furl-file=/path/to/my/ipcontroller-engine.furl
289 $ ipengine --furl-file=/path/to/my/ipcontroller-engine.furl
216
290
217 .. note::
291 .. note::
218
292
219 If the controller's and engine's hosts all have a shared file system
293 If the controller's and engine's hosts all have a shared file system
220 (:file:`~/.ipython/security` is the same on all of them), then things
294 (:file:`~/.ipython/security` is the same on all of them), then things
221 will just work!
295 will just work!
222
296
223 Make FURL files persistent
297 Make FURL files persistent
224 ---------------------------
298 ---------------------------
225
299
226 At first glance it may seem that managing the FURL files is a bit annoying. Going back to the house and key analogy, copying the FURL around each time you start the controller is like having to make a new key every time you want to unlock the door and enter your house. As with your house, you want to be able to create the key (or FURL file) once, and then simply use it at any point in the future.
300 At first glance it may seem that managing the FURL files is a bit annoying. Going back to the house and key analogy, copying the FURL around each time you start the controller is like having to make a new key every time you want to unlock the door and enter your house. As with your house, you want to be able to create the key (or FURL file) once, and then simply use it at any point in the future.
227
301
228 This is possible. The only thing you have to do is decide what ports the controller will listen on for the engines and clients. This is done as follows::
302 This is possible. The only thing you have to do is decide what ports the controller will listen on for the engines and clients. This is done as follows::
229
303
230 $ ipcontroller -r --client-port=10101 --engine-port=10102
304 $ ipcontroller -r --client-port=10101 --engine-port=10102
231
305
232 Then, just copy the furl files over the first time and you are set. You can start and stop the controller and engines as many times as you want in the future, just make sure to tell the controller to use the *same* ports.
306 Then, just copy the furl files over the first time and you are set. You can start and stop the controller and engines as many times as you want in the future, just make sure to tell the controller to use the *same* ports.
233
307
234 .. note::
308 .. note::
235
309
236 You may ask the question: what ports does the controller listen on if you
310 You may ask the question: what ports does the controller listen on if you
237 don't tell it to use specific ones? The default is to use high random port
311 don't tell it to use specific ones? The default is to use high random port
238 numbers. We do this for two reasons: i) to increase security through
312 numbers. We do this for two reasons: i) to increase security through
239 obscurity and ii) to allow multiple controllers on a given host to start and
313 obscurity and ii) to allow multiple controllers on a given host to start and
240 automatically use different ports.
314 automatically use different ports.
241
315
242 Log files
316 Log files
243 ---------
317 ---------
244
318
245 All of the components of IPython have log files associated with them.
319 All of the components of IPython have log files associated with them.
246 These log files can be extremely useful in debugging problems with
320 These log files can be extremely useful in debugging problems with
247 IPython and can be found in the directory :file:`~/.ipython/log`. Sending
321 IPython and can be found in the directory :file:`~/.ipython/log`. Sending
248 the log files to us will often help us to debug any problems.
322 the log files to us will often help us to debug any problems.
249
323
250
324
251 .. [PBS] Portable Batch System. http://www.openpbs.org/
325 .. [PBS] Portable Batch System. http://www.openpbs.org/
326 .. [SSH] SSH-Agent http://en.wikipedia.org/wiki/Ssh-agent
General Comments 0
You need to be logged in to leave comments. Login now