##// END OF EJS Templates
Merging vvatsa's ipcluster-dev branch....
Brian Granger -
r1833:e4b173fe merge
parent child Browse files
Show More
1 NO CONTENT: modified file
1 NO CONTENT: modified file
@@ -1,521 +1,723 b''
1 1 #!/usr/bin/env python
2 2 # encoding: utf-8
3 3
4 4 """Start an IPython cluster = (controller + engines)."""
5 5
6 6 #-----------------------------------------------------------------------------
7 7 # Copyright (C) 2008 The IPython Development Team
8 8 #
9 9 # Distributed under the terms of the BSD License. The full license is in
10 10 # the file COPYING, distributed as part of this software.
11 11 #-----------------------------------------------------------------------------
12 12
13 13 #-----------------------------------------------------------------------------
14 14 # Imports
15 15 #-----------------------------------------------------------------------------
16 16
17 17 import os
18 18 import re
19 19 import sys
20 20 import signal
21 import tempfile
21 22 pjoin = os.path.join
22 23
23 24 from twisted.internet import reactor, defer
24 25 from twisted.internet.protocol import ProcessProtocol
25 26 from twisted.internet.error import ProcessDone, ProcessTerminated
26 27 from twisted.internet.utils import getProcessOutput
27 28 from twisted.python import failure, log
28 29
29 30 from IPython.external import argparse
30 31 from IPython.external import Itpl
31 32 from IPython.genutils import get_ipython_dir, num_cpus
32 33 from IPython.kernel.fcutil import have_crypto
33 34 from IPython.kernel.error import SecurityError
34 35 from IPython.kernel.fcutil import have_crypto
35 36 from IPython.kernel.twistedutil import gatherBoth
36 37 from IPython.kernel.util import printer
37 38
38 39
39 40 #-----------------------------------------------------------------------------
40 41 # General process handling code
41 42 #-----------------------------------------------------------------------------
42 43
def find_exe(cmd):
    """Return the full path of a Windows executable for ``cmd``.

    Searches the PATH for ``cmd + '.exe'`` first and falls back to
    ``cmd + '.bat'`` (some installers only create .bat stubs).

    Raises ImportError if pywin32 is not installed.
    """
    try:
        import win32api
    except ImportError:
        raise ImportError('you need to have pywin32 installed for this to work')
    else:
        try:
            # Prefer the .exe form of the command.
            (path, offset) = win32api.SearchPath(os.environ['PATH'],cmd + '.exe')
        except win32api.error:
            # No .exe found; fall back to the .bat stub.
            (path, offset) = win32api.SearchPath(os.environ['PATH'],cmd + '.bat')
        return path
54 55
class ProcessStateError(Exception):
    """Raised when a ProcessLauncher operation is invalid for its current state."""
    pass
57 58
class UnknownStatus(Exception):
    """Raised when a process exits with a status Twisted did not classify."""
    pass
60 61
class LauncherProcessProtocol(ProcessProtocol):
    """
    A ProcessProtocol to go with the ProcessLauncher.

    Translates Twisted process events into calls back into the owning
    ProcessLauncher (start/stop deferreds, log forwarding).
    """
    def __init__(self, process_launcher):
        # The ProcessLauncher that owns this protocol and receives events.
        self.process_launcher = process_launcher

    def connectionMade(self):
        # The child process is alive once the connection exists; report its pid.
        self.process_launcher.fire_start_deferred(self.transport.pid)

    def processEnded(self, status):
        # status is a Failure wrapping either ProcessDone or ProcessTerminated.
        value = status.value
        if isinstance(value, ProcessDone):
            # Clean exit: fire the stop deferreds with exit code 0.
            self.process_launcher.fire_stop_deferred(0)
        elif isinstance(value, ProcessTerminated):
            # Abnormal exit: pass along the details as a dict.
            self.process_launcher.fire_stop_deferred(
                {'exit_code':value.exitCode,
                 'signal':value.signal,
                 'status':value.status
                }
            )
        else:
            raise UnknownStatus("unknown exit status, this is probably a bug in Twisted")

    def outReceived(self, data):
        # Forward the child's stdout to the Twisted log.
        log.msg(data)

    def errReceived(self, data):
        # Forward the child's stderr to the Twisted log as errors.
        log.err(data)
90 91
class ProcessLauncher(object):
    """
    Start and stop an external process in an asynchronous manner.

    Currently this uses deferreds to notify other parties of process state
    changes. This is an awkward design and should be moved to using
    a formal NotificationCenter.
    """
    def __init__(self, cmd_and_args):
        """cmd_and_args is the full argv list; argv[0] is the command itself."""
        self.cmd = cmd_and_args[0]
        self.args = cmd_and_args
        self._reset()

    def _reset(self):
        """Reset all per-run state back to the 'before' (not started) state."""
        self.process_protocol = None
        self.pid = None
        self.start_deferred = None
        self.stop_deferreds = []
        self.state = 'before' # before, running, or after

    @property
    def running(self):
        """True while the child process is alive."""
        if self.state == 'running':
            return True
        else:
            return False

    def fire_start_deferred(self, pid):
        """Called by the protocol when the child has started; records the pid."""
        self.pid = pid
        self.state = 'running'
        log.msg('Process %r has started with pid=%i' % (self.args, pid))
        self.start_deferred.callback(pid)

    def start(self):
        """Spawn the process; returns a deferred that fires with its pid.

        Fails with ProcessStateError if the process was already started.
        """
        if self.state == 'before':
            self.process_protocol = LauncherProcessProtocol(self)
            self.start_deferred = defer.Deferred()
            self.process_transport = reactor.spawnProcess(
                self.process_protocol,
                self.cmd,
                self.args,
                env=os.environ
            )
            return self.start_deferred
        else:
            s = 'the process has already been started and has state: %r' % \
                self.state
            return defer.fail(ProcessStateError(s))

    def get_stop_deferred(self):
        """Return a deferred that fires when the process exits.

        Fails with ProcessStateError if the process has already finished.
        """
        if self.state == 'running' or self.state == 'before':
            d = defer.Deferred()
            self.stop_deferreds.append(d)
            return d
        else:
            s = 'this process is already complete'
            return defer.fail(ProcessStateError(s))

    def fire_stop_deferred(self, exit_code):
        """Called by the protocol on exit; fires every registered stop deferred."""
        log.msg('Process %r has stopped with %r' % (self.args, exit_code))
        self.state = 'after'
        for d in self.stop_deferreds:
            d.callback(exit_code)

    def signal(self, sig):
        """
        Send a signal to the process.

        The argument sig can be ('KILL','INT', etc.) or any signal number.
        """
        if self.state == 'running':
            self.process_transport.signalProcess(sig)

    # def __del__(self):
    #     self.signal('KILL')

    def interrupt_then_kill(self, delay=1.0):
        """Send INT now and escalate to KILL after `delay` seconds."""
        self.signal('INT')
        reactor.callLater(delay, self.signal, 'KILL')
170 171
171 172
172 173 #-----------------------------------------------------------------------------
173 174 # Code for launching controller and engines
174 175 #-----------------------------------------------------------------------------
175 176
176 177
class ControllerLauncher(ProcessLauncher):
    """ProcessLauncher that knows how to start an ipcontroller process."""

    def __init__(self, extra_args=None):
        """Build the ipcontroller command line.

        extra_args, if given, is a list of extra command-line arguments
        appended to the ipcontroller invocation.
        """
        if sys.platform != 'win32':
            cmd_line = ['ipcontroller']
        else:
            # The ipcontroller script doesn't always get installed in the
            # same way or location on Windows, so locate it via its module.
            # The -u option turns on unbuffered output, which is required
            # on Win32 to prevent weird conflicts and problems with Twisted.
            from IPython.kernel.scripts import ipcontroller
            script = ipcontroller.__file__.replace('.pyc', '.py')
            cmd_line = [find_exe('python'), '-u', script]
        self.extra_args = extra_args
        if extra_args is not None:
            cmd_line.extend(extra_args)

        ProcessLauncher.__init__(self, cmd_line)
195 196
196 197
class EngineLauncher(ProcessLauncher):
    """ProcessLauncher that knows how to start an ipengine process."""

    def __init__(self, extra_args=None):
        """Build the ipengine command line.

        extra_args, if given, is a list of extra command-line arguments
        appended to the ipengine invocation.
        """
        if sys.platform != 'win32':
            cmd_line = ['ipengine']
        else:
            # The ipengine script doesn't always get installed in the
            # same way or location on Windows, so locate it via its module.
            # The -u option turns on unbuffered output, which is required
            # on Win32 to prevent weird conflicts and problems with Twisted.
            from IPython.kernel.scripts import ipengine
            script = ipengine.__file__.replace('.pyc', '.py')
            cmd_line = [find_exe('python'), '-u', script]
        self.extra_args = extra_args
        if extra_args is not None:
            cmd_line.extend(extra_args)

        ProcessLauncher.__init__(self, cmd_line)
215 216
216 217
class LocalEngineSet(object):
    """Manage a set of ipengine processes running on the local machine."""

    def __init__(self, extra_args=None):
        # Extra command-line arguments passed to every engine.
        self.extra_args = extra_args
        self.launchers = []

    def start(self, n):
        """Start n local engines; returns a deferred firing with their pids."""
        starts = []
        for _ in range(n):
            launcher = EngineLauncher(extra_args=self.extra_args)
            starts.append(launcher.start())
            self.launchers.append(launcher)
        done = gatherBoth(starts, consumeErrors=True)
        done.addCallback(self._handle_start)
        return done

    def _handle_start(self, r):
        log.msg('Engines started with pids: %r' % r)
        return r

    def _handle_stop(self, r):
        log.msg('Engines received signal: %r' % r)
        return r

    def signal(self, sig):
        """Send sig to every engine; returns a deferred for their exits."""
        stops = []
        for launcher in self.launchers:
            stops.append(launcher.get_stop_deferred())
            launcher.signal(sig)
        done = gatherBoth(stops, consumeErrors=True)
        done.addCallback(self._handle_stop)
        return done

    def interrupt_then_kill(self, delay=1.0):
        """INT every engine, escalating to KILL after `delay` seconds."""
        stops = []
        for launcher in self.launchers:
            stops.append(launcher.get_stop_deferred())
            launcher.interrupt_then_kill(delay)
        done = gatherBoth(stops, consumeErrors=True)
        done.addCallback(self._handle_stop)
        return done
261 262
262 263
class BatchEngineSet(object):
    """Base class for engine sets started through a batch queue system.

    Subclasses supply the queue-specific commands and the regexp used to
    recognize a job id in the submit command's output.
    """

    # Subclasses must fill these in. See PBSEngineSet
    submit_command = ''
    delete_command = ''
    job_id_regexp = ''

    def __init__(self, template_file, **kwargs):
        """template_file is the batch script template; kwargs seed the
        template substitution context."""
        self.template_file = template_file
        self.context = {}
        self.context.update(kwargs)
        self.batch_file = self.template_file+'-run'

    def parse_job_id(self, output):
        """Extract and remember the job id from the submit output.

        Raises Exception if job_id_regexp does not match.
        """
        m = re.match(self.job_id_regexp, output)
        if m is None:
            raise Exception("job id couldn't be determined: %s" % output)
        job_id = m.group()
        self.job_id = job_id
        log.msg('Job started with job id: %r' % job_id)
        return job_id

    def write_batch_script(self, n):
        """Instantiate the batch template for n engines and write it out."""
        self.context['n'] = n
        # Close the file handles explicitly instead of relying on the GC.
        f = open(self.template_file, 'r')
        try:
            template = f.read()
        finally:
            f.close()
        log.msg('Using template for batch script: %s' % self.template_file)
        script_as_string = Itpl.itplns(template, self.context)
        log.msg('Writing instantiated batch script: %s' % self.batch_file)
        f = open(self.batch_file,'w')
        try:
            f.write(script_as_string)
        finally:
            f.close()

    def handle_error(self, f):
        # Log the traceback, then re-raise so callers see the failure.
        f.printTraceback()
        f.raiseException()

    def start(self, n):
        """Write the batch script and submit it; the returned deferred
        fires with the parsed job id."""
        self.write_batch_script(n)
        d = getProcessOutput(self.submit_command,
                             [self.batch_file],env=os.environ)
        d.addCallback(self.parse_job_id)
        d.addErrback(self.handle_error)
        return d

    def kill(self):
        """Delete the submitted job through the queue's delete command."""
        d = getProcessOutput(self.delete_command,
                             [self.job_id],env=os.environ)
        return d
312 313
class PBSEngineSet(BatchEngineSet):
    """BatchEngineSet that submits engine jobs through the PBS queue system."""

    submit_command = 'qsub'
    delete_command = 'qdel'
    # Raw string so the backslash is a regex digit class, not an escape.
    job_id_regexp = r'\d+'

    def __init__(self, template_file, **kwargs):
        BatchEngineSet.__init__(self, template_file, **kwargs)
321 322
322 323
# Shell helper copied to each remote host: runs its arguments detached in
# the background and prints the new process id.
sshx_template="""#!/bin/sh
"$@" &> /dev/null &
echo $!
"""

# Shell helper that kills every ipengine process owned by the current user.
engine_killer_template="""#!/bin/sh
ps -fu `whoami` | grep '[i]pengine' | awk '{print $2}' | xargs kill -TERM
"""
332
class SSHEngineSet(object):
    # Templates are class attributes so subclasses can override them.
    sshx_template=sshx_template
    engine_killer_template=engine_killer_template

    def __init__(self, engine_hosts, sshx=None, ipengine="ipengine"):
        """Start a controller on localhost and engines using ssh.

        The engine_hosts argument is a dict with hostnames as keys and
        the number of engine (int) as values. sshx is the name of a local
        file that will be used to run remote commands. This file is used
        to setup the environment properly.
        """

        self.temp_dir = tempfile.gettempdir()
        if sshx is not None:
            self.sshx = sshx
        else:
            # Write the sshx.sh file locally from our template.
            self.sshx = os.path.join(
                self.temp_dir,
                '%s-main-sshx.sh' % os.environ['USER']
            )
            # Close file handles explicitly instead of relying on the GC.
            f = open(self.sshx, 'w')
            try:
                f.writelines(self.sshx_template)
            finally:
                f.close()
        self.engine_command = ipengine
        self.engine_hosts = engine_hosts
        # Write the engine killer script file locally from our template.
        self.engine_killer = os.path.join(
            self.temp_dir,
            '%s-local-engine_killer.sh' % os.environ['USER']
        )
        f = open(self.engine_killer, 'w')
        try:
            f.writelines(self.engine_killer_template)
        finally:
            f.close()

    def start(self, send_furl=False):
        """Start the engines on every host; returns a gathered deferred."""
        dlist = []
        for host in self.engine_hosts.keys():
            count = self.engine_hosts[host]
            d = self._start(host, count, send_furl)
            dlist.append(d)
        return gatherBoth(dlist, consumeErrors=True)

    def _start(self, hostname, count=1, send_furl=False):
        # Optionally ship the furl file, then copy the sshx helper, then
        # launch the engines through it.
        if send_furl:
            d = self._scp_furl(hostname)
        else:
            d = defer.succeed(None)
        d.addCallback(lambda r: self._scp_sshx(hostname))
        d.addCallback(lambda r: self._ssh_engine(hostname, count))
        return d

    def _scp_furl(self, hostname):
        """Copy the engine furl file to hostname's ~/.ipython/security."""
        scp_cmd = "scp ~/.ipython/security/ipcontroller-engine.furl %s:.ipython/security/" % (hostname)
        cmd_list = scp_cmd.split()
        # scp won't expand '~' when invoked without a shell, so do it here.
        cmd_list[1] = os.path.expanduser(cmd_list[1])
        log.msg('Copying furl file: %s' % scp_cmd)
        d = getProcessOutput(cmd_list[0], cmd_list[1:], env=os.environ)
        return d

    def _scp_sshx(self, hostname):
        """Copy the sshx launcher helper to the remote host's temp dir."""
        scp_cmd = "scp %s %s:%s/%s-sshx.sh" % (
            self.sshx, hostname,
            self.temp_dir, os.environ['USER']
        )
        log.msg("Copying sshx: %s" % scp_cmd)
        sshx_scp = scp_cmd.split()
        d = getProcessOutput(sshx_scp[0], sshx_scp[1:], env=os.environ)
        return d

    def _ssh_engine(self, hostname, count):
        """Start `count` engines on hostname via the copied sshx helper."""
        exec_engine = "ssh %s sh %s/%s-sshx.sh %s" % (
            hostname, self.temp_dir,
            os.environ['USER'], self.engine_command
        )
        cmds = exec_engine.split()
        dlist = []
        log.msg("about to start engines...")
        for i in range(count):
            log.msg('Starting engines: %s' % exec_engine)
            d = getProcessOutput(cmds[0], cmds[1:], env=os.environ)
            dlist.append(d)
        return gatherBoth(dlist, consumeErrors=True)

    def kill(self):
        """Kill the remote engines on every host."""
        dlist = []
        for host in self.engine_hosts.keys():
            d = self._killall(host)
            dlist.append(d)
        return gatherBoth(dlist, consumeErrors=True)

    def _killall(self, hostname):
        # Copy the killer script over, then run it.
        d = self._scp_engine_killer(hostname)
        d.addCallback(lambda r: self._ssh_kill(hostname))
        # d.addErrback(self._exec_err)
        return d

    def _scp_engine_killer(self, hostname):
        """Copy the engine killer script to the remote host's temp dir."""
        scp_cmd = "scp %s %s:%s/%s-engine_killer.sh" % (
            self.engine_killer,
            hostname,
            self.temp_dir,
            os.environ['USER']
        )
        cmds = scp_cmd.split()
        log.msg('Copying engine_killer: %s' % scp_cmd)
        d = getProcessOutput(cmds[0], cmds[1:], env=os.environ)
        return d

    def _ssh_kill(self, hostname):
        """Run the copied engine killer script on the remote host."""
        kill_cmd = "ssh %s sh %s/%s-engine_killer.sh" % (
            hostname,
            self.temp_dir,
            os.environ['USER']
        )
        log.msg('Killing engine: %s' % kill_cmd)
        kill_cmd = kill_cmd.split()
        d = getProcessOutput(kill_cmd[0], kill_cmd[1:], env=os.environ)
        return d

    def _exec_err(self, r):
        log.msg(r)
457
323 458 #-----------------------------------------------------------------------------
324 459 # Main functions for the different types of clusters
325 460 #-----------------------------------------------------------------------------
326 461
327 462 # TODO:
# The logic in this code should be moved into classes like LocalCluster,
# MpirunCluster, PBSCluster, etc. This would remove a lot of the duplication.
330 465 # The main functions should then just parse the command line arguments, create
331 466 # the appropriate class and call a 'start' method.
332 467
def check_security(args, cont_args):
    """Validate the security flags and translate them into controller args.

    Appends '-x' and/or '-y' to cont_args as requested.  Returns False
    (after stopping the reactor) when secure mode is wanted but
    OpenSSL/pyOpenSSL is unavailable; True otherwise.
    """
    secure_mode_wanted = not (args.x and args.y)
    if secure_mode_wanted and not have_crypto:
        log.err("""
OpenSSL/pyOpenSSL is not available, so we can't run in secure mode.
Try running ipcluster with the -xy flags: ipcluster local -xy -n 4""")
        reactor.stop()
        return False
    for enabled, flag in ((args.x, '-x'), (args.y, '-y')):
        if enabled:
            cont_args.append(flag)
    return True
345 480
481
def main_local(args):
    """Start a controller and engines as plain processes on localhost."""
    cont_args = []
    cont_args.append('--logfile=%s' % pjoin(args.logdir,'ipcontroller'))

    # Check security settings before proceeding
    if not check_security(args, cont_args):
        return

    cl = ControllerLauncher(extra_args=cont_args)
    dstart = cl.start()
    def start_engines(cont_pid):
        # Tag engine logs with the controller pid so separate runs don't
        # collide.
        engine_args = []
        engine_args.append('--logfile=%s' % \
            pjoin(args.logdir,'ipengine%s-' % cont_pid))
        eset = LocalEngineSet(extra_args=engine_args)
        def shutdown(signum, frame):
            log.msg('Stopping local cluster')
            # We are still playing with the times here, but these seem
            # to be reliable in allowing everything to exit cleanly.
            eset.interrupt_then_kill(0.5)
            cl.interrupt_then_kill(0.5)
            reactor.callLater(1.0, reactor.stop)
        signal.signal(signal.SIGINT,shutdown)
        d = eset.start(args.n)
        return d
    def delay_start(cont_pid):
        # This is needed because the controller doesn't start listening
        # right when it starts and the controller needs to write
        # furl files for the engine to pick up
        reactor.callLater(1.0, start_engines, cont_pid)
    dstart.addCallback(delay_start)
    dstart.addErrback(lambda f: f.raiseException())
378 514
515
def main_mpirun(args):
    """Start a controller locally and the engines in one mpirun invocation."""
    cont_args = []
    cont_args.append('--logfile=%s' % pjoin(args.logdir,'ipcontroller'))

    # Check security settings before proceeding
    if not check_security(args, cont_args):
        return

    cl = ControllerLauncher(extra_args=cont_args)
    dstart = cl.start()
    def start_engines(cont_pid):
        # All engines are launched by a single mpirun process.
        raw_args = ['mpirun']
        raw_args.extend(['-n',str(args.n)])
        raw_args.append('ipengine')
        raw_args.append('-l')
        raw_args.append(pjoin(args.logdir,'ipengine%s-' % cont_pid))
        if args.mpi:
            raw_args.append('--mpi=%s' % args.mpi)
        eset = ProcessLauncher(raw_args)
        def shutdown(signum, frame):
            log.msg('Stopping local cluster')
            # We are still playing with the times here, but these seem
            # to be reliable in allowing everything to exit cleanly.
            eset.interrupt_then_kill(1.0)
            cl.interrupt_then_kill(1.0)
            reactor.callLater(2.0, reactor.stop)
        signal.signal(signal.SIGINT,shutdown)
        d = eset.start()
        return d
    def delay_start(cont_pid):
        # This is needed because the controller doesn't start listening
        # right when it starts and the controller needs to write
        # furl files for the engine to pick up
        reactor.callLater(1.0, start_engines, cont_pid)
    dstart.addCallback(delay_start)
    dstart.addErrback(lambda f: f.raiseException())
415 552
553
def main_pbs(args):
    """Start a controller locally and engines through a PBS batch queue."""
    cont_args = []
    cont_args.append('--logfile=%s' % pjoin(args.logdir,'ipcontroller'))

    # Check security settings before proceeding
    if not check_security(args, cont_args):
        return

    cl = ControllerLauncher(extra_args=cont_args)
    dstart = cl.start()
    def start_engines(r):
        pbs_set = PBSEngineSet(args.pbsscript)
        def shutdown(signum, frame):
            log.msg('Stopping pbs cluster')
            # Delete the batch job first, then tear down the controller.
            d = pbs_set.kill()
            d.addBoth(lambda _: cl.interrupt_then_kill(1.0))
            d.addBoth(lambda _: reactor.callLater(2.0, reactor.stop))
        signal.signal(signal.SIGINT,shutdown)
        d = pbs_set.start(args.n)
        return d
    dstart.addCallback(start_engines)
    dstart.addErrback(lambda f: f.raiseException())
438 576
439 577
def main_ssh(args):
    """Start a controller on localhost and engines using ssh.

    Your clusterfile should look like::

        send_furl = False # True, if you want
        engines = {
            'engine_host1' : engine_count,
            'engine_host2' : engine_count2
        }
    """
    # Execute the user's clusterfile to pick up the engines dict and options.
    clusterfile = {}
    execfile(args.clusterfile, clusterfile)
    if not clusterfile.has_key('send_furl'):
        clusterfile['send_furl'] = False

    cont_args = []
    cont_args.append('--logfile=%s' % pjoin(args.logdir,'ipcontroller'))

    # Check security settings before proceeding
    if not check_security(args, cont_args):
        return

    cl = ControllerLauncher(extra_args=cont_args)
    dstart = cl.start()
    def start_engines(cont_pid):
        ssh_set = SSHEngineSet(clusterfile['engines'], sshx=args.sshx)
        def shutdown(signum, frame):
            d = ssh_set.kill()
            # d.addErrback(log.err)
            cl.interrupt_then_kill(1.0)
            reactor.callLater(2.0, reactor.stop)
        signal.signal(signal.SIGINT,shutdown)
        d = ssh_set.start(clusterfile['send_furl'])
        return d

    def delay_start(cont_pid):
        # Give the controller a moment to write its furl files before the
        # engines try to pick them up.
        reactor.callLater(1.0, start_engines, cont_pid)

    dstart.addCallback(delay_start)
    dstart.addErrback(lambda f: f.raiseException())
619
620
def get_args():
    """Build the command-line parser and parse sys.argv.

    A base parser holds the flags common to every cluster type; each
    cluster type (local, mpirun, pbs, ssh) is a subcommand that inherits
    them and sets its main_* function as the 'func' default.
    """
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument(
        '-x',
        action='store_true',
        dest='x',
        help='turn off client security'
    )
    base_parser.add_argument(
        '-y',
        action='store_true',
        dest='y',
        help='turn off engine security'
    )
    base_parser.add_argument(
        "--logdir",
        type=str,
        dest="logdir",
        help="directory to put log files (default=$IPYTHONDIR/log)",
        default=pjoin(get_ipython_dir(),'log')
    )
    base_parser.add_argument(
        "-n",
        "--num",
        type=int,
        dest="n",
        default=2,
        help="the number of engines to start"
    )

    parser = argparse.ArgumentParser(
        description='IPython cluster startup. This starts a controller and\
        engines using various approaches. THIS IS A TECHNOLOGY PREVIEW AND\
        THE API WILL CHANGE SIGNIFICANTLY BEFORE THE FINAL RELEASE.'
    )
    subparsers = parser.add_subparsers(
        help='available cluster types. For help, do "ipcluster TYPE --help"')

    parser_local = subparsers.add_parser(
        'local',
        help='run a local cluster',
        parents=[base_parser]
    )
    parser_local.set_defaults(func=main_local)

    parser_mpirun = subparsers.add_parser(
        'mpirun',
        help='run a cluster using mpirun',
        parents=[base_parser]
    )
    parser_mpirun.add_argument(
        "--mpi",
        type=str,
        dest="mpi", # Don't put a default here to allow no MPI support
        help="how to call MPI_Init (default=mpi4py)"
    )
    parser_mpirun.set_defaults(func=main_mpirun)

    parser_pbs = subparsers.add_parser(
        'pbs',
        help='run a pbs cluster',
        parents=[base_parser]
    )
    parser_pbs.add_argument(
        '--pbs-script',
        type=str,
        dest='pbsscript',
        help='PBS script template',
        default='pbs.template'
    )
    parser_pbs.set_defaults(func=main_pbs)

    parser_ssh = subparsers.add_parser(
        'ssh',
        help='run a cluster using ssh, should have ssh-keys setup',
        parents=[base_parser]
    )
    parser_ssh.add_argument(
        '--clusterfile',
        type=str,
        dest='clusterfile',
        help='python file describing the cluster',
        default='clusterfile.py',
    )
    parser_ssh.add_argument(
        '--sshx',
        type=str,
        dest='sshx',
        help='sshx launcher helper'
    )
    parser_ssh.set_defaults(func=main_ssh)

    args = parser.parse_args()
    return args
513 715
def main():
    """Parse the command line, schedule the chosen cluster starter, and
    run the Twisted reactor."""
    cli_args = get_args()
    reactor.callWhenRunning(cli_args.func, cli_args)
    log.startLogging(sys.stdout)
    reactor.run()
519 721
520 722 if __name__ == '__main__':
521 723 main()
@@ -1,393 +1,398 b''
1 1 .. _changes:
2 2
3 3 ==========
4 4 What's new
5 5 ==========
6 6
7 7 .. contents::
8 8 ..
9 9 1 Release 0.9.1
10 10 2 Release 0.9
11 11 2.1 New features
12 12 2.2 Bug fixes
13 13 2.3 Backwards incompatible changes
14 14 2.4 Changes merged in from IPython1
15 15 2.4.1 New features
16 16 2.4.2 Bug fixes
17 17 2.4.3 Backwards incompatible changes
18 18 3 Release 0.8.4
19 19 4 Release 0.8.3
20 20 5 Release 0.8.2
21 21 6 Older releases
22 22 ..
23 23
24 24 Release dev
25 25 ===========
26 26
27 27 New features
28 28 ------------
29 29
30 * The new ipcluster now has a fully working ssh mode that should work on
31 Linux, Unix and OS X. Thanks to Vishal Vatsa for implementing this!
32
30 33 * The wonderful TextMate editor can now be used with %edit on OS X. Thanks
31 34 to Matt Foster for this patch.
32 35
33 36 * Fully refactored :command:`ipcluster` command line program for starting
34 37 IPython clusters. This new version is a complete rewrite and 1) is fully
35 38 cross platform (we now use Twisted's process management), 2) has much
36 39 improved performance, 3) uses subcommands for different types of clusters,
37 40 4) uses argparse for parsing command line options, 5) has better support
38 41 for starting clusters using :command:`mpirun`, 6) has experimental support
39 42 for starting engines using PBS. However, this new version of ipcluster
40 43 should be considered a technology preview. We plan on changing the API
41 44 in significant ways before it is final.
42 45
43 46 * The :mod:`argparse` module has been added to :mod:`IPython.external`.
44 47
* A full description of the security model has been added to the docs.
46 49
47 50 * cd completer: show bookmarks if no other completions are available.
48 51
49 52 * sh profile: easy way to give 'title' to prompt: assign to variable
50 53 '_prompt_title'. It looks like this::
51 54
52 55 [~]|1> _prompt_title = 'sudo!'
53 56 sudo![~]|2>
54 57
55 58 * %edit: If you do '%edit pasted_block', pasted_block
56 59 variable gets updated with new data (so repeated
57 60 editing makes sense)
58 61
59 62 Bug fixes
60 63 ---------
61 64
65 * Numerous bugs on Windows with the new ipcluster have been fixed.
66
62 67 * The ipengine and ipcontroller scripts now handle missing furl files
63 68 more gracefully by giving better error messages.
64 69
65 70 * %rehashx: Aliases no longer contain dots. python3.0 binary
66 71 will create alias python30. Fixes:
67 72 #259716 "commands with dots in them don't work"
68 73
69 74 * %cpaste: %cpaste -r repeats the last pasted block.
70 75 The block is assigned to pasted_block even if code
71 76 raises exception.
72 77
73 78 Backwards incompatible changes
74 79 ------------------------------
75 80
76 81 * The controller now has a ``-r`` flag that needs to be used if you want to
77 82 reuse existing furl files. Otherwise they are deleted (the default).
78 83
79 84 * Remove ipy_leo.py. "easy_install ipython-extension" to get it.
80 85 (done to decouple it from ipython release cycle)
81 86
82 87
83 88
84 89 Release 0.9.1
85 90 =============
86 91
87 92 This release was quickly made to restore compatibility with Python 2.4, which
88 93 version 0.9 accidentally broke. No new features were introduced, other than
89 94 some additional testing support for internal use.
90 95
91 96
92 97 Release 0.9
93 98 ===========
94 99
95 100 New features
96 101 ------------
97 102
98 103 * All furl files and security certificates are now put in a read-only
directory named ~/.ipython/security.
100 105
101 106 * A single function :func:`get_ipython_dir`, in :mod:`IPython.genutils` that
102 107 determines the user's IPython directory in a robust manner.
103 108
104 109 * Laurent's WX application has been given a top-level script called
105 110 ipython-wx, and it has received numerous fixes. We expect this code to be
106 111 architecturally better integrated with Gael's WX 'ipython widget' over the
107 112 next few releases.
108 113
109 114 * The Editor synchronization work by Vivian De Smedt has been merged in. This
110 115 code adds a number of new editor hooks to synchronize with editors under
111 116 Windows.
112 117
113 118 * A new, still experimental but highly functional, WX shell by Gael Varoquaux.
114 119 This work was sponsored by Enthought, and while it's still very new, it is
based on a more cleanly organized architecture of the various IPython
116 121 components. We will continue to develop this over the next few releases as a
117 122 model for GUI components that use IPython.
118 123
119 124 * Another GUI frontend, Cocoa based (Cocoa is the OSX native GUI framework),
120 125 authored by Barry Wark. Currently the WX and the Cocoa ones have slightly
121 126 different internal organizations, but the whole team is working on finding
122 127 what the right abstraction points are for a unified codebase.
123 128
124 129 * As part of the frontend work, Barry Wark also implemented an experimental
125 130 event notification system that various ipython components can use. In the
126 131 next release the implications and use patterns of this system regarding the
127 132 various GUI options will be worked out.
128 133
129 134 * IPython finally has a full test system, that can test docstrings with
130 135 IPython-specific functionality. There are still a few pieces missing for it
131 136 to be widely accessible to all users (so they can run the test suite at any
132 137 time and report problems), but it now works for the developers. We are
133 138 working hard on continuing to improve it, as this was probably IPython's
134 139 major Achilles heel (the lack of proper test coverage made it effectively
135 140 impossible to do large-scale refactoring). The full test suite can now
136 141 be run using the :command:`iptest` command line program.
137 142
138 143 * The notion of a task has been completely reworked. An `ITask` interface has
139 144 been created. This interface defines the methods that tasks need to
140 145 implement. These methods are now responsible for things like submitting
141 146 tasks and processing results. There are two basic task types:
142 147 :class:`IPython.kernel.task.StringTask` (this is the old `Task` object, but
143 148 renamed) and the new :class:`IPython.kernel.task.MapTask`, which is based on
144 149 a function.
145 150
146 151 * A new interface, :class:`IPython.kernel.mapper.IMapper` has been defined to
147 152 standardize the idea of a `map` method. This interface has a single `map`
148 153 method that has the same syntax as the built-in `map`. We have also defined
149 154 a `mapper` factory interface that creates objects that implement
150 155 :class:`IPython.kernel.mapper.IMapper` for different controllers. Both the
multiengine and task controller now have mapping capabilities.
152 157
* The parallel function capabilities have been reworked. The major changes are
  that i) there is now an `@parallel` magic that creates parallel functions,
  ii) the syntax for multiple variables follows that of `map`, iii) both the
  multiengine and task controller now have a parallel function implementation.
157 162
158 163 * All of the parallel computing capabilities from `ipython1-dev` have been
159 164 merged into IPython proper. This resulted in the following new subpackages:
160 165 :mod:`IPython.kernel`, :mod:`IPython.kernel.core`, :mod:`IPython.config`,
161 166 :mod:`IPython.tools` and :mod:`IPython.testing`.
162 167
163 168 * As part of merging in the `ipython1-dev` stuff, the `setup.py` script and
164 169 friends have been completely refactored. Now we are checking for
165 170 dependencies using the approach that matplotlib uses.
166 171
167 172 * The documentation has been completely reorganized to accept the
168 173 documentation from `ipython1-dev`.
169 174
170 175 * We have switched to using Foolscap for all of our network protocols in
171 176 :mod:`IPython.kernel`. This gives us secure connections that are both
172 177 encrypted and authenticated.
173 178
174 179 * We have a brand new `COPYING.txt` files that describes the IPython license
175 180 and copyright. The biggest change is that we are putting "The IPython
176 181 Development Team" as the copyright holder. We give more details about
177 182 exactly what this means in this file. All developer should read this and use
178 183 the new banner in all IPython source code files.
179 184
180 185 * sh profile: ./foo runs foo as system command, no need to do !./foo anymore
181 186
182 187 * String lists now support ``sort(field, nums = True)`` method (to easily sort
183 188 system command output). Try it with ``a = !ls -l ; a.sort(1, nums=1)``.
184 189
185 190 * '%cpaste foo' now assigns the pasted block as string list, instead of string
186 191
187 192 * The ipcluster script now runs by default with no security. This is done
188 193 because the main usage of the script is for starting things on localhost.
189 194 Eventually when ipcluster is able to start things on other hosts, we will put
190 195 security back.
191 196
192 197 * 'cd --foo' searches directory history for string foo, and jumps to that dir.
193 198 Last part of dir name is checked first. If no matches for that are found,
194 199 look at the whole path.
195 200
196 201
197 202 Bug fixes
198 203 ---------
199 204
200 205 * The Windows installer has been fixed. Now all IPython scripts have ``.bat``
201 206 versions created. Also, the Start Menu shortcuts have been updated.
202 207
203 208 * The colors escapes in the multiengine client are now turned off on win32 as
204 209 they don't print correctly.
205 210
206 211 * The :mod:`IPython.kernel.scripts.ipengine` script was exec'ing
207 212 mpi_import_statement incorrectly, which was leading the engine to crash when
208 213 mpi was enabled.
209 214
210 215 * A few subpackages had missing ``__init__.py`` files.
211 216
212 217 * The documentation is only created if Sphinx is found. Previously, the
213 218 ``setup.py`` script would fail if it was missing.
214 219
215 220 * Greedy ``cd`` completion has been disabled again (it was enabled in 0.8.4) as
216 221 it caused problems on certain platforms.
217 222
218 223
219 224 Backwards incompatible changes
220 225 ------------------------------
221 226
222 227 * The ``clusterfile`` options of the :command:`ipcluster` command has been
223 228 removed as it was not working and it will be replaced soon by something much
224 229 more robust.
225 230
226 231 * The :mod:`IPython.kernel` configuration now properly find the user's
227 232 IPython directory.
228 233
229 234 * In ipapi, the :func:`make_user_ns` function has been replaced with
230 235 :func:`make_user_namespaces`, to support dict subclasses in namespace
231 236 creation.
232 237
233 238 * :class:`IPython.kernel.client.Task` has been renamed
234 239 :class:`IPython.kernel.client.StringTask` to make way for new task types.
235 240
236 241 * The keyword argument `style` has been renamed `dist` in `scatter`, `gather`
237 242 and `map`.
238 243
239 244 * Renamed the values that the rename `dist` keyword argument can have from
240 245 `'basic'` to `'b'`.
241 246
242 247 * IPython has a larger set of dependencies if you want all of its capabilities.
243 248 See the `setup.py` script for details.
244 249
245 250 * The constructors for :class:`IPython.kernel.client.MultiEngineClient` and
246 251 :class:`IPython.kernel.client.TaskClient` no longer take the (ip,port) tuple.
247 252 Instead they take the filename of a file that contains the FURL for that
248 253 client. If the FURL file is in your IPYTHONDIR, it will be found automatically
249 254 and the constructor can be left empty.
250 255
251 256 * The asynchronous clients in :mod:`IPython.kernel.asyncclient` are now created
252 257 using the factory functions :func:`get_multiengine_client` and
253 258 :func:`get_task_client`. These return a `Deferred` to the actual client.
254 259
255 260 * The command line options to `ipcontroller` and `ipengine` have changed to
256 261 reflect the new Foolscap network protocol and the FURL files. Please see the
257 262 help for these scripts for details.
258 263
259 264 * The configuration files for the kernel have changed because of the Foolscap
260 265 stuff. If you were using custom config files before, you should delete them
261 266 and regenerate new ones.
262 267
263 268 Changes merged in from IPython1
264 269 -------------------------------
265 270
266 271 New features
267 272 ............
268 273
269 274 * Much improved ``setup.py`` and ``setupegg.py`` scripts. Because Twisted and
270 275 zope.interface are now easy installable, we can declare them as dependencies
271 276 in our setupegg.py script.
272 277
273 278 * IPython is now compatible with Twisted 2.5.0 and 8.x.
274 279
275 280 * Added a new example of how to use :mod:`ipython1.kernel.asynclient`.
276 281
277 282 * Initial draft of a process daemon in :mod:`ipython1.daemon`. This has not
278 283 been merged into IPython and is still in `ipython1-dev`.
279 284
280 285 * The ``TaskController`` now has methods for getting the queue status.
281 286
282 287 * The ``TaskResult`` objects now have information about how long the task
283 288 took to run.
284 289
285 290 * We are attaching additional attributes to exceptions ``(_ipython_*)`` that
286 291 we use to carry additional info around.
287 292
288 293 * New top-level module :mod:`asyncclient` that has asynchronous versions (that
289 294 return deferreds) of the client classes. This is designed to users who want
290 295 to run their own Twisted reactor.
291 296
292 297 * All the clients in :mod:`client` are now based on Twisted. This is done by
293 298 running the Twisted reactor in a separate thread and using the
294 299 :func:`blockingCallFromThread` function that is in recent versions of Twisted.
295 300
296 301 * Functions can now be pushed/pulled to/from engines using
297 302 :meth:`MultiEngineClient.push_function` and
298 303 :meth:`MultiEngineClient.pull_function`.
299 304
300 305 * Gather/scatter are now implemented in the client to reduce the work load
301 306 of the controller and improve performance.
302 307
303 308 * Complete rewrite of the IPython documentation. All of the documentation
304 309 from the IPython website has been moved into docs/source as restructured
305 310 text documents. PDF and HTML documentation are being generated using
306 311 Sphinx.
307 312
308 313 * New developer oriented documentation: development guidelines and roadmap.
309 314
310 315 * Traditional ``ChangeLog`` has been changed to a more useful ``changes.txt``
311 316 file that is organized by release and is meant to provide something more
312 317 relevant for users.
313 318
314 319 Bug fixes
315 320 .........
316 321
317 322 * Created a proper ``MANIFEST.in`` file to create source distributions.
318 323
319 324 * Fixed a bug in the ``MultiEngine`` interface. Previously, multi-engine
320 325 actions were being collected with a :class:`DeferredList` with
321 326 ``fireononeerrback=1``. This meant that methods were returning
322 327 before all engines had given their results. This was causing extremely odd
323 328 bugs in certain cases. To fix this problem, we have 1) set
324 329 ``fireononeerrback=0`` to make sure all results (or exceptions) are in
325 330 before returning and 2) introduced a :exc:`CompositeError` exception
326 331 that wraps all of the engine exceptions. This is a huge change as it means
327 332 that users will have to catch :exc:`CompositeError` rather than the actual
328 333 exception.
329 334
330 335 Backwards incompatible changes
331 336 ..............................
332 337
333 338 * All names have been renamed to conform to the lowercase_with_underscore
334 339 convention. This will require users to change references to all names like
335 340 ``queueStatus`` to ``queue_status``.
336 341
337 342 * Previously, methods like :meth:`MultiEngineClient.push` and
338 343 :meth:`MultiEngineClient.pull` used ``*args`` and ``**kwargs``. This was
339 344 becoming a problem as we weren't able to introduce new keyword arguments into
340 345 the API. Now these methods simply take a dict or sequence. This has also
341 346 allowed us to get rid of the ``*All`` methods like :meth:`pushAll` and
342 347 :meth:`pullAll`. These things are now handled with the ``targets`` keyword
343 348 argument that defaults to ``'all'``.
344 349
345 350 * The :attr:`MultiEngineClient.magicTargets` has been renamed to
346 351 :attr:`MultiEngineClient.targets`.
347 352
348 353 * All methods in the MultiEngine interface now accept the optional keyword
349 354 argument ``block``.
350 355
351 356 * Renamed :class:`RemoteController` to :class:`MultiEngineClient` and
352 357 :class:`TaskController` to :class:`TaskClient`.
353 358
354 359 * Renamed the top-level module from :mod:`api` to :mod:`client`.
355 360
356 361 * Most methods in the multiengine interface now raise a :exc:`CompositeError`
357 362 exception that wraps the user's exceptions, rather than just raising the raw
358 363 user's exception.
359 364
360 365 * Changed the ``setupNS`` and ``resultNames`` in the ``Task`` class to ``push``
361 366 and ``pull``.
362 367
363 368
364 369 Release 0.8.4
365 370 =============
366 371
367 372 This was a quick release to fix an unfortunate bug that slipped into the 0.8.3
368 373 release. The ``--twisted`` option was disabled, as it turned out to be broken
369 374 across several platforms.
370 375
371 376
372 377 Release 0.8.3
373 378 =============
374 379
375 380 * pydb is now disabled by default (due to %run -d problems). You can enable
376 381 it by passing -pydb command line argument to IPython. Note that setting
377 382 it in config file won't work.
378 383
379 384
380 385 Release 0.8.2
381 386 =============
382 387
383 388 * %pushd/%popd behave differently; now "pushd /foo" pushes CURRENT directory
384 389 and jumps to /foo. The current behaviour is closer to the documented
385 390 behaviour, and should not trip anyone.
386 391
387 392
388 393 Older releases
389 394 ==============
390 395
391 396 Changes in earlier releases of IPython are described in the older file
392 397 ``ChangeLog``. Please refer to this document for details.
393 398
@@ -1,251 +1,324 b''
1 1 .. _parallel_process:
2 2
3 3 ===========================================
4 4 Starting the IPython controller and engines
5 5 ===========================================
6 6
7 7 To use IPython for parallel computing, you need to start one instance of
8 8 the controller and one or more instances of the engine. The controller
9 9 and each engine can run on different machines or on the same machine.
10 10 Because of this, there are many different possibilities.
11 11
12 12 Broadly speaking, there are two ways of going about starting a controller and engines:
13 13
14 14 * In an automated manner using the :command:`ipcluster` command.
15 15 * In a more manual way using the :command:`ipcontroller` and
16 16 :command:`ipengine` commands.
17 17
18 18 This document describes both of these methods. We recommend that new users start with the :command:`ipcluster` command as it simplifies many common usage cases.
19 19
20 20 General considerations
21 21 ======================
22 22
23 23 Before delving into the details about how you can start a controller and engines using the various methods, we outline some of the general issues that come up when starting the controller and engines. These things come up no matter which method you use to start your IPython cluster.
24 24
25 25 Let's say that you want to start the controller on ``host0`` and engines on hosts ``host1``-``hostn``. The following steps are then required:
26 26
27 27 1. Start the controller on ``host0`` by running :command:`ipcontroller` on
28 28 ``host0``.
29 29 2. Move the FURL file (:file:`ipcontroller-engine.furl`) created by the
30 30 controller from ``host0`` to hosts ``host1``-``hostn``.
31 31 3. Start the engines on hosts ``host1``-``hostn`` by running
32 32 :command:`ipengine`. This command has to be told where the FURL file
33 33 (:file:`ipcontroller-engine.furl`) is located.
34 34
35 35 At this point, the controller and engines will be connected. By default, the
36 36 FURL files created by the controller are put into the
37 37 :file:`~/.ipython/security` directory. If the engines share a filesystem with
38 38 the controller, step 2 can be skipped as the engines will automatically look
39 39 at that location.
40 40
41 41 The final step required to actually use the running controller from a
42 42 client is to move the FURL files :file:`ipcontroller-mec.furl` and
43 43 :file:`ipcontroller-tc.furl` from ``host0`` to the host where the clients will
44 44 be run. If these file are put into the :file:`~/.ipython/security` directory of the client's host, they will be found automatically. Otherwise, the full path to them has to be passed to the client's constructor.
45 45
46 46 Using :command:`ipcluster`
47 47 ==========================
48 48
49 49 The :command:`ipcluster` command provides a simple way of starting a controller and engines in the following situations:
50 50
51 51 1. When the controller and engines are all run on localhost. This is useful
52 52 for testing or running on a multicore computer.
53 53 2. When engines are started using the :command:`mpirun` command that comes
54 54 with most MPI [MPI]_ implementations
55 55 3. When engines are started using the PBS [PBS]_ batch system.
56 4. When the controller is started on localhost and the engines are started on
57 remote nodes using :command:`ssh`.
56 58
57 59 .. note::
58 60
59 61 It is also possible for advanced users to add support to
60 62 :command:`ipcluster` for starting controllers and engines using other
61 63 methods (like Sun's Grid Engine for example).
62 64
63 65 .. note::
64 66
65 67 Currently :command:`ipcluster` requires that the
66 68 :file:`~/.ipython/security` directory live on a shared filesystem that is
67 69 seen by both the controller and engines. If you don't have a shared file
68 70 system you will need to use :command:`ipcontroller` and
69 :command:`ipengine` directly.
71 :command:`ipengine` directly. This constraint can be relaxed if you are
72 using the :command:`ssh` method to start the cluster.
70 73
71 74 Underneath the hood, :command:`ipcluster` just uses :command:`ipcontroller`
72 75 and :command:`ipengine` to perform the steps described above.
73 76
74 77 Using :command:`ipcluster` in local mode
75 78 ----------------------------------------
76 79
77 80 To start one controller and 4 engines on localhost, just do::
78 81
79 82 $ ipcluster local -n 4
80 83
81 84 To see other command line options for the local mode, do::
82 85
83 86 $ ipcluster local -h
84 87
85 88 Using :command:`ipcluster` in mpirun mode
86 89 -----------------------------------------
87 90
88 91 The mpirun mode is useful if you:
89 92
90 93 1. Have MPI installed.
91 94 2. Your systems are configured to use the :command:`mpirun` command to start
92 95 processes.
93 96
94 97 If these are satisfied, you can start an IPython cluster using::
95 98
96 99 $ ipcluster mpirun -n 4
97 100
98 101 This does the following:
99 102
100 103 1. Starts the IPython controller on current host.
101 104 2. Uses :command:`mpirun` to start 4 engines.
102 105
103 106 On newer MPI implementations (such as OpenMPI), this will work even if you don't make any calls to MPI or call :func:`MPI_Init`. However, older MPI implementations actually require each process to call :func:`MPI_Init` upon starting. The easiest way of having this done is to install the mpi4py [mpi4py]_ package and then call ipcluster with the ``--mpi`` option::
104 107
105 108 $ ipcluster mpirun -n 4 --mpi=mpi4py
106 109
107 110 Unfortunately, even this won't work for some MPI implementations. If you are having problems with this, you will likely have to use a custom Python executable that itself calls :func:`MPI_Init` at the appropriate time. Fortunately, mpi4py comes with such a custom Python executable that is easy to install and use. However, this custom Python executable approach will not work with :command:`ipcluster` currently.
108 111
109 112 Additional command line options for this mode can be found by doing::
110 113
111 114 $ ipcluster mpirun -h
112 115
113 116 More details on using MPI with IPython can be found :ref:`here <parallelmpi>`.
114 117
115 118
116 119 Using :command:`ipcluster` in PBS mode
117 120 --------------------------------------
118 121
119 122 The PBS mode uses the Portable Batch System [PBS]_ to start the engines. To use this mode, you first need to create a PBS script template that will be used to start the engines. Here is a sample PBS script template:
120 123
121 124 .. sourcecode:: bash
122 125
123 126 #PBS -N ipython
124 127 #PBS -j oe
125 128 #PBS -l walltime=00:10:00
126 129 #PBS -l nodes=${n/4}:ppn=4
127 130 #PBS -q parallel
128 131
129 132 cd $$PBS_O_WORKDIR
130 133 export PATH=$$HOME/usr/local/bin
131 134 export PYTHONPATH=$$HOME/usr/local/lib/python2.4/site-packages
132 135 /usr/local/bin/mpiexec -n ${n} ipengine --logfile=$$PBS_O_WORKDIR/ipengine
133 136
134 137 There are a few important points about this template:
135 138
136 139 1. This template will be rendered at runtime using IPython's :mod:`Itpl`
137 140 template engine.
138 141
139 142 2. Instead of putting in the actual number of engines, use the notation
140 143 ``${n}`` to indicate the number of engines to be started. You can also uses
141 144 expressions like ``${n/4}`` in the template to indicate the number of
142 145 nodes.
143 146
144 147 3. Because ``$`` is a special character used by the template engine, you must
145 148 escape any ``$`` by using ``$$``. This is important when referring to
146 149 environment variables in the template.
147 150
148 151 4. Any options to :command:`ipengine` should be given in the batch script
149 152 template.
150 153
151 154 5. Depending on the configuration of you system, you may have to set
152 155 environment variables in the script template.
153 156
154 157 Once you have created such a script, save it with a name like :file:`pbs.template`. Now you are ready to start your job::
155 158
156 159 $ ipcluster pbs -n 128 --pbs-script=pbs.template
157 160
158 161 Additional command line options for this mode can be found by doing::
159 162
160 163 $ ipcluster pbs -h
161 164
165 Using :command:`ipcluster` in SSH mode
166 --------------------------------------
167
168 The SSH mode uses :command:`ssh` to execute :command:`ipengine` on remote
169 nodes and the :command:`ipcontroller` on localhost.
170
171 When using this mode it is highly recommended that you have set up SSH keys and are using ssh-agent [SSH]_ for password-less logins.
172
173 To use this mode you need a python file describing the cluster, here is an example of such a "clusterfile":
174
175 .. sourcecode:: python
176
177 send_furl = True
178 engines = { 'host1.example.com' : 2,
179 'host2.example.com' : 5,
180 'host3.example.com' : 1,
181 'host4.example.com' : 8 }
182
183 Since this is a regular python file usual python syntax applies. Things to note:
184
185 * The `engines` dict, where each key is the host we want to run engines on and
186 the value is the number of engines to run on that host.
187 * send_furl can either be `True` or `False`, if `True` it will copy over the
188 furl needed for :command:`ipengine` to each host.
189
190 The ``--clusterfile`` command line option lets you specify the file to use for
191 the cluster definition. Once you have your cluster file and can
192 :command:`ssh` into the remote hosts without a password you are ready to
193 start your cluster like so:
194
195 .. sourcecode:: bash
196
197 $ ipcluster ssh --clusterfile /path/to/my/clusterfile.py
198
199
200 Two helper shell scripts are used to start and stop :command:`ipengine` on remote hosts:
201
202 * sshx.sh
203 * engine_killer.sh
204
205 Defaults for both of these are contained in the source code for :command:`ipcluster`. The default scripts are written to a local file in a temp directory and then copied to a temp directory on the remote host and executed from there. On most Unix, Linux and OS X systems this is /tmp.
206
207 The default sshx.sh is the following:
208
209 .. sourcecode:: bash
210
211 #!/bin/sh
212 "$@" &> /dev/null &
213 echo $!
214
215 If you want to use a custom sshx.sh script you need to use the ``--sshx``
216 option and specify the file to use. Using a custom sshx.sh file could be
217 helpful when you need to setup the environment on the remote host before
218 executing :command:`ipengine`.
219
220 For a detailed options list:
221
222 .. sourcecode:: bash
223
224 $ ipcluster ssh -h
225
226 Current limitations of the SSH mode of :command:`ipcluster` are:
227
228 * Untested on Windows. Would require a working :command:`ssh` on Windows.
229 Also, we are using shell scripts to setup and execute commands on remote
230 hosts.
231 * :command:`ipcontroller` is started on localhost, with no option to start it
232 on a remote node.
233
162 234 Using the :command:`ipcontroller` and :command:`ipengine` commands
163 235 ==================================================================
164 236
165 237 It is also possible to use the :command:`ipcontroller` and :command:`ipengine` commands to start your controller and engines. This approach gives you full control over all aspects of the startup process.
166 238
167 239 Starting the controller and engine on your local machine
168 240 --------------------------------------------------------
169 241
170 242 To use :command:`ipcontroller` and :command:`ipengine` to start things on your
171 243 local machine, do the following.
172 244
173 245 First start the controller::
174 246
175 247 $ ipcontroller
176 248
177 249 Next, start however many instances of the engine you want using (repeatedly) the command::
178 250
179 251 $ ipengine
180 252
181 253 The engines should start and automatically connect to the controller using the FURL files in :file:`~./ipython/security`. You are now ready to use the controller and engines from IPython.
182 254
183 255 .. warning::
184 256
185 257 The order of the above operations is very important. You *must*
186 258 start the controller before the engines, since the engines connect
187 259 to the controller as they get started.
188 260
189 261 .. note::
190 262
191 263 On some platforms (OS X), to put the controller and engine into the
192 264 background you may need to give these commands in the form ``(ipcontroller
193 265 &)`` and ``(ipengine &)`` (with the parentheses) for them to work
194 266 properly.
195 267
196 268 Starting the controller and engines on different hosts
197 269 ------------------------------------------------------
198 270
199 271 When the controller and engines are running on different hosts, things are
200 272 slightly more complicated, but the underlying ideas are the same:
201 273
202 274 1. Start the controller on a host using :command:`ipcontroller`.
203 275 2. Copy :file:`ipcontroller-engine.furl` from :file:`~./ipython/security` on the controller's host to the host where the engines will run.
204 276 3. Use :command:`ipengine` on the engine's hosts to start the engines.
205 277
206 278 The only thing you have to be careful of is to tell :command:`ipengine` where the :file:`ipcontroller-engine.furl` file is located. There are two ways you can do this:
207 279
208 280 * Put :file:`ipcontroller-engine.furl` in the :file:`~./ipython/security`
209 281 directory on the engine's host, where it will be found automatically.
210 282 * Call :command:`ipengine` with the ``--furl-file=full_path_to_the_file``
211 283 flag.
212 284
213 285 The ``--furl-file`` flag works like this::
214 286
215 287 $ ipengine --furl-file=/path/to/my/ipcontroller-engine.furl
216 288
217 289 .. note::
218 290
219 291 If the controller's and engine's hosts all have a shared file system
220 292 (:file:`~./ipython/security` is the same on all of them), then things
221 293 will just work!
222 294
223 295 Make FURL files persistent
224 296 ---------------------------
225 297
226 298 At first glance it may seem that managing the FURL files is a bit annoying. Going back to the house and key analogy, copying the FURL around each time you start the controller is like having to make a new key every time you want to unlock the door and enter your house. As with your house, you want to be able to create the key (or FURL file) once, and then simply use it at any point in the future.
227 299
228 300 This is possible. The only thing you have to do is decide what ports the controller will listen on for the engines and clients. This is done as follows::
229 301
230 302 $ ipcontroller -r --client-port=10101 --engine-port=10102
231 303
232 304 Then, just copy the furl files over the first time and you are set. You can start and stop the controller and engines any many times as you want in the future, just make sure to tell the controller to use the *same* ports.
233 305
234 306 .. note::
235 307
236 308 You may ask the question: what ports does the controller listen on if you
237 309 don't tell it to use specific ones? The default is to use high random port
238 310 numbers. We do this for two reasons: i) to increase security through
239 311 obscurity and ii) to allow multiple controllers on a given host to start and
240 312 automatically use different ports.
241 313
242 314 Log files
243 315 ---------
244 316
245 317 All of the components of IPython have log files associated with them.
246 318 These log files can be extremely useful in debugging problems with
247 319 IPython and can be found in the directory :file:`~/.ipython/log`. Sending
248 320 the log files to us will often help us to debug any problems.
249 321
250 322
251 323 .. [PBS] Portable Batch System. http://www.openpbs.org/
324 .. [SSH] SSH-Agent http://en.wikipedia.org/wiki/Ssh-agent
General Comments 0
You need to be logged in to leave comments. Login now