adjustments to PBS/SGE, SSH Launchers + docs update
MinRK -
@@ -68,23 +68,23 @@ c = get_config()
68 68 # MPIExec launchers
69 69 #-----------------------------------------------------------------------------
70 70
71 # The mpiexec/mpirun command to use in started the controller.
72 # c.MPIExecControllerLauncher.mpi_cmd = ['mpiexec']
71 # The mpiexec/mpirun command to use in both the controller and engines.
72 # c.MPIExecLauncher.mpi_cmd = ['mpiexec']
73 73
74 74 # Additional arguments to pass to the actual mpiexec command.
75 # c.MPIExecLauncher.mpi_args = []
76
77 # The mpiexec/mpirun command and args can be overridden if they should be different
78 # for controller and engines.
79 # c.MPIExecControllerLauncher.mpi_cmd = ['mpiexec']
75 80 # c.MPIExecControllerLauncher.mpi_args = []
81 # c.MPIExecEngineSetLauncher.mpi_cmd = ['mpiexec']
82 # c.MPIExecEngineSetLauncher.mpi_args = []
76 83
77 84 # The command line argument to call the controller with.
78 85 # c.MPIExecControllerLauncher.controller_args = \
79 86 # ['--log-to-file','--log-level', '40']
80 87
81
82 # The mpiexec/mpirun command to use in started the controller.
83 # c.MPIExecEngineSetLauncher.mpi_cmd = ['mpiexec']
84
85 # Additional arguments to pass to the actual mpiexec command.
86 # c.MPIExecEngineSetLauncher.mpi_args = []
87
88 88 # Command line argument passed to the engines.
89 89 # c.MPIExecEngineSetLauncher.engine_args = ['--log-to-file','--log-level', '40']
90 90
@@ -95,28 +95,62 @@ c = get_config()
95 95 # SSH launchers
96 96 #-----------------------------------------------------------------------------
97 97
98 # Todo
98 # ipclusterz can be used to launch controller and engines remotely via ssh.
99 # Note that currently ipclusterz does not do any file distribution, so if
100 # machines are not on a shared filesystem, config and json files must be
101 # distributed. For this reason, the reuse_files defaults to True on an
102 # ssh-launched Controller. This flag can be overridden by the program_args
103 # attribute of c.SSHControllerLauncher.
104
105 # set the ssh cmd for launching remote commands. The default is ['ssh']
106 # c.SSHLauncher.ssh_cmd = ['ssh']
107
108 # set the args to pass to the ssh command. The default is ['-tt']
109 # c.SSHLauncher.ssh_args = ['-tt']
110
111 # Set the user and hostname for the controller
112 # c.SSHControllerLauncher.hostname = 'controller.example.com'
113 # c.SSHControllerLauncher.user = os.environ.get('USER','username')
99 114
115 # Set the arguments to be passed to ipcontrollerz
116 # note that remotely launched ipcontrollerz will not get the contents of
117 # the local ipcontrollerz_config.py unless it resides on the *remote host*
118 # in the location specified by the --cluster_dir argument.
119 # c.SSHControllerLauncher.program_args = ['-r', '-ip', '0.0.0.0', '--cluster_dir', '/path/to/cd']
120
121 # Set the default args passed to ipenginez for SSH launched engines
122 # c.SSHEngineSetLauncher.engine_args = ['--mpi', 'mpi4py']
123
124 # SSH engines are launched as a dict of locations/n-engines.
125 # if a value is a tuple instead of an int, it is assumed to be of the form
126 # (n, [args]), setting the arguments passed to ipenginez on `host`.
127 # otherwise, c.SSHEngineSetLauncher.engine_args will be used as the default.
128
129 # In this case, there will be 3 engines at my.example.com, and
130 # 2 at you@ipython.scipy.org with a special json connector location.
131 # c.SSHEngineSetLauncher.engines = {'my.example.com' : 3,
132 # 'you@ipython.scipy.org' : (2, ['-f', '/path/to/ipcontroller-engine.json'])
133 # }
100 134
101 135 #-----------------------------------------------------------------------------
102 136 # Unix batch (PBS) schedulers launchers
103 137 #-----------------------------------------------------------------------------
104 138
105 139 # The command line program to use to submit a PBS job.
106 # c.PBSControllerLauncher.submit_command = 'qsub'
140 # c.PBSControllerLauncher.submit_command = ['qsub']
107 141
108 142 # The command line program to use to delete a PBS job.
109 # c.PBSControllerLauncher.delete_command = 'qdel'
143 # c.PBSControllerLauncher.delete_command = ['qdel']
110 144
111 145 # A regular expression that takes the output of qsub and find the job id.
112 146 # c.PBSControllerLauncher.job_id_regexp = r'\d+'
113 147
114 148 # The batch submission script used to start the controller. This is where
115 # environment variables would be setup, etc. This string is interpolated using
149 # environment variables would be setup, etc. This string is interpreted using
116 150 # the Itpl module in IPython.external. Basically, you can use ${n} for the
117 151 # number of engines and ${cluster_dir} for the cluster_dir.
118 152 # c.PBSControllerLauncher.batch_template = """
119 # #PBS -l nprocs=$n
153 # #PBS -N ipcontroller
120 154 #
121 155 # ipcontrollerz --cluster-dir $cluster_dir
122 156 # """
@@ -136,10 +170,11 @@ c = get_config()
136 170 # c.PBSEngineSetLauncher.job_id_regexp = r'\d+'
137 171
138 172 # The batch submission script used to start the engines. This is where
139 # environment variables would be setup, etc. This string is interpolated using
173 # environment variables would be setup, etc. This string is interpreted using
140 174 # the Itpl module in IPython.external. Basically, you can use ${n} for the
141 175 # number of engines and ${cluster_dir} for the cluster_dir.
142 176 # c.PBSEngineSetLauncher.batch_template = """
177 # #PBS -N ipengine
143 178 # #PBS -l nprocs=$n
144 179 #
145 180 # ipenginez --cluster-dir $cluster_dir
@@ -10,9 +10,9 @@
10 10 # Imports
11 11 #-----------------------------------------------------------------------------
12 12
13 from .asyncresult import *
14 from .client import Client
15 from .dependency import *
16 from .remotefunction import *
17 from .view import *
13 # from .asyncresult import *
14 # from .client import Client
15 # from .dependency import *
16 # from .remotefunction import *
17 # from .view import *
18 18
@@ -15,10 +15,10 @@ Facilities for launching IPython processes asynchronously.
15 15 # Imports
16 16 #-----------------------------------------------------------------------------
17 17
18 import copy
18 19 import logging
19 20 import os
20 21 import re
21 import sys
22 22
23 23 from signal import SIGINT, SIGTERM
24 24 try:
@@ -30,12 +30,9 @@ from subprocess import Popen, PIPE, STDOUT
30 30 try:
31 31 from subprocess import check_output
32 32 except ImportError:
33 # pre-2.7:
34 from StringIO import StringIO
35
33 # pre-2.7, define check_output with Popen
36 34 def check_output(*args, **kwargs):
37 sio = StringIO()
38 kwargs.update(dict(stdout=PIPE, stderr=STDOUT))
35 kwargs.update(dict(stdout=PIPE))
39 36 p = Popen(*args, **kwargs)
40 37 out,err = p.communicate()
41 38 return out
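As a quick illustration, the shim above is invoked the same way the stdlib
function would be; a minimal usage sketch, assuming a qsub-style submit
command (the command and filename are illustrative):

.. sourcecode:: python

    import os

    # capture the submit command's stdout as a string, just as
    # subprocess.check_output does on Python 2.7+
    output = check_output(['qsub', 'mybatch.sh'], env=os.environ)
    print output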
@@ -44,7 +41,7 @@ from zmq.eventloop import ioloop
44 41
45 42 from IPython.external import Itpl
46 43 # from IPython.config.configurable import Configurable
47 from IPython.utils.traitlets import Str, Int, List, Unicode, Dict, Instance
44 from IPython.utils.traitlets import Any, Str, Int, List, Unicode, Dict, Instance
48 45 from IPython.utils.path import get_ipython_module_path
49 46 from IPython.utils.process import find_cmd, pycmd2argv, FindCmdError
50 47
@@ -106,6 +103,10 @@ class BaseLauncher(LoggingFactory):
106 103 # the --work-dir option.
107 104 work_dir = Unicode(u'.')
108 105 loop = Instance('zmq.eventloop.ioloop.IOLoop')
106
107 start_data = Any()
108 stop_data = Any()
109
109 110 def _loop_default(self):
110 111 return ioloop.IOLoop.instance()
111 112
@@ -346,11 +347,13 @@ class LocalEngineSetLauncher(BaseLauncher):
346 347 # launcher class
347 348 launcher_class = LocalEngineLauncher
348 349
350 launchers = Dict()
351 stop_data = Dict()
352
349 353 def __init__(self, work_dir=u'.', config=None, **kwargs):
350 354 super(LocalEngineSetLauncher, self).__init__(
351 355 work_dir=work_dir, config=config, **kwargs
352 356 )
353 self.launchers = {}
354 357 self.stop_data = {}
355 358
356 359 def start(self, n, cluster_dir):
@@ -360,7 +363,6 @@ class LocalEngineSetLauncher(BaseLauncher):
360 363 for i in range(n):
361 364 el = self.launcher_class(work_dir=self.work_dir, config=self.config, logname=self.log.name)
362 365 # Copy the engine args over to each engine launcher.
363 import copy
364 366 el.engine_args = copy.deepcopy(self.engine_args)
365 367 el.on_stop(self._notice_engine_stopped)
366 368 d = el.start(cluster_dir)
@@ -397,7 +399,6 @@ class LocalEngineSetLauncher(BaseLauncher):
397 399 return self.interrupt_then_kill()
398 400
399 401 def _notice_engine_stopped(self, data):
400 print "notice", data
401 402 pid = data['pid']
402 403 for idx,el in self.launchers.iteritems():
403 404 if el.process.pid == pid:
@@ -429,7 +430,7 @@ class MPIExecLauncher(LocalProcessLauncher):
429 430
430 431 def find_args(self):
431 432 """Build self.args using all the fields."""
432 return self.mpi_cmd + ['-n', self.n] + self.mpi_args + \
433 return self.mpi_cmd + ['-n', str(self.n)] + self.mpi_args + \
433 434 self.program + self.program_args
434 435
435 436 def start(self, n):
@@ -460,26 +461,21 @@ class MPIExecControllerLauncher(MPIExecLauncher):
460 461
461 462 class MPIExecEngineSetLauncher(MPIExecLauncher):
462 463
463 engine_cmd = List(ipengine_cmd_argv, config=True)
464 program = List(ipengine_cmd_argv, config=True)
464 465 # Command line arguments for ipengine.
465 engine_args = List(
466 program_args = List(
466 467 ['--log-to-file','--log-level', str(logging.INFO)], config=True
467 468 )
468 469 n = Int(1, config=True)
469 470
470 471 def start(self, n, cluster_dir):
471 472 """Start n engines by profile or cluster_dir."""
472 self.engine_args.extend(['--cluster-dir', cluster_dir])
473 self.program_args.extend(['--cluster-dir', cluster_dir])
473 474 self.cluster_dir = unicode(cluster_dir)
474 475 self.n = n
475 476 self.log.info('Starting MPIExecEngineSetLauncher: %r' % self.args)
476 477 return super(MPIExecEngineSetLauncher, self).start(n)
477 478
478 def find_args(self):
479 return self.mpi_cmd + ['-n', self.n] + self.mpi_args + \
480 self.engine_cmd + self.engine_args
481
482
483 479 #-----------------------------------------------------------------------------
484 480 # SSH launchers
485 481 #-----------------------------------------------------------------------------
@@ -499,11 +495,14 @@ class SSHLauncher(LocalProcessLauncher):
499 495 program = List(['date'], config=True)
500 496 program_args = List([], config=True)
501 497 hostname = Str('', config=True)
502 user = Str(os.environ.get('USER','username'), config=True)
498 user = Str('', config=True)
503 499 location = Str('')
504 500
505 501 def _hostname_changed(self, name, old, new):
506 self.location = '%s@%s' % (self.user, new)
502 if self.user:
503 self.location = '%s@%s' % (self.user, new)
504 else:
505 self.location = new
507 506
508 507 def _user_changed(self, name, old, new):
509 508 self.location = '%s@%s' % (new, self.hostname)
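With `user` now optional, the derived `location` depends on whether a user has
been configured; a small sketch of the new behavior (hostname and user are
illustrative):

.. sourcecode:: python

    l = SSHLauncher()
    l.hostname = 'node1.example.com'  # location -> 'node1.example.com'
    l.user = 'me'                     # location -> 'me@node1.example.com'
    print l.location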
@@ -513,12 +512,12 @@ class SSHLauncher(LocalProcessLauncher):
513 512 self.program + self.program_args
514 513
515 514 def start(self, cluster_dir, hostname=None, user=None):
516 print self.config
515 self.cluster_dir = unicode(cluster_dir)
517 516 if hostname is not None:
518 517 self.hostname = hostname
519 518 if user is not None:
520 519 self.user = user
521 print (self.location, hostname, user)
520
522 521 return super(SSHLauncher, self).start()
523 522
524 523 def signal(self, sig):
@@ -533,7 +532,7 @@ class SSHControllerLauncher(SSHLauncher):
533 532
534 533 program = List(ipcontroller_cmd_argv, config=True)
535 534 # Command line arguments to ipcontroller.
536 program_args = List(['--log-to-file','--log-level', str(logging.INFO)], config=True)
535 program_args = List(['-r', '--log-to-file','--log-level', str(logging.INFO)], config=True)
537 536
538 537
539 538 class SSHEngineLauncher(SSHLauncher):
@@ -545,6 +544,40 @@ class SSHEngineLauncher(SSHLauncher):
545 544
546 545 class SSHEngineSetLauncher(LocalEngineSetLauncher):
547 546 launcher_class = SSHEngineLauncher
547 engines = Dict(config=True)
548
549 def start(self, n, cluster_dir):
550 """Start engines by profile or cluster_dir.
551 `n` is ignored, and the `engines` config property is used instead.
552 """
553
554 self.cluster_dir = unicode(cluster_dir)
555 dlist = []
556 for host, n in self.engines.iteritems():
557 if isinstance(n, (tuple, list)):
558 n, args = n
559 else:
560 args = copy.deepcopy(self.engine_args)
561
562 if '@' in host:
563 user,host = host.split('@',1)
564 else:
565 user=None
566 for i in range(n):
567 el = self.launcher_class(work_dir=self.work_dir, config=self.config, logname=self.log.name)
568
569 # Copy the engine args over to each engine launcher.
571 el.program_args = args
572 el.on_stop(self._notice_engine_stopped)
573 d = el.start(cluster_dir, user=user, hostname=host)
574 if i==0:
575 self.log.info("Starting SSHEngineSetLauncher: %r" % el.args)
576 self.launchers[host+str(i)] = el
577 dlist.append(d)
578 self.notify_start(dlist)
579 return dlist
580
548 581
549 582
550 583 #-----------------------------------------------------------------------------
@@ -619,7 +652,7 @@ class WindowsHPCLauncher(BaseLauncher):
619 652 stderr=STDOUT
620 653 )
621 654 job_id = self.parse_job_id(output)
622 # self.notify_start(job_id)
655 self.notify_start(job_id)
623 656 return job_id
624 657
625 658 def stop(self):
@@ -637,7 +670,7 @@ class WindowsHPCLauncher(BaseLauncher):
637 670 )
638 671 except:
639 672 output = 'The job already appears to be stopped: %r' % self.job_id
640 self.notify_stop(output) # Pass the output of the kill cmd
673 self.notify_stop(dict(job_id=self.job_id, output=output)) # Pass the output of the kill cmd
641 674 return output
642 675
643 676
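Since `notify_stop` now receives a dict rather than a bare output string,
`on_stop` callbacks should expect the new shape; a hypothetical callback using
the keys assembled above:

.. sourcecode:: python

    def job_stopped(data):
        # 'job_id' and 'output' are the keys built in stop() above
        print "job %s stopped: %r" % (data['job_id'], data['output'])

    # launcher.on_stop(job_stopped)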
@@ -708,8 +741,6 @@ class WindowsHPCEngineSetLauncher(WindowsHPCLauncher):
708 741 # Batch (PBS) system launchers
709 742 #-----------------------------------------------------------------------------
710 743
711 # TODO: Get PBS launcher working again.
712
713 744 class BatchSystemLauncher(BaseLauncher):
714 745 """Launch an external process using a batch system.
715 746
@@ -743,7 +774,7 @@ class BatchSystemLauncher(BaseLauncher):
743 774
744 775
745 776 def find_args(self):
746 return [self.submit_command]
777 return self.submit_command + [self.batch_file]
747 778
748 779 def __init__(self, work_dir=u'.', config=None, **kwargs):
749 780 super(BatchSystemLauncher, self).__init__(
@@ -753,13 +784,13 @@ class BatchSystemLauncher(BaseLauncher):
753 784
754 785 def parse_job_id(self, output):
755 786 """Take the output of the submit command and return the job id."""
756 m = re.match(self.job_id_regexp, output)
787 m = re.search(self.job_id_regexp, output)
757 788 if m is not None:
758 789 job_id = m.group()
759 790 else:
760 791 raise LauncherError("Job id couldn't be determined: %s" % output)
761 792 self.job_id = job_id
762 self.log.info('Job started with job id: %r' % job_id)
793 self.log.info('Job submitted with job id: %r' % job_id)
763 794 return job_id
764 795
765 796 def write_batch_script(self, n):
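The switch from `re.match` to `re.search` matters because many schedulers
decorate the job id in their output; a sketch with illustrative outputs
(actual formats vary by batch system and site):

.. sourcecode:: python

    import re

    job_id_regexp = r'\d+'
    for output in ['12345.master.example.com',               # PBS/Torque-style
                   'Your job 12345 ("job") was submitted']:  # SGE-style
        # re.search finds '12345' in both; re.match would return None for
        # the SGE-style line, since it only matches at the start of the string
        print re.search(job_id_regexp, output).group()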
@@ -779,14 +810,15 @@ class BatchSystemLauncher(BaseLauncher):
779 810 self.context['cluster_dir'] = cluster_dir
780 811 self.cluster_dir = unicode(cluster_dir)
781 812 self.write_batch_script(n)
782 output = check_output([self.submit_command, self.batch_file], env=os.environ, stdout=STDOUT)
813 output = check_output(self.args, env=os.environ)
814
783 815 job_id = self.parse_job_id(output)
784 # self.notify_start(job_id)
816 self.notify_start(job_id)
785 817 return job_id
786 818
787 819 def stop(self):
788 output = check_output([self.delete_command, self.job_id], env=os.environ, stderr=STDOUT)
789 self.notify_stop(output) # Pass the output of the kill cmd
820 output = check_output(self.delete_command + [self.job_id], env=os.environ)
821 self.notify_stop(dict(job_id=self.job_id, output=output)) # Pass the output of the kill cmd
790 822 return output
791 823
792 824
@@ -59,15 +59,11 @@ controller and engines in the following situations:
59 59 for testing or running on a multicore computer.
60 60 2. When engines are started using the :command:`mpirun` command that comes
61 61 with most MPI [MPI]_ implementations
62 3. When engines are started using the PBS [PBS]_ batch system.
62 3. When engines are started using the PBS [PBS]_ batch system
63 (or other `qsub` systems, such as SGE).
63 64 4. When the controller is started on localhost and the engines are started on
64 65 remote nodes using :command:`ssh`.
65
66 .. note::
67
68 It is also possible for advanced users to add support to
69 :command:`ipclusterz` for starting controllers and engines using other
70 methods (like Sun's Grid Engine for example).
66 5. When engines are started using the Windows HPC Server batch system.
71 67
72 68 .. note::
73 69
@@ -75,16 +71,14 @@ controller and engines in the following situations:
75 71 :file:`~/.ipython/cluster_<profile>/security` directory live on a shared filesystem that is
76 72 seen by both the controller and engines. If you don't have a shared file
77 73 system you will need to use :command:`ipcontrollerz` and
78 :command:`ipenginez` directly. This constraint can be relaxed if you are
79 using the :command:`ssh` method to start the cluster.
74 :command:`ipenginez` directly.
80 75
81 76 Under the hood, :command:`ipclusterz` just uses :command:`ipcontrollerz`
82 77 and :command:`ipenginez` to perform the steps described above.
83 78
84 Using :command:`ipclusterz` in local mode
85 ----------------------------------------
86
87 To start one controller and 4 engines on localhost, just do::
79 The simplest way to use ipclusterz requires no configuration, and will
80 launch a controller and a number of engines on the local machine. For instance,
81 to start one controller and 4 engines on localhost, just do::
88 82
89 83 $ ipclusterz start -n 4
90 84
@@ -92,17 +86,42 @@ To see other command line options for the local mode, do::
92 86
93 87 $ ipclusterz -h
94 88
95 .. note::
96 89
97 The remainder of this section refers to the 0.10 clusterfile model, no longer in use.
98 skip to
90 Configuring an IPython cluster
91 ==============================
99 92
100 Using :command:`ipclusterz` in mpiexec/mpirun mode
101 -------------------------------------------------
93 Cluster configurations are stored as `profiles`. You can create a new profile with::
94
95 $ ipclusterz create -p myprofile
96
97 This will create the directory :file:`IPYTHONDIR/clusterz_myprofile`, and populate it
98 with the default configuration files for the three IPython cluster commands. Once
99 you edit those files, you can continue to call ipclusterz/ipcontrollerz/ipenginez
100 with no arguments beyond ``-p myprofile``, and any configuration will be maintained.
101
102 There is no limit to the number of profiles you can have, so you can maintain a profile for each
103 of your common use cases. The default profile will be used whenever the
104 profile argument is not specified, so edit :file:`IPYTHONDIR/clusterz_default/*_config.py` to
105 represent your most common use case.
106
107 The configuration files are loaded with commented-out settings and explanations,
108 which should cover most of the available possibilities.
109
110 Using various batch systems with :command:`ipclusterz`
111 ------------------------------------------------------
112
113 :command:`ipclusterz` has a notion of Launchers that can start controllers
114 and engines with various remote execution schemes. Currently supported
115 models include `mpiexec`, PBS-style (Torque, SGE), and Windows HPC Server.
102 116
103 117 .. note::
104 118
105 This section is out of date for IPython 0.11
119 The Launchers and configuration are designed in such a way that advanced
120 users can subclass and configure them to fit their own systems that we
121 do not yet support (such as Condor).
122
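As a hedged sketch of what such a subclass might look like for Condor (the
command names and regexp are assumptions, not a tested implementation, and a
batch template would still be required):

.. sourcecode:: python

    from IPython.utils.traitlets import List, Str
    from IPython.zmq.parallel.launcher import BatchSystemLauncher

    class CondorControllerLauncher(BatchSystemLauncher):
        """Hypothetical Condor launcher sketch."""
        submit_command = List(['condor_submit'], config=True)
        delete_command = List(['condor_rm'], config=True)
        # adjust to whatever condor_submit actually prints
        job_id_regexp = Str(r'\d+', config=True)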
123 Using :command:`ipclusterz` in mpiexec/mpirun mode
124 --------------------------------------------------
106 125
107 126
108 127 The mpiexec/mpirun mode is useful if you:
@@ -111,29 +130,47 @@ The mpiexec/mpirun mode is useful if you:
111 130 2. Your systems are configured to use the :command:`mpiexec` or
112 131 :command:`mpirun` commands to start MPI processes.
113 132
114 .. note::
133 If these are satisfied, you can create a new profile::
134
135 $ ipclusterz create -p mpi
136
137 and edit the file :file:`IPYTHONDIR/clusterz_mpi/ipclusterz_config.py`.
115 138
116 The preferred command to use is :command:`mpiexec`. However, we also
117 support :command:`mpirun` for backwards compatibility. The underlying
118 logic used is exactly the same, the only difference being the name of the
119 command line program that is called.
139 There, instruct ipclusterz to use the MPIExec launchers by adding the line:
120 140
121 If these are satisfied, you can start an IPython cluster using::
141 .. sourcecode:: python
142
143 c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.MPIExecEngineSetLauncher'
144
145 If the default MPI configuration is correct, then you can now start your cluster, with::
122 146
123 $ ipclusterz mpiexec -n 4
147 $ ipclusterz start -n 4 -p mpi
124 148
125 149 This does the following:
126 150
127 151 1. Starts the IPython controller on current host.
128 152 2. Uses :command:`mpiexec` to start 4 engines.
129 153
154 If you have a reason to also start the Controller with mpi, you can specify:
155
156 .. sourcecode:: python
157
158 c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.MPIExecControllerLauncher'
159
160 .. note::
161
162 The Controller *will not* be in the same MPI universe as the engines, so there is not
163 much reason to do this unless sysadmins demand it.
164
130 165 On newer MPI implementations (such as OpenMPI), this will work even if you
131 166 don't make any calls to MPI or call :func:`MPI_Init`. However, older MPI
132 167 implementations actually require each process to call :func:`MPI_Init` upon
133 168 starting. The easiest way of having this done is to install the mpi4py
134 [mpi4py]_ package and then call ipclusterz with the ``--mpi`` option::
169 [mpi4py]_ package and then specify the ``c.MPI.use`` option in :file:`ipenginez_config.py`:
170
171 .. sourcecode:: python
135 172
136 $ ipclusterz mpiexec -n 4 --mpi=mpi4py
173 c.MPI.use = 'mpi4py'
137 174
138 175 Unfortunately, even this won't work for some MPI implementations. If you are
139 176 having problems with this, you will likely have to use a custom Python
@@ -142,23 +179,27 @@ Fortunately, mpi4py comes with such a custom Python executable that is easy to
142 179 install and use. However, this custom Python executable approach will not work
143 180 with :command:`ipclusterz` currently.
144 181
145 Additional command line options for this mode can be found by doing::
146
147 $ ipclusterz mpiexec -h
148
149 182 More details on using MPI with IPython can be found :ref:`here <parallelmpi>`.
150 183
151 184
152 185 Using :command:`ipclusterz` in PBS mode
153 --------------------------------------
186 ---------------------------------------
154 187
155 .. note::
188 The PBS mode uses the Portable Batch System [PBS]_ to start the engines.
189
190 As usual, we will start by creating a fresh profile::
191
192 $ ipclusterz create -p pbs
193
194 And in :file:`ipclusterz_config.py`, we will select the PBS launchers for the controller
195 and engines:
156 196
157 This section is out of date for IPython 0.11
197 .. sourcecode:: python
158 198
199 c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.PBSControllerLauncher'
200 c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.PBSEngineSetLauncher'
159 201
160 The PBS mode uses the Portable Batch System [PBS]_ to start the engines. To
161 use this mode, you first need to create a PBS script template that will be
202 To use this mode, you first need to create a PBS script template that will be
162 203 used to start the engines. Here is a sample PBS script template:
163 204
164 205 .. sourcecode:: bash
@@ -171,8 +212,8 @@ used to start the engines. Here is a sample PBS script template:
171 212
172 213 cd $$PBS_O_WORKDIR
173 214 export PATH=$$HOME/usr/local/bin
174 export PYTHONPATH=$$HOME/usr/local/lib/python2.4/site-packages
175 /usr/local/bin/mpiexec -n ${n} ipengine --logfile=$$PBS_O_WORKDIR/ipengine
215 export PYTHONPATH=$$HOME/usr/local/lib/python2.7/site-packages
216 /usr/local/bin/mpiexec -n ${n} ipenginez --cluster_dir=${cluster_dir}
176 217
177 218 There are a few important points about this template:
178 219
@@ -182,107 +223,147 @@ There are a few important points about this template:
182 223 2. Instead of putting in the actual number of engines, use the notation
183 224 ``${n}`` to indicate the number of engines to be started. You can also use
184 225 expressions like ``${n/4}`` in the template to indicate the number of
185 nodes.
226 nodes. A ${n} and ${cluster_dir} variable will always be passed to the template,
227 so the batch system knows how many engines to start and where the configuration
228 files reside (see the interpolation sketch after this list).
186 229
187 230 3. Because ``$`` is a special character used by the template engine, you must
188 231 escape any ``$`` by using ``$$``. This is important when referring to
189 232 environment variables in the template.
190 233
191 4. Any options to :command:`ipenginez` should be given in the batch script
192 template.
234 4. Any options to :command:`ipenginez` can be given in the batch script
235 template, or in :file:`ipenginez_config.py`.
193 236
194 237 5. Depending on the configuration of your system, you may have to set
195 238 environment variables in the script template.
196 239
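To make the interpolation concrete, here is a rough sketch of what the
template engine does with these variables (calling `Itpl` directly is for
illustration only and assumes the module-level `itpl` helper;
:command:`ipclusterz` performs this step internally):

.. sourcecode:: python

    from IPython.external import Itpl

    n = 128
    cluster_dir = '/path/to/clusterz_pbs'
    print Itpl.itpl("mpiexec -n ${n} ipenginez --cluster_dir=${cluster_dir}")
    # -> mpiexec -n 128 ipenginez --cluster_dir=/path/to/clusterz_pbs
    print Itpl.itpl("nodes: ${n/4}")  # expressions work too -> 'nodes: 32'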
197 Once you have created such a script, save it with a name like
198 :file:`pbs.template`. Now you are ready to start your job::
240 The controller template should be similar, but simpler:
241
242 .. sourcecode:: bash
243
244 #PBS -N ipython
245 #PBS -j oe
246 #PBS -l walltime=00:10:00
247 #PBS -l nodes=1:ppn=4
248 #PBS -q parallel
199 249
200 $ ipclusterz pbs -n 128 --pbs-script=pbs.template
250 cd $$PBS_O_WORKDIR
251 export PATH=$$HOME/usr/local/bin
252 export PYTHONPATH=$$HOME/usr/local/lib/python2.7/site-packages
253 ipcontrollerz --cluster_dir=${cluster_dir}
201 254
202 Additional command line options for this mode can be found by doing::
203 255
204 $ ipclusterz pbs -h
256 Once you have created these scripts, save them with names like
257 :file:`pbs.engine.template`. Now you can load them into the :file:`ipclusterz_config` with:
205 258
206 Using :command:`ipclusterz` in SSH mode
207 --------------------------------------
259 .. sourcecode:: python
260
261 with open("pbs.engine.template") as f:
262 c.PBSEngineSetLauncher.batch_template = f.read()
263
264 with open("pbs.controller.template") as f:
265 c.PBSControllerLauncher.batch_template = f.read()
266
267
268 Alternately, you can just define the templates as strings inside :file:`ipclusterz_config`.
269
270 Note that assuming you are running PBS on a multi-node cluster, the Controller's default behavior
271 of listening only on localhost is likely too restrictive. In this case, also assuming the
272 nodes are safely behind a firewall, you can simply instruct the Controller to listen for
273 connections on all its interfaces, by adding in :file:`ipcontrollerz_config`:
274
275 .. sourcecode:: python
276
277 c.HubFactory.client_ip = '*'
278 c.HubFactory.engine_ip = '*'
279
280 You can now run the cluster with::
281
282 $ ipclusterz start -p pbs -n 128
283
284 Additional configuration options can be found in the PBS section of :file:`ipclusterz_config`.
208 285
209 286 .. note::
210 287
211 This section is out of date for IPython 0.11
288 Due to the flexibility of configuration, the PBS launchers work with simple changes
289 to the template for other :command:`qsub`-using systems, such as Sun Grid Engine,
290 and with further configuration in similar batch systems like Condor.
291
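For instance, a hedged sketch of an SGE-flavored engine template, defined as a
string in :file:`ipclusterz_config.py` (the ``#$$`` lines render as SGE's
``#$`` directives after ``$``-escaping; the parallel-environment name is an
assumption for your site, and the template lines should be flush-left in the
real file). SGE provides the same :command:`qsub` and :command:`qdel`
commands, so the submit and delete settings can stay at their defaults:

.. sourcecode:: python

    c.PBSEngineSetLauncher.batch_template = """#$$ -N ipengine
    #$$ -pe mpi ${n}
    #$$ -cwd
    ipenginez --cluster_dir=${cluster_dir}
    """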
292
293 Using :command:`ipclusterz` in SSH mode
294 ---------------------------------------
212 295
213 296
214 297 The SSH mode uses :command:`ssh` to execute :command:`ipenginez` on remote
215 nodes and the :command:`ipcontrollerz` on localhost.
298 nodes; :command:`ipcontrollerz` can be run remotely as well, or on localhost.
216 299
217 When using using this mode it highly recommended that you have set up SSH keys
218 and are using ssh-agent [SSH]_ for password-less logins.
300 .. note::
219 301
220 To use this mode you need a python file describing the cluster, here is an
221 example of such a "clusterfile":
302 When using this mode it is highly recommended that you have set up SSH keys
303 and are using ssh-agent [SSH]_ for password-less logins.
222 304
223 .. sourcecode:: python
224
225 send_furl = True
226 engines = { 'host1.example.com' : 2,
227 'host2.example.com' : 5,
228 'host3.example.com' : 1,
229 'host4.example.com' : 8 }
305 As usual, we start by creating a clean profile::
230 306
231 Since this is a regular python file usual python syntax applies. Things to
232 note:
307 $ ipclusterz create -p ssh
233 308
234 * The `engines` dict, where the keys is the host we want to run engines on and
235 the value is the number of engines to run on that host.
236 * send_furl can either be `True` or `False`, if `True` it will copy over the
237 furl needed for :command:`ipenginez` to each host.
309 To use this mode, select the SSH launchers in :file:`ipclusterz_config.py`:
238 310
239 The ``--clusterfile`` command line option lets you specify the file to use for
240 the cluster definition. Once you have your cluster file and you can
241 :command:`ssh` into the remote hosts with out an password you are ready to
242 start your cluster like so:
311 .. sourcecode:: python
243 312
244 .. sourcecode:: bash
313 c.Global.engine_launcher = 'IPython.zmq.parallel.launcher.SSHEngineSetLauncher'
314 # and if the Controller is also to be remote:
315 c.Global.controller_launcher = 'IPython.zmq.parallel.launcher.SSHControllerLauncher'
316
245 317
246 $ ipclusterz ssh --clusterfile /path/to/my/clusterfile.py
318 The controller's remote location and configuration can be specified:
247 319
320 .. sourcecode:: python
248 321
249 Two helper shell scripts are used to start and stop :command:`ipenginez` on
250 remote hosts:
322 # Set the user and hostname for the controller
323 # c.SSHControllerLauncher.hostname = 'controller.example.com'
324 # c.SSHControllerLauncher.user = os.environ.get('USER','username')
251 325
252 * sshx.sh
253 * engine_killer.sh
326 # Set the arguments to be passed to ipcontrollerz
327 # note that remotely launched ipcontrollerz will not get the contents of
328 # the local ipcontrollerz_config.py unless it resides on the *remote host*
329 # in the location specified by the --cluster_dir argument.
330 # c.SSHControllerLauncher.program_args = ['-r', '-ip', '0.0.0.0', '--cluster_dir', '/path/to/cd']
254 331
255 Defaults for both of these are contained in the source code for
256 :command:`ipclusterz`. The default scripts are written to a local file in a
257 tmep directory and then copied to a temp directory on the remote host and
258 executed from there. On most Unix, Linux and OS X systems this is /tmp.
332 .. note::
259 333
260 The default sshx.sh is the following:
334 SSH mode does not do any file movement, so you will need to distribute configuration
335 files manually. To aid in this, the `reuse_files` flag defaults to True for ssh-launched
336 Controllers, so you will only need to do this once, unless you override this flag back
337 to False.
261 338
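If you do want to turn reuse back off, one hedged way (based on the launcher
defaults in this commit) is to supply your own `program_args` without the
`-r` flag:

.. sourcecode:: python

    # omitting '-r' disables connection-file reuse for the ssh-launched
    # controller; the logging arguments shown are illustrative
    c.SSHControllerLauncher.program_args = ['--log-to-file', '--log-level', '20']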
262 .. sourcecode:: bash
339 Engines are specified in a dictionary, by hostname and the number of engines to be run
340 on that host.
263 341
264 #!/bin/sh
265 "$@" &> /dev/null &
266 echo $!
342 .. sourcecode:: python
343
344 c.SSHEngineSetLauncher.engines = { 'host1.example.com' : 2,
345 'host2.example.com' : 5,
346 'host3.example.com' : (1, ['--cluster_dir', '/home/different/location']),
347 'host4.example.com' : 8 }
267 348
268 If you want to use a custom sshx.sh script you need to use the ``--sshx``
269 option and specify the file to use. Using a custom sshx.sh file could be
270 helpful when you need to setup the environment on the remote host before
271 executing :command:`ipenginez`.
349 * The `engines` dict, where the keys are the hosts we want to run engines on and
350 the values are the number of engines to run on each host.
351 * On host3, the value is a tuple, where the number of engines comes first, and the
352 arguments to be passed to :command:`ipenginez` are the second element.
272 353
273 For a detailed options list:
354 For engines without explicitly specified arguments, the default arguments are set in
355 a single location:
274 356
275 .. sourcecode:: bash
357 .. sourcecode:: python
276 358
277 $ ipclusterz ssh -h
359 c.SSHEngineSetLauncher.engine_args = ['--cluster_dir', '/path/to/clusterz_ssh']
278 360
279 361 Current limitations of the SSH mode of :command:`ipclusterz` are:
280 362
281 363 * Untested on Windows. Would require a working :command:`ssh` on Windows.
282 364 Also, we are using shell scripts to setup and execute commands on remote
283 365 hosts.
284 * :command:`ipcontrollerz` is started on localhost, with no option to start it
285 on a remote node.
366 * No file movement - configuration and JSON files must be distributed to the remote hosts manually, as noted above.
286 367
287 368 Using the :command:`ipcontrollerz` and :command:`ipenginez` commands
288 369 ====================================================================
@@ -299,22 +380,22 @@ local machine, do the following.
299 380
300 381 First start the controller::
301 382
302 $ ipcontrollerz
303
383 $ ipcontrollerz
384
304 385 Next, start however many instances of the engine you want using (repeatedly)
305 386 the command::
306 387
307 $ ipenginez
388 $ ipenginez
308 389
309 390 The engines should start and automatically connect to the controller using the
310 JSON files in :file:`~/.ipython/cluster_<profile>/security`. You are now ready to use the
391 JSON files in :file:`~/.ipython/clusterz_default/security`. You are now ready to use the
311 392 controller and engines from IPython.
312 393
313 394 .. warning::
314
315 The order of the above operations may be important. You *must*
316 start the controller before the engines, unless you are manually specifying
317 the ports on which to connect, in which case ordering is not important.
395
396 The order of the above operations may be important. You *must*
397 start the controller before the engines, unless you are reusing connection
398 information (via `-r`), in which case ordering is not important.
318 399
319 400 .. note::
320 401
@@ -354,7 +435,7 @@ The ``--file`` flag works like this::
354 435 will just work!
355 436
356 437 Make JSON files persistent
357 ---------------------------
438 --------------------------
358 439
359 440 At first glance it may seem that managing the JSON files is a bit
360 441 annoying. Going back to the house and key analogy, copying the JSON around
@@ -363,22 +444,14 @@ you want to unlock the door and enter your house. As with your house, you want
363 444 to be able to create the key (or JSON file) once, and then simply use it at
364 445 any point in the future.
365 446
366 This is possible, but before you do this, you **must** remove any old JSON
367 files in the :file:`~/.ipython/cluster_<profile>/security` directory.
368
369 .. warning::
370
371 You **must** remove old JSON files before using persistent JSON files.
372
373 Then, the only thing you have to do is specify the registration port, so that
447 To do this, the only thing you have to do is specify the `-r` flag, so that
374 448 the connection information in the JSON files remains accurate::
375 449
376 450 $ ipcontrollerz -r --regport 12345
377 451
378
379 452 Then, just copy the JSON files over the first time and you are set. You can
380 453 start and stop the controller and engines as many times as you want in the
381 future, just make sure to tell the controller to use the *same* ports.
454 future, just make sure to tell the controller to reuse the file.
382 455
383 456 .. note::
384 457
@@ -400,4 +473,18 @@ Sending the log files to us will often help us to debug any problems.
400 473 .. [PBS] Portable Batch System. http://www.openpbs.org/
401 474 .. [SSH] SSH-Agent http://en.wikipedia.org/wiki/Ssh-agent
402 475
476 Configuring `ipcontrollerz`
477 ---------------------------
478
479 .. note::
480
481 TODO
482
483 Configuring `ipenginez`
484 -----------------------
485
486 .. note::
487
488 TODO
489
403 490