##// END OF EJS Templates
Removed the -f option to ipcluster. The remote starting of an IPython...
Brian Granger -
Show More
@@ -1,324 +1,342 b''
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 # encoding: utf-8
2 # encoding: utf-8
3
3
4 """Start an IPython cluster conveniently, either locally or remotely.
4 """Start an IPython cluster conveniently, either locally or remotely.
5
5
6 Basic usage
6 Basic usage
7 -----------
7 -----------
8
8
9 For local operation, the simplest mode of usage is:
9 For local operation, the simplest mode of usage is:
10
10
11 %prog -n N
11 %prog -n N
12
12
13 where N is the number of engines you want started.
13 where N is the number of engines you want started.
14
14
15 For remote operation, you must call it with a cluster description file:
15 For remote operation, you must call it with a cluster description file:
16
16
17 %prog -f clusterfile.py
17 %prog -f clusterfile.py
18
18
19 The cluster file is a normal Python script which gets run via execfile(). You
19 The cluster file is a normal Python script which gets run via execfile(). You
20 can have arbitrary logic in it, but all that matters is that at the end of the
20 can have arbitrary logic in it, but all that matters is that at the end of the
21 execution, it declares the variables 'controller', 'engines', and optionally
21 execution, it declares the variables 'controller', 'engines', and optionally
22 'sshx'. See the accompanying examples for details on what these variables must
22 'sshx'. See the accompanying examples for details on what these variables must
23 contain.
23 contain.
24
24
25
25
26 Notes
26 Notes
27 -----
27 -----
28
28
29 WARNING: this code is still UNFINISHED and EXPERIMENTAL! It is incomplete,
29 WARNING: this code is still UNFINISHED and EXPERIMENTAL! It is incomplete,
30 some listed options are not really implemented, and all of its interfaces are
30 some listed options are not really implemented, and all of its interfaces are
31 subject to change.
31 subject to change.
32
32
33 When operating over SSH for a remote cluster, this program relies on the
33 When operating over SSH for a remote cluster, this program relies on the
34 existence of a particular script called 'sshx'. This script must live in the
34 existence of a particular script called 'sshx'. This script must live in the
35 target systems where you'll be running your controller and engines, and is
35 target systems where you'll be running your controller and engines, and is
36 needed to configure your PATH and PYTHONPATH variables for further execution of
36 needed to configure your PATH and PYTHONPATH variables for further execution of
37 python code at the other end of an SSH connection. The script can be as simple
37 python code at the other end of an SSH connection. The script can be as simple
38 as:
38 as:
39
39
40 #!/bin/sh
40 #!/bin/sh
41 . $HOME/.bashrc
41 . $HOME/.bashrc
42 "$@"
42 "$@"
43
43
44 which is the default one provided by IPython. You can modify this or provide
44 which is the default one provided by IPython. You can modify this or provide
45 your own. Since it's quite likely that for different clusters you may need
45 your own. Since it's quite likely that for different clusters you may need
46 this script to configure things differently or that it may live in different
46 this script to configure things differently or that it may live in different
47 locations, its full path can be set in the same file where you define the
47 locations, its full path can be set in the same file where you define the
48 cluster setup. IPython's order of evaluation for this variable is the
48 cluster setup. IPython's order of evaluation for this variable is the
49 following:
49 following:
50
50
51 a) Internal default: 'sshx'. This only works if it is in the default system
51 a) Internal default: 'sshx'. This only works if it is in the default system
52 path which SSH sets up in non-interactive mode.
52 path which SSH sets up in non-interactive mode.
53
53
54 b) Environment variable: if $IPYTHON_SSHX is defined, this overrides the
54 b) Environment variable: if $IPYTHON_SSHX is defined, this overrides the
55 internal default.
55 internal default.
56
56
57 c) Variable 'sshx' in the cluster configuration file: finally, this will
57 c) Variable 'sshx' in the cluster configuration file: finally, this will
58 override the previous two values.
58 override the previous two values.
59
59
60 This code is Unix-only, with precious little hope of any of this ever working
60 This code is Unix-only, with precious little hope of any of this ever working
61 under Windows, since we need SSH from the ground up, we background processes,
61 under Windows, since we need SSH from the ground up, we background processes,
62 etc. Ports of this functionality to Windows are welcome.
62 etc. Ports of this functionality to Windows are welcome.
63
63
64
64
65 Call summary
65 Call summary
66 ------------
66 ------------
67
67
68 %prog [options]
68 %prog [options]
69 """
69 """
70
70
71 __docformat__ = "restructuredtext en"
71 __docformat__ = "restructuredtext en"
72
72
73 #-------------------------------------------------------------------------------
73 #-------------------------------------------------------------------------------
74 # Copyright (C) 2008 The IPython Development Team
74 # Copyright (C) 2008 The IPython Development Team
75 #
75 #
76 # Distributed under the terms of the BSD License. The full license is in
76 # Distributed under the terms of the BSD License. The full license is in
77 # the file COPYING, distributed as part of this software.
77 # the file COPYING, distributed as part of this software.
78 #-------------------------------------------------------------------------------
78 #-------------------------------------------------------------------------------
79
79
80 #-------------------------------------------------------------------------------
80 #-------------------------------------------------------------------------------
81 # Stdlib imports
81 # Stdlib imports
82 #-------------------------------------------------------------------------------
82 #-------------------------------------------------------------------------------
83
83
84 import os
84 import os
85 import signal
85 import signal
86 import sys
86 import sys
87 import time
87 import time
88
88
89 from optparse import OptionParser
89 from optparse import OptionParser
90 from subprocess import Popen,call
90 from subprocess import Popen,call
91
91
92 #---------------------------------------------------------------------------
92 #---------------------------------------------------------------------------
93 # IPython imports
93 # IPython imports
94 #---------------------------------------------------------------------------
94 #---------------------------------------------------------------------------
95 from IPython.tools import utils
95 from IPython.tools import utils
96 from IPython.genutils import get_ipython_dir
96 from IPython.genutils import get_ipython_dir
97
97
98 #---------------------------------------------------------------------------
98 #---------------------------------------------------------------------------
99 # Normal code begins
99 # Normal code begins
100 #---------------------------------------------------------------------------
100 #---------------------------------------------------------------------------
101
101
102 def parse_args():
102 def parse_args():
103 """Parse command line and return opts,args."""
103 """Parse command line and return opts,args."""
104
104
105 parser = OptionParser(usage=__doc__)
105 parser = OptionParser(usage=__doc__)
106 newopt = parser.add_option # shorthand
106 newopt = parser.add_option # shorthand
107
107
108 newopt("--controller-port", type="int", dest="controllerport",
108 newopt("--controller-port", type="int", dest="controllerport",
109 help="the TCP port the controller is listening on")
109 help="the TCP port the controller is listening on")
110
110
111 newopt("--controller-ip", type="string", dest="controllerip",
111 newopt("--controller-ip", type="string", dest="controllerip",
112 help="the TCP ip address of the controller")
112 help="the TCP ip address of the controller")
113
113
114 newopt("-n", "--num", type="int", dest="n",default=2,
114 newopt("-n", "--num", type="int", dest="n",default=2,
115 help="the number of engines to start")
115 help="the number of engines to start")
116
116
117 newopt("--engine-port", type="int", dest="engineport",
117 newopt("--engine-port", type="int", dest="engineport",
118 help="the TCP port the controller will listen on for engine "
118 help="the TCP port the controller will listen on for engine "
119 "connections")
119 "connections")
120
120
121 newopt("--engine-ip", type="string", dest="engineip",
121 newopt("--engine-ip", type="string", dest="engineip",
122 help="the TCP ip address the controller will listen on "
122 help="the TCP ip address the controller will listen on "
123 "for engine connections")
123 "for engine connections")
124
124
125 newopt("--mpi", type="string", dest="mpi",
125 newopt("--mpi", type="string", dest="mpi",
126 help="use mpi with package: for instance --mpi=mpi4py")
126 help="use mpi with package: for instance --mpi=mpi4py")
127
127
128 newopt("-l", "--logfile", type="string", dest="logfile",
128 newopt("-l", "--logfile", type="string", dest="logfile",
129 help="log file name")
129 help="log file name")
130
130
131 newopt('-f','--cluster-file',dest='clusterfile',
131 newopt('-f','--cluster-file',dest='clusterfile',
132 help='file describing a remote cluster')
132 help='file describing a remote cluster')
133
133
134 return parser.parse_args()
134 return parser.parse_args()
135
135
136 def numAlive(controller,engines):
136 def numAlive(controller,engines):
137 """Return the number of processes still alive."""
137 """Return the number of processes still alive."""
138 retcodes = [controller.poll()] + \
138 retcodes = [controller.poll()] + \
139 [e.poll() for e in engines]
139 [e.poll() for e in engines]
140 return retcodes.count(None)
140 return retcodes.count(None)
141
141
142 stop = lambda pid: os.kill(pid,signal.SIGINT)
142 stop = lambda pid: os.kill(pid,signal.SIGINT)
143 kill = lambda pid: os.kill(pid,signal.SIGTERM)
143 kill = lambda pid: os.kill(pid,signal.SIGTERM)
144
144
145 def cleanup(clean,controller,engines):
145 def cleanup(clean,controller,engines):
146 """Stop the controller and engines with the given cleanup method."""
146 """Stop the controller and engines with the given cleanup method."""
147
147
148 for e in engines:
148 for e in engines:
149 if e.poll() is None:
149 if e.poll() is None:
150 print 'Stopping engine, pid',e.pid
150 print 'Stopping engine, pid',e.pid
151 clean(e.pid)
151 clean(e.pid)
152 if controller.poll() is None:
152 if controller.poll() is None:
153 print 'Stopping controller, pid',controller.pid
153 print 'Stopping controller, pid',controller.pid
154 clean(controller.pid)
154 clean(controller.pid)
155
155
156
156
157 def ensureDir(path):
157 def ensureDir(path):
158 """Ensure a directory exists or raise an exception."""
158 """Ensure a directory exists or raise an exception."""
159 if not os.path.isdir(path):
159 if not os.path.isdir(path):
160 os.makedirs(path)
160 os.makedirs(path)
161
161
162
162
163 def startMsg(control_host,control_port=10105):
163 def startMsg(control_host,control_port=10105):
164 """Print a startup message"""
164 """Print a startup message"""
165 print
165 print
166 print 'Your cluster is up and running.'
166 print 'Your cluster is up and running.'
167 print
167 print
168 print 'For interactive use, you can make a MultiEngineClient with:'
168 print 'For interactive use, you can make a MultiEngineClient with:'
169 print
169 print
170 print 'from IPython.kernel import client'
170 print 'from IPython.kernel import client'
171 print "mec = client.MultiEngineClient()"
171 print "mec = client.MultiEngineClient()"
172 print
172 print
173 print 'You can then cleanly stop the cluster from IPython using:'
173 print 'You can then cleanly stop the cluster from IPython using:'
174 print
174 print
175 print 'mec.kill(controller=True)'
175 print 'mec.kill(controller=True)'
176 print
176 print
177
177
178
178
179 def clusterLocal(opt,arg):
179 def clusterLocal(opt,arg):
180 """Start a cluster on the local machine."""
180 """Start a cluster on the local machine."""
181
181
182 # Store all logs inside the ipython directory
182 # Store all logs inside the ipython directory
183 ipdir = get_ipython_dir()
183 ipdir = get_ipython_dir()
184 pjoin = os.path.join
184 pjoin = os.path.join
185
185
186 logfile = opt.logfile
186 logfile = opt.logfile
187 if logfile is None:
187 if logfile is None:
188 logdir_base = pjoin(ipdir,'log')
188 logdir_base = pjoin(ipdir,'log')
189 ensureDir(logdir_base)
189 ensureDir(logdir_base)
190 logfile = pjoin(logdir_base,'ipcluster-')
190 logfile = pjoin(logdir_base,'ipcluster-')
191
191
192 print 'Starting controller:',
192 print 'Starting controller:',
193 controller = Popen(['ipcontroller','--logfile',logfile,'-x','-y'])
193 controller = Popen(['ipcontroller','--logfile',logfile,'-x','-y'])
194 print 'Controller PID:',controller.pid
194 print 'Controller PID:',controller.pid
195
195
196 print 'Starting engines: ',
196 print 'Starting engines: ',
197 time.sleep(5)
197 time.sleep(5)
198
198
199 englogfile = '%s%s-' % (logfile,controller.pid)
199 englogfile = '%s%s-' % (logfile,controller.pid)
200 mpi = opt.mpi
200 mpi = opt.mpi
201 if mpi: # start with mpi - killing the engines with sigterm will not work if you do this
201 if mpi: # start with mpi - killing the engines with sigterm will not work if you do this
202 engines = [Popen(['mpirun', '-np', str(opt.n), 'ipengine', '--mpi',
202 engines = [Popen(['mpirun', '-np', str(opt.n), 'ipengine', '--mpi',
203 mpi, '--logfile',englogfile])]
203 mpi, '--logfile',englogfile])]
204 # engines = [Popen(['mpirun', '-np', str(opt.n), 'ipengine', '--mpi', mpi])]
204 # engines = [Popen(['mpirun', '-np', str(opt.n), 'ipengine', '--mpi', mpi])]
205 else: # do what we would normally do
205 else: # do what we would normally do
206 engines = [ Popen(['ipengine','--logfile',englogfile])
206 engines = [ Popen(['ipengine','--logfile',englogfile])
207 for i in range(opt.n) ]
207 for i in range(opt.n) ]
208 eids = [e.pid for e in engines]
208 eids = [e.pid for e in engines]
209 print 'Engines PIDs: ',eids
209 print 'Engines PIDs: ',eids
210 print 'Log files: %s*' % englogfile
210 print 'Log files: %s*' % englogfile
211
211
212 proc_ids = eids + [controller.pid]
212 proc_ids = eids + [controller.pid]
213 procs = engines + [controller]
213 procs = engines + [controller]
214
214
215 grpid = os.getpgrp()
215 grpid = os.getpgrp()
216 try:
216 try:
217 startMsg('127.0.0.1')
217 startMsg('127.0.0.1')
218 print 'You can also hit Ctrl-C to stop it, or use from the cmd line:'
218 print 'You can also hit Ctrl-C to stop it, or use from the cmd line:'
219 print
219 print
220 print 'kill -INT',grpid
220 print 'kill -INT',grpid
221 print
221 print
222 try:
222 try:
223 while True:
223 while True:
224 time.sleep(5)
224 time.sleep(5)
225 except:
225 except:
226 pass
226 pass
227 finally:
227 finally:
228 print 'Stopping cluster. Cleaning up...'
228 print 'Stopping cluster. Cleaning up...'
229 cleanup(stop,controller,engines)
229 cleanup(stop,controller,engines)
230 for i in range(4):
230 for i in range(4):
231 time.sleep(i+2)
231 time.sleep(i+2)
232 nZombies = numAlive(controller,engines)
232 nZombies = numAlive(controller,engines)
233 if nZombies== 0:
233 if nZombies== 0:
234 print 'OK: All processes cleaned up.'
234 print 'OK: All processes cleaned up.'
235 break
235 break
236 print 'Trying again, %d processes did not stop...' % nZombies
236 print 'Trying again, %d processes did not stop...' % nZombies
237 cleanup(kill,controller,engines)
237 cleanup(kill,controller,engines)
238 if numAlive(controller,engines) == 0:
238 if numAlive(controller,engines) == 0:
239 print 'OK: All processes cleaned up.'
239 print 'OK: All processes cleaned up.'
240 break
240 break
241 else:
241 else:
242 print '*'*75
242 print '*'*75
243 print 'ERROR: could not kill some processes, try to do it',
243 print 'ERROR: could not kill some processes, try to do it',
244 print 'manually.'
244 print 'manually.'
245 zombies = []
245 zombies = []
246 if controller.returncode is None:
246 if controller.returncode is None:
247 print 'Controller is alive: pid =',controller.pid
247 print 'Controller is alive: pid =',controller.pid
248 zombies.append(controller.pid)
248 zombies.append(controller.pid)
249 liveEngines = [ e for e in engines if e.returncode is None ]
249 liveEngines = [ e for e in engines if e.returncode is None ]
250 for e in liveEngines:
250 for e in liveEngines:
251 print 'Engine is alive: pid =',e.pid
251 print 'Engine is alive: pid =',e.pid
252 zombies.append(e.pid)
252 zombies.append(e.pid)
253 print
253 print
254 print 'Zombie summary:',' '.join(map(str,zombies))
254 print 'Zombie summary:',' '.join(map(str,zombies))
255
255
256 def clusterRemote(opt,arg):
256 def clusterRemote(opt,arg):
257 """Start a remote cluster over SSH"""
257 """Start a remote cluster over SSH"""
258
258
259 # B. Granger, 9/3/08
260 # The launching of a remote cluster using SSH and a clusterfile
261 # is broken. Because it won't be fixed before the 0.9 release,
262 # we are removing it. For now, we just print a message to the
263 # user and abort.
264
265 print """The launching of a remote IPython cluster using SSL
266 and a clusterfile has been removed in this release.
267 It has been broken for a while and we are in the process
268 of building a new process management system that will be
269 used to provide a more robust way of starting an IPython
270 cluster.
271
272 For now remote clusters have to be launched using ipcontroller
273 and ipengine separately.
274 """
275 sys.exit(1)
276
259 # Load the remote cluster configuration
277 # Load the remote cluster configuration
260 clConfig = {}
278 clConfig = {}
261 execfile(opt.clusterfile,clConfig)
279 execfile(opt.clusterfile,clConfig)
262 contConfig = clConfig['controller']
280 contConfig = clConfig['controller']
263 engConfig = clConfig['engines']
281 engConfig = clConfig['engines']
264 # Determine where to find sshx:
282 # Determine where to find sshx:
265 sshx = clConfig.get('sshx',os.environ.get('IPYTHON_SSHX','sshx'))
283 sshx = clConfig.get('sshx',os.environ.get('IPYTHON_SSHX','sshx'))
266
284
267 # Store all logs inside the ipython directory
285 # Store all logs inside the ipython directory
268 ipdir = get_ipython_dir()
286 ipdir = get_ipython_dir()
269 pjoin = os.path.join
287 pjoin = os.path.join
270
288
271 logfile = opt.logfile
289 logfile = opt.logfile
272 if logfile is None:
290 if logfile is None:
273 logdir_base = pjoin(ipdir,'log')
291 logdir_base = pjoin(ipdir,'log')
274 ensureDir(logdir_base)
292 ensureDir(logdir_base)
275 logfile = pjoin(logdir_base,'ipcluster')
293 logfile = pjoin(logdir_base,'ipcluster')
276
294
277 # Append this script's PID to the logfile name always
295 # Append this script's PID to the logfile name always
278 logfile = '%s-%s' % (logfile,os.getpid())
296 logfile = '%s-%s' % (logfile,os.getpid())
279
297
280 print 'Starting controller:'
298 print 'Starting controller:'
281 # Controller data:
299 # Controller data:
282 xsys = os.system
300 xsys = os.system
283
301
284 contHost = contConfig['host']
302 contHost = contConfig['host']
285 contLog = '%s-con-%s-' % (logfile,contHost)
303 contLog = '%s-con-%s-' % (logfile,contHost)
286 cmd = "ssh %s '%s' 'ipcontroller --logfile %s' &" % \
304 cmd = "ssh %s '%s' 'ipcontroller --logfile %s' &" % \
287 (contHost,sshx,contLog)
305 (contHost,sshx,contLog)
288 #print 'cmd:<%s>' % cmd # dbg
306 #print 'cmd:<%s>' % cmd # dbg
289 xsys(cmd)
307 xsys(cmd)
290 time.sleep(2)
308 time.sleep(2)
291
309
292 print 'Starting engines: '
310 print 'Starting engines: '
293 for engineHost,engineData in engConfig.iteritems():
311 for engineHost,engineData in engConfig.iteritems():
294 if isinstance(engineData,int):
312 if isinstance(engineData,int):
295 numEngines = engineData
313 numEngines = engineData
296 else:
314 else:
297 raise NotImplementedError('port configuration not finished for engines')
315 raise NotImplementedError('port configuration not finished for engines')
298
316
299 print 'Sarting %d engines on %s' % (numEngines,engineHost)
317 print 'Sarting %d engines on %s' % (numEngines,engineHost)
300 engLog = '%s-eng-%s-' % (logfile,engineHost)
318 engLog = '%s-eng-%s-' % (logfile,engineHost)
301 for i in range(numEngines):
319 for i in range(numEngines):
302 cmd = "ssh %s '%s' 'ipengine --controller-ip %s --logfile %s' &" % \
320 cmd = "ssh %s '%s' 'ipengine --controller-ip %s --logfile %s' &" % \
303 (engineHost,sshx,contHost,engLog)
321 (engineHost,sshx,contHost,engLog)
304 #print 'cmd:<%s>' % cmd # dbg
322 #print 'cmd:<%s>' % cmd # dbg
305 xsys(cmd)
323 xsys(cmd)
306 # Wait after each host a little bit
324 # Wait after each host a little bit
307 time.sleep(1)
325 time.sleep(1)
308
326
309 startMsg(contConfig['host'])
327 startMsg(contConfig['host'])
310
328
311 def main():
329 def main():
312 """Main driver for the two big options: local or remote cluster."""
330 """Main driver for the two big options: local or remote cluster."""
313
331
314 opt,arg = parse_args()
332 opt,arg = parse_args()
315
333
316 clusterfile = opt.clusterfile
334 clusterfile = opt.clusterfile
317 if clusterfile:
335 if clusterfile:
318 clusterRemote(opt,arg)
336 clusterRemote(opt,arg)
319 else:
337 else:
320 clusterLocal(opt,arg)
338 clusterLocal(opt,arg)
321
339
322
340
323 if __name__=='__main__':
341 if __name__=='__main__':
324 main()
342 main()
General Comments 0
You need to be logged in to leave comments. Login now