##// END OF EJS Templates
Removed the -f option to ipcluster. The remote starting of an IPython...
Brian Granger -
Show More
@@ -1,324 +1,342 b''
1 1 #!/usr/bin/env python
2 2 # encoding: utf-8
3 3
4 4 """Start an IPython cluster conveniently, either locally or remotely.
5 5
6 6 Basic usage
7 7 -----------
8 8
9 9 For local operation, the simplest mode of usage is:
10 10
11 11 %prog -n N
12 12
13 13 where N is the number of engines you want started.
14 14
Remote operation with a cluster description file,

%prog -f clusterfile.py

is currently disabled: the SSH-based launcher was broken and has been removed
until a new process management system is available. Passing -f now only prints
a notice and exits with status 1; to run a remote cluster, start ipcontroller
and ipengine separately. (Previously the cluster file was a normal Python
script, run via execfile(), that had to declare the variables 'controller',
'engines', and optionally 'sshx'.)
24 24
25 25
26 26 Notes
27 27 -----
28 28
29 29 WARNING: this code is still UNFINISHED and EXPERIMENTAL! It is incomplete,
30 30 some listed options are not really implemented, and all of its interfaces are
31 31 subject to change.
32 32
33 33 When operating over SSH for a remote cluster, this program relies on the
34 34 existence of a particular script called 'sshx'. This script must live in the
35 35 target systems where you'll be running your controller and engines, and is
36 36 needed to configure your PATH and PYTHONPATH variables for further execution of
37 37 python code at the other end of an SSH connection. The script can be as simple
38 38 as:
39 39
40 40 #!/bin/sh
41 41 . $HOME/.bashrc
42 42 "$@"
43 43
44 44 which is the default one provided by IPython. You can modify this or provide
45 45 your own. Since it's quite likely that for different clusters you may need
46 46 this script to configure things differently or that it may live in different
47 47 locations, its full path can be set in the same file where you define the
48 48 cluster setup. IPython's order of evaluation for this variable is the
49 49 following:
50 50
51 51 a) Internal default: 'sshx'. This only works if it is in the default system
52 52 path which SSH sets up in non-interactive mode.
53 53
54 54 b) Environment variable: if $IPYTHON_SSHX is defined, this overrides the
55 55 internal default.
56 56
57 57 c) Variable 'sshx' in the cluster configuration file: finally, this will
58 58 override the previous two values.
59 59
60 60 This code is Unix-only, with precious little hope of any of this ever working
61 61 under Windows, since we need SSH from the ground up, we background processes,
62 62 etc. Ports of this functionality to Windows are welcome.
63 63
64 64
65 65 Call summary
66 66 ------------
67 67
68 68 %prog [options]
69 69 """
70 70
71 71 __docformat__ = "restructuredtext en"
72 72
73 73 #-------------------------------------------------------------------------------
74 74 # Copyright (C) 2008 The IPython Development Team
75 75 #
76 76 # Distributed under the terms of the BSD License. The full license is in
77 77 # the file COPYING, distributed as part of this software.
78 78 #-------------------------------------------------------------------------------
79 79
80 80 #-------------------------------------------------------------------------------
81 81 # Stdlib imports
82 82 #-------------------------------------------------------------------------------
83 83
84 84 import os
85 85 import signal
86 86 import sys
87 87 import time
88 88
89 89 from optparse import OptionParser
90 90 from subprocess import Popen,call
91 91
92 92 #---------------------------------------------------------------------------
93 93 # IPython imports
94 94 #---------------------------------------------------------------------------
95 95 from IPython.tools import utils
96 96 from IPython.genutils import get_ipython_dir
97 97
98 98 #---------------------------------------------------------------------------
99 99 # Normal code begins
100 100 #---------------------------------------------------------------------------
101 101
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    The module docstring is used as the usage text.

    Returns the ``(options, args)`` pair produced by
    ``OptionParser.parse_args()``: the parsed option values and the list of
    leftover positional arguments.
    """
    parser = OptionParser(usage=__doc__)

    parser.add_option(
        "--controller-port", type="int", dest="controllerport",
        help="the TCP port the controller is listening on")

    parser.add_option(
        "--controller-ip", type="string", dest="controllerip",
        help="the TCP ip address of the controller")

    # The only option with a default: two engines unless told otherwise.
    parser.add_option(
        "-n", "--num", type="int", dest="n", default=2,
        help="the number of engines to start")

    parser.add_option(
        "--engine-port", type="int", dest="engineport",
        help="the TCP port the controller will listen on for engine connections")

    parser.add_option(
        "--engine-ip", type="string", dest="engineip",
        help="the TCP ip address the controller will listen on for engine connections")

    parser.add_option(
        "--mpi", type="string", dest="mpi",
        help="use mpi with package: for instance --mpi=mpi4py")

    parser.add_option(
        "-l", "--logfile", type="string", dest="logfile",
        help="log file name")

    parser.add_option(
        '-f', '--cluster-file', dest='clusterfile',
        help='file describing a remote cluster')

    parsed = parser.parse_args()
    return parsed
135 135
def numAlive(controller,engines):
    """Count the controller and engine processes that have not exited yet.

    Every process is polled once; a ``poll()`` result of ``None`` counts as
    still alive.
    """
    processes = [controller]
    processes.extend(engines)
    statuses = [proc.poll() for proc in processes]
    return statuses.count(None)
141 141
def stop(pid):
    """Send SIGINT to the process with the given pid."""
    return os.kill(pid, signal.SIGINT)


def kill(pid):
    """Send SIGTERM to the process with the given pid."""
    return os.kill(pid, signal.SIGTERM)
144 144
def cleanup(clean,controller,engines):
    """Apply ``clean`` to the pid of every process that is still running.

    ``clean`` is a callable taking a pid (``stop`` or ``kill`` in this file).
    Engines are handled first, the controller last; processes whose
    ``poll()`` already reports an exit status are skipped.
    """
    targets = [('Stopping engine, pid', eng) for eng in engines]
    targets.append(('Stopping controller, pid', controller))

    for banner, proc in targets:
        if proc.poll() is not None:
            continue
        print('%s %s' % (banner, proc.pid))
        clean(proc.pid)
155 155
156 156
def ensureDir(path):
    """Create ``path`` (including missing parents) unless it is already a directory.

    Any error raised by ``os.makedirs`` propagates to the caller.
    """
    if os.path.isdir(path):
        return
    os.makedirs(path)
161 161
162 162
def startMsg(control_host,control_port=10105):
    """Print the "cluster is running" banner with client usage hints.

    ``control_host`` and ``control_port`` are accepted but do not appear in
    the message.
    """
    banner = (
        '',
        'Your cluster is up and running.',
        '',
        'For interactive use, you can make a MultiEngineClient with:',
        '',
        'from IPython.kernel import client',
        "mec = client.MultiEngineClient()",
        '',
        'You can then cleanly stop the cluster from IPython using:',
        '',
        'mec.kill(controller=True)',
        '',
    )
    # A single print emits the same text as one print statement per line.
    print('\n'.join(banner))
177 177
178 178
def _shutdown_cluster(controller,engines):
    """Stop the controller and engines, escalating from SIGINT to SIGTERM.

    Sends SIGINT once, then makes up to four checks (after waits of 2, 3, 4
    and 5 seconds), sending SIGTERM to whatever is still running after each
    failed check. If processes survive all four rounds, their pids are
    listed so the user can kill them manually.
    """
    print('Stopping cluster. Cleaning up...')
    cleanup(stop, controller, engines)
    for i in range(4):
        time.sleep(i + 2)
        nZombies = numAlive(controller, engines)
        if nZombies == 0:
            print('OK: All processes cleaned up.')
            break
        print('Trying again, %d processes did not stop...' % nZombies)
        cleanup(kill, controller, engines)
        if numAlive(controller, engines) == 0:
            print('OK: All processes cleaned up.')
            break
    else:
        # Only reached when none of the four rounds ended in a break.
        print('*' * 75)
        print('ERROR: could not kill some processes, try to do it manually.')
        zombies = []
        if controller.returncode is None:
            print('Controller is alive: pid = %s' % controller.pid)
            zombies.append(controller.pid)
        for e in engines:
            if e.returncode is None:
                print('Engine is alive: pid = %s' % e.pid)
                zombies.append(e.pid)
        print('')
        print('Zombie summary: %s' % ' '.join(map(str, zombies)))


def clusterLocal(opt,arg):
    """Start a cluster on the local machine and babysit it until interrupted.

    Launches ``ipcontroller``, waits five seconds, then launches the engines:
    either ``opt.n`` separate ``ipengine`` processes or, when ``opt.mpi`` is
    set, one ``mpirun -np opt.n ipengine`` process. The function then blocks
    until Ctrl-C (SIGINT) and finally shuts every started process down.

    Parameters
    ----------
    opt : parsed options; reads ``opt.logfile``, ``opt.mpi`` and ``opt.n``.
    arg : leftover positional arguments (unused).

    Everything after the controller has been started runs under a single
    try/finally, so a failure while starting engines (for instance
    ``ipengine`` missing from PATH) or an early Ctrl-C still stops the
    processes that were already launched.
    """
    # Store all logs inside the ipython directory
    ipdir = get_ipython_dir()
    pjoin = os.path.join

    logfile = opt.logfile
    if logfile is None:
        logdir_base = pjoin(ipdir, 'log')
        ensureDir(logdir_base)
        logfile = pjoin(logdir_base, 'ipcluster-')

    # write() + leading space reproduces the "print 'x'," / "print 'y'"
    # pairing (same line, single separating space).
    sys.stdout.write('Starting controller:')
    controller = Popen(['ipcontroller', '--logfile', logfile, '-x', '-y'])
    print(' Controller PID: %s' % controller.pid)

    engines = []
    try:
        sys.stdout.write('Starting engines: ')
        # Presumably gives the controller time to come up before engines
        # try to connect to it.
        time.sleep(5)

        englogfile = '%s%s-' % (logfile, controller.pid)
        mpi = opt.mpi
        if mpi:
            # start with mpi - killing the engines with sigterm will not
            # work if you do this
            engines.append(Popen(['mpirun', '-np', str(opt.n), 'ipengine',
                                  '--mpi', mpi, '--logfile', englogfile]))
        else:
            # Append one at a time so that engines already started are
            # still known to the cleanup code if a later Popen fails.
            for _ in range(opt.n):
                engines.append(Popen(['ipengine', '--logfile', englogfile]))
        eids = [e.pid for e in engines]
        print(' Engines PIDs:  %s' % (eids,))
        print('Log files: %s*' % englogfile)

        grpid = os.getpgrp()
        startMsg('127.0.0.1')
        print('You can also hit Ctrl-C to stop it, or use from the cmd line:')
        print('')
        print('kill -INT %s' % grpid)
        print('')
        try:
            while True:
                time.sleep(5)
        except KeyboardInterrupt:
            # Ctrl-C / kill -INT is the documented way to stop the cluster;
            # any other exception propagates after the cleanup below.
            pass
    finally:
        _shutdown_cluster(controller, engines)
255 255
def clusterRemote(opt,arg):
    """Report that starting a remote cluster over SSH is unavailable, then exit.

    Both arguments are accepted for interface compatibility with
    ``clusterLocal`` and are ignored.

    Raises
    ------
    SystemExit : always, with exit status 1, after printing a notice.
    """
    # B. Granger, 9/3/08
    # The launching of a remote cluster using SSH and a clusterfile
    # is broken. Because it won't be fixed before the 0.9 release,
    # we are removing it. For now, we just print a message to the
    # user and abort.
    #
    # The old SSH launcher that used to follow the exit call could never run
    # and has been dropped; it remains available in version control.
    notice = (
        "The launching of a remote IPython cluster using SSH\n"
        "and a clusterfile has been removed in this release.\n"
        "It has been broken for a while and we are in the process\n"
        "of building a new process management system that will be\n"
        "used to provide a more robust way of starting an IPython\n"
        "cluster.\n"
        "\n"
        "For now remote clusters have to be launched using ipcontroller\n"
        "and ipengine separately.\n"
    )
    print(notice)
    sys.exit(1)
310 328
def main():
    """Parse the command line and dispatch to the local or remote launcher.

    A cluster file given with -f/--cluster-file selects ``clusterRemote``;
    otherwise ``clusterLocal`` is used.
    """
    options, positional = parse_args()
    launcher = clusterRemote if options.clusterfile else clusterLocal
    launcher(options, positional)


if __name__ == '__main__':
    main()
General Comments 0
You need to be logged in to leave comments. Login now