Show More
@@ -18,6 +18,7 b' import os' | |||||
18 | import re |
|
18 | import re | |
19 | import sys |
|
19 | import sys | |
20 | import signal |
|
20 | import signal | |
|
21 | import stat | |||
21 | import tempfile |
|
22 | import tempfile | |
22 | pjoin = os.path.join |
|
23 | pjoin = os.path.join | |
23 |
|
24 | |||
@@ -234,6 +235,7 b' class LocalEngineSet(object):' | |||||
234 | def start(self, n): |
|
235 | def start(self, n): | |
235 | dlist = [] |
|
236 | dlist = [] | |
236 | for i in range(n): |
|
237 | for i in range(n): | |
|
238 | print "starting engine:", i | |||
237 | el = EngineLauncher(extra_args=self.extra_args) |
|
239 | el = EngineLauncher(extra_args=self.extra_args) | |
238 | d = el.start() |
|
240 | d = el.start() | |
239 | self.launchers.append(el) |
|
241 | self.launchers.append(el) | |
@@ -270,22 +272,25 b' class LocalEngineSet(object):' | |||||
270 | dfinal.addCallback(self._handle_stop) |
|
272 | dfinal.addCallback(self._handle_stop) | |
271 | return dfinal |
|
273 | return dfinal | |
272 |
|
274 | |||
273 |
|
||||
274 | class BatchEngineSet(object): |
|
275 | class BatchEngineSet(object): | |
275 |
|
276 | |||
276 | # Subclasses must fill these in. See PBSEngineSet |
|
277 | # Subclasses must fill these in. See PBSEngineSet/SGEEngineSet | |
|
278 | name = '' | |||
277 | submit_command = '' |
|
279 | submit_command = '' | |
278 | delete_command = '' |
|
280 | delete_command = '' | |
279 | job_id_regexp = '' |
|
281 | job_id_regexp = '' | |
280 |
|
282 | job_array_regexp = '' | ||
281 | def __init__(self, template_file, **kwargs): |
|
283 | job_array_template = '' | |
|
284 | queue_regexp = '' | |||
|
285 | queue_template = '' | |||
|
286 | default_template = '' | |||
|
287 | ||||
|
288 | def __init__(self, template_file, queue, **kwargs): | |||
282 | self.template_file = template_file |
|
289 | self.template_file = template_file | |
283 |
self. |
|
290 | self.queue = queue | |
284 | self.context.update(kwargs) |
|
291 | ||
285 | self.batch_file = self.template_file+'-run' |
|
|||
286 |
|
||||
287 | def parse_job_id(self, output): |
|
292 | def parse_job_id(self, output): | |
288 |
m = re. |
|
293 | m = re.search(self.job_id_regexp, output) | |
289 | if m is not None: |
|
294 | if m is not None: | |
290 | job_id = m.group() |
|
295 | job_id = m.group() | |
291 | else: |
|
296 | else: | |
@@ -293,46 +298,120 b' class BatchEngineSet(object):' | |||||
293 | self.job_id = job_id |
|
298 | self.job_id = job_id | |
294 | log.msg('Job started with job id: %r' % job_id) |
|
299 | log.msg('Job started with job id: %r' % job_id) | |
295 | return job_id |
|
300 | return job_id | |
296 |
|
301 | |||
297 | def write_batch_script(self, n): |
|
|||
298 | self.context['n'] = n |
|
|||
299 | template = open(self.template_file, 'r').read() |
|
|||
300 | log.msg('Using template for batch script: %s' % self.template_file) |
|
|||
301 | script_as_string = Itpl.itplns(template, self.context) |
|
|||
302 | log.msg('Writing instantiated batch script: %s' % self.batch_file) |
|
|||
303 | f = open(self.batch_file,'w') |
|
|||
304 | f.write(script_as_string) |
|
|||
305 | f.close() |
|
|||
306 |
|
||||
307 | def handle_error(self, f): |
|
302 | def handle_error(self, f): | |
308 | f.printTraceback() |
|
303 | f.printTraceback() | |
309 | f.raiseException() |
|
304 | f.raiseException() | |
310 |
|
305 | |||
311 | def start(self, n): |
|
306 | def start(self, n): | |
312 | self.write_batch_script(n) |
|
307 | log.msg("starting %d engines" % n) | |
|
308 | self._temp_file = tempfile.NamedTemporaryFile() | |||
|
309 | os.chmod(self._temp_file.name, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) | |||
|
310 | if self.template_file: | |||
|
311 | log.msg("Using %s script %s" % (self.name, self.template_file)) | |||
|
312 | contents = open(self.template_file, 'r').read() | |||
|
313 | new_script = contents | |||
|
314 | regex = re.compile(self.job_array_regexp) | |||
|
315 | if not regex.search(contents): | |||
|
316 | log.msg("adding job array settings to %s script" % self.name) | |||
|
317 | new_script = self.job_array_template % n +'\n' + new_script | |||
|
318 | print self.queue_regexp | |||
|
319 | regex = re.compile(self.queue_regexp) | |||
|
320 | print regex.search(contents) | |||
|
321 | if self.queue and not regex.search(contents): | |||
|
322 | log.msg("adding queue settings to %s script" % self.name) | |||
|
323 | new_script = self.queue_template % self.queue + '\n' + new_script | |||
|
324 | if new_script != contents: | |||
|
325 | self._temp_file.write(new_script) | |||
|
326 | self.template_file = self._temp_file.name | |||
|
327 | else: | |||
|
328 | default_script = self.default_template % n | |||
|
329 | if self.queue: | |||
|
330 | default_script = self.queue_template % self.queue + \ | |||
|
331 | '\n' + default_script | |||
|
332 | log.msg("using default ipengine %s script: \n%s" % | |||
|
333 | (self.name, default_script)) | |||
|
334 | self._temp_file.file.write(default_script) | |||
|
335 | self.template_file = self._temp_file.name | |||
|
336 | self._temp_file.file.close() | |||
313 | d = getProcessOutput(self.submit_command, |
|
337 | d = getProcessOutput(self.submit_command, | |
314 | [self.batch_file],env=os.environ) |
|
338 | [self.template_file], | |
|
339 | env=os.environ) | |||
315 | d.addCallback(self.parse_job_id) |
|
340 | d.addCallback(self.parse_job_id) | |
316 | d.addErrback(self.handle_error) |
|
341 | d.addErrback(self.handle_error) | |
317 | return d |
|
342 | return d | |
318 |
|
343 | |||
319 | def kill(self): |
|
344 | def kill(self): | |
320 | d = getProcessOutput(self.delete_command, |
|
345 | d = getProcessOutput(self.delete_command, | |
321 | [self.job_id],env=os.environ) |
|
346 | [self.job_id],env=os.environ) | |
322 | return d |
|
347 | return d | |
323 |
|
348 | |||
324 | class PBSEngineSet(BatchEngineSet): |
|
349 | class PBSEngineSet(BatchEngineSet): | |
325 |
|
350 | |||
|
351 | name = 'PBS' | |||
326 | submit_command = 'qsub' |
|
352 | submit_command = 'qsub' | |
327 | delete_command = 'qdel' |
|
353 | delete_command = 'qdel' | |
328 | job_id_regexp = '\d+' |
|
354 | job_id_regexp = '\d+' | |
329 |
|
355 | job_array_regexp = '#PBS[ \t]+-t[ \t]+\d+' | ||
330 | def __init__(self, template_file, **kwargs): |
|
356 | job_array_template = '#PBS -t 1-%d' | |
331 | BatchEngineSet.__init__(self, template_file, **kwargs) |
|
357 | queue_regexp = '#PBS[ \t]+-q[ \t]+\w+' | |
|
358 | queue_template = '#PBS -q %s' | |||
|
359 | default_template="""#!/bin/sh | |||
|
360 | #PBS -V | |||
|
361 | #PBS -t 1-%d | |||
|
362 | #PBS -N ipengine | |||
|
363 | eid=$(($PBS_ARRAYID - 1)) | |||
|
364 | ipengine --logfile=ipengine${eid}.log | |||
|
365 | """ | |||
|
366 | ||||
|
367 | class SGEEngineSet(PBSEngineSet): | |||
|
368 | ||||
|
369 | name = 'SGE' | |||
|
370 | job_array_regexp = '#\$[ \t]+-t[ \t]+\d+' | |||
|
371 | job_array_template = '#$ -t 1-%d' | |||
|
372 | queue_regexp = '#\$[ \t]+-q[ \t]+\w+' | |||
|
373 | queue_template = '#$ -q %s' | |||
|
374 | default_template="""#$ -V | |||
|
375 | #$ -S /bin/sh | |||
|
376 | #$ -t 1-%d | |||
|
377 | #$ -N ipengine | |||
|
378 | eid=$(($SGE_TASK_ID - 1)) | |||
|
379 | ipengine --logfile=ipengine${eid}.log | |||
|
380 | """ | |||
|
381 | ||||
|
382 | class LSFEngineSet(PBSEngineSet): | |||
|
383 | ||||
|
384 | name = 'LSF' | |||
|
385 | submit_command = 'bsub' | |||
|
386 | delete_command = 'bkill' | |||
|
387 | job_array_regexp = '#BSUB[ \t]-J+\w+\[\d+-\d+\]' | |||
|
388 | job_array_template = '#BSUB -J ipengine[1-%d]' | |||
|
389 | queue_regexp = '#BSUB[ \t]+-q[ \t]+\w+' | |||
|
390 | queue_template = '#BSUB -q %s' | |||
|
391 | default_template="""#!/bin/sh | |||
|
392 | #BSUB -J ipengine[1-%d] | |||
|
393 | eid=$(($LSB_JOBINDEX - 1)) | |||
|
394 | ipengine --logfile=ipengine${eid}.log | |||
|
395 | """ | |||
|
396 | bsub_wrapper="""#!/bin/sh | |||
|
397 | bsub < $1 | |||
|
398 | """ | |||
|
399 | ||||
|
400 | def __init__(self, template_file, queue, **kwargs): | |||
|
401 | self._bsub_wrapper = self._make_bsub_wrapper() | |||
|
402 | self.submit_command = self._bsub_wrapper.name | |||
|
403 | PBSEngineSet.__init__(self,template_file, queue, **kwargs) | |||
332 |
|
404 | |||
|
405 | def _make_bsub_wrapper(self): | |||
|
406 | bsub_wrapper = tempfile.NamedTemporaryFile() | |||
|
407 | bsub_wrapper.write(self.bsub_wrapper) | |||
|
408 | bsub_wrapper.file.close() | |||
|
409 | os.chmod(bsub_wrapper.name, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) | |||
|
410 | return bsub_wrapper | |||
333 |
|
411 | |||
334 | sshx_template="""#!/bin/sh |
|
412 | sshx_template_prefix="""#!/bin/sh | |
335 | "$@" &> /dev/null & |
|
413 | """ | |
|
414 | sshx_template_suffix=""""$@" &> /dev/null & | |||
336 | echo $! |
|
415 | echo $! | |
337 | """ |
|
416 | """ | |
338 |
|
417 | |||
@@ -340,11 +419,19 b' engine_killer_template="""#!/bin/sh' | |||||
340 | ps -fu `whoami` | grep '[i]pengine' | awk '{print $2}' | xargs kill -TERM |
|
419 | ps -fu `whoami` | grep '[i]pengine' | awk '{print $2}' | xargs kill -TERM | |
341 | """ |
|
420 | """ | |
342 |
|
421 | |||
|
422 | def escape_strings(val): | |||
|
423 | val = val.replace('(','\(') | |||
|
424 | val = val.replace(')','\)') | |||
|
425 | if ' ' in val: | |||
|
426 | val = '"%s"'%val | |||
|
427 | return val | |||
|
428 | ||||
343 | class SSHEngineSet(object): |
|
429 | class SSHEngineSet(object): | |
344 | sshx_template=sshx_template |
|
430 | sshx_template_prefix=sshx_template_prefix | |
|
431 | sshx_template_suffix=sshx_template_suffix | |||
345 | engine_killer_template=engine_killer_template |
|
432 | engine_killer_template=engine_killer_template | |
346 |
|
433 | |||
347 | def __init__(self, engine_hosts, sshx=None, ipengine="ipengine"): |
|
434 | def __init__(self, engine_hosts, sshx=None, copyenvs=None, ipengine="ipengine"): | |
348 | """Start a controller on localhost and engines using ssh. |
|
435 | """Start a controller on localhost and engines using ssh. | |
349 |
|
436 | |||
350 | The engine_hosts argument is a dict with hostnames as keys and |
|
437 | The engine_hosts argument is a dict with hostnames as keys and | |
@@ -363,7 +450,12 b' class SSHEngineSet(object):' | |||||
363 | '%s-main-sshx.sh' % os.environ['USER'] |
|
450 | '%s-main-sshx.sh' % os.environ['USER'] | |
364 | ) |
|
451 | ) | |
365 | f = open(self.sshx, 'w') |
|
452 | f = open(self.sshx, 'w') | |
366 | f.writelines(self.sshx_template) |
|
453 | f.writelines(self.sshx_template_prefix) | |
|
454 | if copyenvs: | |||
|
455 | for key, val in sorted(os.environ.items()): | |||
|
456 | newval = escape_strings(val) | |||
|
457 | f.writelines('export %s=%s\n'%(key,newval)) | |||
|
458 | f.writelines(self.sshx_template_suffix) | |||
367 | f.close() |
|
459 | f.close() | |
368 | self.engine_command = ipengine |
|
460 | self.engine_command = ipengine | |
369 | self.engine_hosts = engine_hosts |
|
461 | self.engine_hosts = engine_hosts | |
@@ -609,13 +701,17 b' def main_pbs(args):' | |||||
609 | # See if we are reusing FURL files |
|
701 | # See if we are reusing FURL files | |
610 | if not check_reuse(args, cont_args): |
|
702 | if not check_reuse(args, cont_args): | |
611 | return |
|
703 | return | |
|
704 | ||||
|
705 | if args.pbsscript and not os.path.isfile(args.pbsscript): | |||
|
706 | log.err('PBS script does not exist: %s' % args.pbsscript) | |||
|
707 | return | |||
612 |
|
708 | |||
613 | cl = ControllerLauncher(extra_args=cont_args) |
|
709 | cl = ControllerLauncher(extra_args=cont_args) | |
614 | dstart = cl.start() |
|
710 | dstart = cl.start() | |
615 | def start_engines(r): |
|
711 | def start_engines(r): | |
616 | pbs_set = PBSEngineSet(args.pbsscript) |
|
712 | pbs_set = PBSEngineSet(args.pbsscript, args.pbsqueue) | |
617 | def shutdown(signum, frame): |
|
713 | def shutdown(signum, frame): | |
618 |
log.msg('Stopping |
|
714 | log.msg('Stopping PBS cluster') | |
619 | d = pbs_set.kill() |
|
715 | d = pbs_set.kill() | |
620 | d.addBoth(lambda _: cl.interrupt_then_kill(1.0)) |
|
716 | d.addBoth(lambda _: cl.interrupt_then_kill(1.0)) | |
621 | d.addBoth(lambda _: reactor.callLater(2.0, reactor.stop)) |
|
717 | d.addBoth(lambda _: reactor.callLater(2.0, reactor.stop)) | |
@@ -627,6 +723,72 b' def main_pbs(args):' | |||||
627 | dstart.addCallback(_delay_start, start_engines, furl_file, args.r) |
|
723 | dstart.addCallback(_delay_start, start_engines, furl_file, args.r) | |
628 | dstart.addErrback(_err_and_stop) |
|
724 | dstart.addErrback(_err_and_stop) | |
629 |
|
725 | |||
|
726 | def main_sge(args): | |||
|
727 | cont_args = [] | |||
|
728 | cont_args.append('--logfile=%s' % pjoin(args.logdir,'ipcontroller')) | |||
|
729 | ||||
|
730 | # Check security settings before proceeding | |||
|
731 | if not check_security(args, cont_args): | |||
|
732 | return | |||
|
733 | ||||
|
734 | # See if we are reusing FURL files | |||
|
735 | if not check_reuse(args, cont_args): | |||
|
736 | return | |||
|
737 | ||||
|
738 | if args.sgescript and not os.path.isfile(args.sgescript): | |||
|
739 | log.err('SGE script does not exist: %s' % args.sgescript) | |||
|
740 | return | |||
|
741 | ||||
|
742 | cl = ControllerLauncher(extra_args=cont_args) | |||
|
743 | dstart = cl.start() | |||
|
744 | def start_engines(r): | |||
|
745 | sge_set = SGEEngineSet(args.sgescript, args.sgequeue) | |||
|
746 | def shutdown(signum, frame): | |||
|
747 | log.msg('Stopping sge cluster') | |||
|
748 | d = sge_set.kill() | |||
|
749 | d.addBoth(lambda _: cl.interrupt_then_kill(1.0)) | |||
|
750 | d.addBoth(lambda _: reactor.callLater(2.0, reactor.stop)) | |||
|
751 | signal.signal(signal.SIGINT,shutdown) | |||
|
752 | d = sge_set.start(args.n) | |||
|
753 | return d | |||
|
754 | config = kernel_config_manager.get_config_obj() | |||
|
755 | furl_file = config['controller']['engine_furl_file'] | |||
|
756 | dstart.addCallback(_delay_start, start_engines, furl_file, args.r) | |||
|
757 | dstart.addErrback(_err_and_stop) | |||
|
758 | ||||
|
759 | def main_lsf(args): | |||
|
760 | cont_args = [] | |||
|
761 | cont_args.append('--logfile=%s' % pjoin(args.logdir,'ipcontroller')) | |||
|
762 | ||||
|
763 | # Check security settings before proceeding | |||
|
764 | if not check_security(args, cont_args): | |||
|
765 | return | |||
|
766 | ||||
|
767 | # See if we are reusing FURL files | |||
|
768 | if not check_reuse(args, cont_args): | |||
|
769 | return | |||
|
770 | ||||
|
771 | if args.lsfscript and not os.path.isfile(args.lsfscript): | |||
|
772 | log.err('LSF script does not exist: %s' % args.lsfscript) | |||
|
773 | return | |||
|
774 | ||||
|
775 | cl = ControllerLauncher(extra_args=cont_args) | |||
|
776 | dstart = cl.start() | |||
|
777 | def start_engines(r): | |||
|
778 | lsf_set = LSFEngineSet(args.lsfscript, args.lsfqueue) | |||
|
779 | def shutdown(signum, frame): | |||
|
780 | log.msg('Stopping LSF cluster') | |||
|
781 | d = lsf_set.kill() | |||
|
782 | d.addBoth(lambda _: cl.interrupt_then_kill(1.0)) | |||
|
783 | d.addBoth(lambda _: reactor.callLater(2.0, reactor.stop)) | |||
|
784 | signal.signal(signal.SIGINT,shutdown) | |||
|
785 | d = lsf_set.start(args.n) | |||
|
786 | return d | |||
|
787 | config = kernel_config_manager.get_config_obj() | |||
|
788 | furl_file = config['controller']['engine_furl_file'] | |||
|
789 | dstart.addCallback(_delay_start, start_engines, furl_file, args.r) | |||
|
790 | dstart.addErrback(_err_and_stop) | |||
|
791 | ||||
630 |
|
792 | |||
631 | def main_ssh(args): |
|
793 | def main_ssh(args): | |
632 | """Start a controller on localhost and engines using ssh. |
|
794 | """Start a controller on localhost and engines using ssh. | |
@@ -658,7 +820,8 b' def main_ssh(args):' | |||||
658 | cl = ControllerLauncher(extra_args=cont_args) |
|
820 | cl = ControllerLauncher(extra_args=cont_args) | |
659 | dstart = cl.start() |
|
821 | dstart = cl.start() | |
660 | def start_engines(cont_pid): |
|
822 | def start_engines(cont_pid): | |
661 |
ssh_set = SSHEngineSet(clusterfile['engines'], sshx=args.sshx |
|
823 | ssh_set = SSHEngineSet(clusterfile['engines'], sshx=args.sshx, | |
|
824 | copyenvs=args.copyenvs) | |||
662 | def shutdown(signum, frame): |
|
825 | def shutdown(signum, frame): | |
663 | d = ssh_set.kill() |
|
826 | d = ssh_set.kill() | |
664 | cl.interrupt_then_kill(1.0) |
|
827 | cl.interrupt_then_kill(1.0) | |
@@ -765,20 +928,77 b' def get_args():' | |||||
765 | help="how to call MPI_Init (default=mpi4py)" |
|
928 | help="how to call MPI_Init (default=mpi4py)" | |
766 | ) |
|
929 | ) | |
767 | parser_mpiexec.set_defaults(func=main_mpi, cmd='mpiexec') |
|
930 | parser_mpiexec.set_defaults(func=main_mpi, cmd='mpiexec') | |
768 |
|
931 | |||
769 | parser_pbs = subparsers.add_parser( |
|
932 | parser_pbs = subparsers.add_parser( | |
770 |
'pbs', |
|
933 | 'pbs', | |
771 | help='run a pbs cluster', |
|
934 | help='run a pbs cluster', | |
772 | parents=[base_parser] |
|
935 | parents=[base_parser] | |
773 | ) |
|
936 | ) | |
774 | parser_pbs.add_argument( |
|
937 | parser_pbs.add_argument( | |
|
938 | '-s', | |||
775 | '--pbs-script', |
|
939 | '--pbs-script', | |
776 |
type=str, |
|
940 | type=str, | |
777 | dest='pbsscript', |
|
941 | dest='pbsscript', | |
778 | help='PBS script template', |
|
942 | help='PBS script template', | |
779 |
default=' |
|
943 | default='' | |
|
944 | ) | |||
|
945 | parser_pbs.add_argument( | |||
|
946 | '-q', | |||
|
947 | '--queue', | |||
|
948 | type=str, | |||
|
949 | dest='pbsqueue', | |||
|
950 | help='PBS queue to use when starting the engines', | |||
|
951 | default=None, | |||
780 | ) |
|
952 | ) | |
781 | parser_pbs.set_defaults(func=main_pbs) |
|
953 | parser_pbs.set_defaults(func=main_pbs) | |
|
954 | ||||
|
955 | parser_sge = subparsers.add_parser( | |||
|
956 | 'sge', | |||
|
957 | help='run an sge cluster', | |||
|
958 | parents=[base_parser] | |||
|
959 | ) | |||
|
960 | parser_sge.add_argument( | |||
|
961 | '-s', | |||
|
962 | '--sge-script', | |||
|
963 | type=str, | |||
|
964 | dest='sgescript', | |||
|
965 | help='SGE script template', | |||
|
966 | default='' # SGEEngineSet will create one if not specified | |||
|
967 | ) | |||
|
968 | parser_sge.add_argument( | |||
|
969 | '-q', | |||
|
970 | '--queue', | |||
|
971 | type=str, | |||
|
972 | dest='sgequeue', | |||
|
973 | help='SGE queue to use when starting the engines', | |||
|
974 | default=None, | |||
|
975 | ) | |||
|
976 | parser_sge.set_defaults(func=main_sge) | |||
|
977 | ||||
|
978 | parser_lsf = subparsers.add_parser( | |||
|
979 | 'lsf', | |||
|
980 | help='run an lsf cluster', | |||
|
981 | parents=[base_parser] | |||
|
982 | ) | |||
|
983 | ||||
|
984 | parser_lsf.add_argument( | |||
|
985 | '-s', | |||
|
986 | '--lsf-script', | |||
|
987 | type=str, | |||
|
988 | dest='lsfscript', | |||
|
989 | help='LSF script template', | |||
|
990 | default='' # LSFEngineSet will create one if not specified | |||
|
991 | ) | |||
|
992 | ||||
|
993 | parser_lsf.add_argument( | |||
|
994 | '-q', | |||
|
995 | '--queue', | |||
|
996 | type=str, | |||
|
997 | dest='lsfqueue', | |||
|
998 | help='LSF queue to use when starting the engines', | |||
|
999 | default=None, | |||
|
1000 | ) | |||
|
1001 | parser_lsf.set_defaults(func=main_lsf) | |||
782 |
|
1002 | |||
783 | parser_ssh = subparsers.add_parser( |
|
1003 | parser_ssh = subparsers.add_parser( | |
784 | 'ssh', |
|
1004 | 'ssh', | |
@@ -786,6 +1006,14 b' def get_args():' | |||||
786 | parents=[base_parser] |
|
1006 | parents=[base_parser] | |
787 | ) |
|
1007 | ) | |
788 | parser_ssh.add_argument( |
|
1008 | parser_ssh.add_argument( | |
|
1009 | '-e', | |||
|
1010 | '--copyenvs', | |||
|
1011 | action='store_true', | |||
|
1012 | dest='copyenvs', | |||
|
1013 | help='Copy current shell environment to remote location', | |||
|
1014 | default=False, | |||
|
1015 | ) | |||
|
1016 | parser_ssh.add_argument( | |||
789 | '--clusterfile', |
|
1017 | '--clusterfile', | |
790 | type=str, |
|
1018 | type=str, | |
791 | dest='clusterfile', |
|
1019 | dest='clusterfile', |
@@ -53,7 +53,9 b' The :command:`ipcluster` command provides a simple way of starting a controller ' | |||||
53 | 2. When engines are started using the :command:`mpirun` command that comes |
|
53 | 2. When engines are started using the :command:`mpirun` command that comes | |
54 | with most MPI [MPI]_ implementations |
|
54 | with most MPI [MPI]_ implementations | |
55 | 3. When engines are started using the PBS [PBS]_ batch system. |
|
55 | 3. When engines are started using the PBS [PBS]_ batch system. | |
56 | 4. When the controller is started on localhost and the engines are started on |
|
56 | 4. When engines are started using the SGE [SGE]_ batch system. | |
|
57 | 5. When engines are started using the LSF [LSF]_ batch system. | |||
|
58 | 6. When the controller is started on localhost and the engines are started on | |||
57 | remote nodes using :command:`ssh`. |
|
59 | remote nodes using :command:`ssh`. | |
58 |
|
60 | |||
59 | .. note:: |
|
61 | .. note:: | |
@@ -126,49 +128,115 b' More details on using MPI with IPython can be found :ref:`here <parallelmpi>`.' | |||||
126 | Using :command:`ipcluster` in PBS mode |
|
128 | Using :command:`ipcluster` in PBS mode | |
127 | -------------------------------------- |
|
129 | -------------------------------------- | |
128 |
|
130 | |||
129 | The PBS mode uses the Portable Batch System [PBS]_ to start the engines. To use this mode, you first need to create a PBS script template that will be used to start the engines. Here is a sample PBS script template: |
|
131 | The PBS mode uses the Portable Batch System [PBS]_ to start the engines. | |
130 |
|
132 | |||
131 | .. sourcecode:: bash |
|
133 | To start an ipcluster using the Portable Batch System:: | |
132 |
|
||||
133 | #PBS -N ipython |
|
|||
134 | #PBS -j oe |
|
|||
135 | #PBS -l walltime=00:10:00 |
|
|||
136 | #PBS -l nodes=${n/4}:ppn=4 |
|
|||
137 | #PBS -q parallel |
|
|||
138 |
|
134 | |||
139 | cd $$PBS_O_WORKDIR |
|
135 | $ ipcluster pbs -n 12 | |
140 | export PATH=$$HOME/usr/local/bin |
|
|||
141 | export PYTHONPATH=$$HOME/usr/local/lib/python2.4/site-packages |
|
|||
142 | /usr/local/bin/mpiexec -n ${n} ipengine --logfile=$$PBS_O_WORKDIR/ipengine |
|
|||
143 |
|
136 | |||
144 | There are a few important points about this template: |
|
137 | The above command will launch a PBS job array with 12 tasks using the default queue. If you would like to submit the job to a different queue use the -q option: | |
145 |
|
138 | |||
146 | 1. This template will be rendered at runtime using IPython's :mod:`Itpl` |
|
139 | $ ipcluster pbs -n 12 -q hpcqueue | |
147 | template engine. |
|
|||
148 |
|
140 | |||
149 | 2. Instead of putting in the actual number of engines, use the notation |
|
141 | By default, ipcluster will generate and submit a job script to launch the engines. However, if you need to use your own job script use the -s option: | |
150 | ``${n}`` to indicate the number of engines to be started. You can also uses |
|
|||
151 | expressions like ``${n/4}`` in the template to indicate the number of |
|
|||
152 | nodes. |
|
|||
153 |
|
142 | |||
154 | 3. Because ``$`` is a special character used by the template engine, you must |
|
143 | $ ipcluster pbs -n 12 -q hpcqueue -s mypbscript.sh | |
155 | escape any ``$`` by using ``$$``. This is important when referring to |
|
|||
156 | environment variables in the template. |
|
|||
157 |
|
144 | |||
158 | 4. Any options to :command:`ipengine` should be given in the batch script |
|
145 | For example the default autogenerated script looks like:: | |
159 | template. |
|
|||
160 |
|
146 | |||
161 | 5. Depending on the configuration of you system, you may have to set |
|
147 | #PBS -q hpcqueue | |
162 | environment variables in the script template. |
|
148 | #!/bin/sh | |
|
149 | #PBS -V | |||
|
150 | #PBS -t 1-12 | |||
|
151 | #PBS -N ipengine | |||
|
152 | eid=$(($PBS_ARRAYID - 1)) | |||
|
153 | ipengine --logfile=ipengine${eid}.log | |||
163 |
|
154 | |||
164 | Once you have created such a script, save it with a name like :file:`pbs.template`. Now you are ready to start your job:: |
|
155 | .. note:: | |
165 |
|
156 | |||
166 | $ ipcluster pbs -n 128 --pbs-script=pbs.template |
|
157 | ipcluster relies on using PBS job arrays to start the | |
|
158 | engines. If you specify your own job script without specifying the | |||
|
159 | job array settings ipcluster will automatically add the job array | |||
|
160 | settings (#PBS -t 1-N) to your script. | |||
167 |
|
161 | |||
168 | Additional command line options for this mode can be found by doing:: |
|
162 | Additional command line options for this mode can be found by doing:: | |
169 |
|
163 | |||
170 | $ ipcluster pbs -h |
|
164 | $ ipcluster pbs -h | |
171 |
|
165 | |||
|
166 | Using :command:`ipcluster` in SGE mode | |||
|
167 | -------------------------------------- | |||
|
168 | ||||
|
169 | The SGE mode uses the Sun Grid Engine [SGE]_ to start the engines. | |||
|
170 | ||||
|
171 | To start an ipcluster using Sun Grid Engine:: | |||
|
172 | ||||
|
173 | $ ipcluster sge -n 12 | |||
|
174 | ||||
|
175 | The above command will launch an SGE job array with 12 tasks using the default queue. If you would like to submit the job to a different queue use the -q option: | |||
|
176 | ||||
|
177 | $ ipcluster sge -n 12 -q hpcqueue | |||
|
178 | ||||
|
179 | By default, ipcluster will generate and submit a job script to launch the engines. However, if you need to use your own job script use the -s option: | |||
|
180 | ||||
|
181 | $ ipcluster sge -n 12 -q hpcqueue -s mysgescript.sh | |||
|
182 | ||||
|
183 | For example the default autogenerated script looks like:: | |||
|
184 | ||||
|
185 | #$ -q hpcqueue | |||
|
186 | #$ -V | |||
|
187 | #$ -S /bin/sh | |||
|
188 | #$ -t 1-12 | |||
|
189 | #$ -N ipengine | |||
|
190 | eid=$(($SGE_TASK_ID - 1)) | |||
|
191 | ipengine --logfile=ipengine${eid}.log #$ -V | |||
|
192 | ||||
|
193 | .. note:: | |||
|
194 | ||||
|
195 | ipcluster relies on using SGE job arrays to start the engines. If | |||
|
196 | you specify your own job script without specifying the job array | |||
|
197 | settings ipcluster will automatically add the job array settings (#$ -t | |||
|
198 | 1-N) to your script. | |||
|
199 | ||||
|
200 | Additional command line options for this mode can be found by doing:: | |||
|
201 | ||||
|
202 | $ ipcluster sge -h | |||
|
203 | ||||
|
204 | Using :command:`ipcluster` in LSF mode | |||
|
205 | -------------------------------------- | |||
|
206 | ||||
|
207 | The LSF mode uses the Load Sharing Facility [LSF]_ to start the engines. | |||
|
208 | ||||
|
209 | To start an ipcluster using the Load Sharing Facility:: | |||
|
210 | ||||
|
211 | $ ipcluster lsf -n 12 | |||
|
212 | ||||
|
213 | The above command will launch an LSF job array with 12 tasks using the default queue. If you would like to submit the job to a different queue use the -q option: | |||
|
214 | ||||
|
215 | $ ipcluster lsf -n 12 -q hpcqueue | |||
|
216 | ||||
|
217 | By default, ipcluster will generate and submit a job script to launch the engines. However, if you need to use your own job script use the -s option: | |||
|
218 | ||||
|
219 | $ ipcluster lsf -n 12 -q hpcqueue -s mylsfscript.sh | |||
|
220 | ||||
|
221 | For example the default autogenerated script looks like:: | |||
|
222 | ||||
|
223 | #BSUB -q hpcqueue | |||
|
224 | #!/bin/sh | |||
|
225 | #BSUB -J ipengine[1-12] | |||
|
226 | eid=$(($LSB_JOBINDEX - 1)) | |||
|
227 | ipengine --logfile=ipengine${eid}.log | |||
|
228 | ||||
|
229 | .. note:: | |||
|
230 | ||||
|
231 | ipcluster relies on using LSF job arrays to start the engines. If you | |||
|
232 | specify your own job script without specifying the job array settings | |||
|
233 | ipcluster will automatically add the job array settings (#BSUB -J | |||
|
234 | ipengine[1-N]) to your script. | |||
|
235 | ||||
|
236 | Additional command line options for this mode can be found by doing:: | |||
|
237 | ||||
|
238 | $ ipcluster lsf -h | |||
|
239 | ||||
172 | Using :command:`ipcluster` in SSH mode |
|
240 | Using :command:`ipcluster` in SSH mode | |
173 | -------------------------------------- |
|
241 | -------------------------------------- | |
174 |
|
242 | |||
@@ -348,4 +416,6 b' the log files to us will often help us to debug any problems.' | |||||
348 |
|
416 | |||
349 |
|
417 | |||
350 | .. [PBS] Portable Batch System. http://www.openpbs.org/ |
|
418 | .. [PBS] Portable Batch System. http://www.openpbs.org/ | |
|
419 | .. [SGE] Sun Grid Engine. http://www.sun.com/software/sge/ | |||
|
420 | .. [LSF] Load Sharing Facility. http://www.platform.com/ | |||
351 | .. [SSH] SSH-Agent http://en.wikipedia.org/wiki/Ssh-agent |
|
421 | .. [SSH] SSH-Agent http://en.wikipedia.org/wiki/Ssh-agent |
General Comments 0
You need to be logged in to leave comments.
Login now