diff --git a/IPython/parallel/engine/engine.py b/IPython/parallel/engine/engine.py index 94e27de..840442e 100644 --- a/IPython/parallel/engine/engine.py +++ b/IPython/parallel/engine/engine.py @@ -25,7 +25,7 @@ from zmq.eventloop import ioloop, zmqstream from IPython.external.ssh import tunnel # internal from IPython.utils.traitlets import ( - Instance, Dict, Integer, Type, CFloat, CInt, Unicode, CBytes, Bool + Instance, Dict, Integer, Type, Float, Integer, Unicode, CBytes, Bool ) from IPython.utils.py3compat import cast_bytes @@ -50,13 +50,15 @@ class EngineFactory(RegistrationFactory): help="""The location (an IP address) of the controller. This is used for disambiguating URLs, to determine whether loopback should be used to connect or the public address.""") - timeout=CFloat(5, config=True, + timeout=Float(5.0, config=True, help="""The time (in seconds) to wait for the Controller to respond to registration requests before giving up.""") - hb_check_period=CFloat(5, config=True, - help="""The time (in seconds) to check for a heartbeat ping from the - Controller.""") - hb_max_misses=CInt(5, config=True, + hb_check_period=Integer(5000, config=True, + help="""The time (in ms) to check for a heartbeat ping from the + Controller. Ensure that check period is bigger than the heartbeat period + from the controller (set via "HeartMonitor.period" in the Controller config) + so that at least one ping is received during each check period.""") + hb_max_misses=Integer(5, config=True, help="""The maximum number of times a check for the heartbeat ping of a controller can be missed before shutting down the engine.""") sshserver=Unicode(config=True, @@ -262,12 +264,12 @@ class EngineFactory(RegistrationFactory): self._hb_listener.flush() if self._hb_last_monitored > self._hb_last_pinged: self._hb_missed_beats += 1 - self.log.warn("No heartbeat in the last %s seconds.", self.hb_check_period) + self.log.warn("No heartbeat in the last %s ms (%s time(s) in a row).", self.hb_check_period, self._hb_missed_beats) else: self._hb_missed_beats = 0 if self._hb_missed_beats >= self.hb_max_misses: - self.log.fatal("Maximum number of heartbeats misses reached (%s times %s seconds), shutting down.", + self.log.fatal("Maximum number of heartbeats misses reached (%s times %s ms), shutting down.", self.hb_max_misses, self.hb_check_period) self.session.send(self.registrar, "unregistration_request", content=dict(id=self.id)) self.loop.stop() @@ -281,7 +283,7 @@ class EngineFactory(RegistrationFactory): self._abort_dc = ioloop.DelayedCallback(self.abort, self.timeout*1000, self.loop) self._abort_dc.start() # periodically check the heartbeat pings of the controller - self._hb_reporter = ioloop.PeriodicCallback(self._hb_monitor, self.hb_check_period* 1000, self.loop) + self._hb_reporter = ioloop.PeriodicCallback(self._hb_monitor, self.hb_check_period, self.loop) self._hb_reporter.start()