Skip to content

Commit d445a09

Browse files
authored
Added ability to send the client job heartbeat calls to server. (#2016)
* Added ability to send the client job heartbeat calls to server. * Code style fixes. * Changed to use aux_message for sending the client job heartbeat calls. * Made the heartbeat thread a daemon thread (daemon=True).
1 parent 85b4e1a commit d445a09

File tree

3 files changed

+35
-0
lines changed

3 files changed

+35
-0
lines changed

nvflare/apis/fl_constant.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ class ReservedTopic(object):
162162
END_RUN = "__end_run__"
163163
ABORT_ASK = "__abort_task__"
164164
AUX_COMMAND = "__aux_command__"
165+
JOB_HEART_BEAT = "__job_heartbeat__"
165166

166167

167168
class AdminCommandNames(object):

nvflare/private/fed/client/client_runner.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from nvflare.apis.shareable import Shareable, make_reply
2323
from nvflare.apis.signal import Signal
2424
from nvflare.apis.utils.fl_context_utils import add_job_audit_event
25+
from nvflare.fuel.f3.cellnet.fqcn import FQCN
2526
from nvflare.private.defs import SpecialTaskName, TaskConstant
2627
from nvflare.private.fed.client.client_engine_executor_spec import ClientEngineExecutorSpec, TaskAssignment
2728
from nvflare.private.privacy_manager import Scope
@@ -392,6 +393,9 @@ def _check_stop_conditions(self, fl_ctx: FLContext) -> bool:
392393
return False
393394

394395
def _try_run(self):
396+
heartbeat_thread = threading.Thread(target=self.send_job_heartbeat, args=[], daemon=True)
397+
heartbeat_thread.start()
398+
395399
while not self.asked_to_stop:
396400
with self.engine.new_context() as fl_ctx:
397401
if self._check_stop_conditions(fl_ctx):
@@ -404,6 +408,25 @@ def _try_run(self):
404408

405409
time.sleep(task_fetch_interval)
406410

411+
def send_job_heartbeat(self, interval=30.0):
    """Repeatedly send a job heartbeat aux request to the server until asked to stop.

    Each cycle sends a ``ReservedTopic.JOB_HEART_BEAT`` aux request to
    ``FQCN.ROOT_SERVER`` and then waits ``interval`` seconds before the next
    one. The wait is broken into short slices so that a change of
    ``self.asked_to_stop`` is noticed within about two seconds.

    Args:
        interval: seconds to wait between two heartbeats. A non-positive
            value falls back to a single polling slice so the loop can
            never spin without sleeping.
    """
    # Longest single sleep; bounds how long a stop request can go unnoticed.
    poll_interval = 2.0
    if interval <= 0:
        interval = poll_interval

    # The same empty request object is reused for every heartbeat.
    request = Shareable()
    while not self.asked_to_stop:
        with self.engine.new_context() as fl_ctx:
            self.engine.send_aux_request(
                targets=[FQCN.ROOT_SERVER],
                topic=ReservedTopic.JOB_HEART_BEAT,
                request=request,
                timeout=0,
                fl_ctx=fl_ctx,
                optional=True,
            )

        # Wait against a monotonic deadline instead of counting fixed 2 s
        # slices: intervals shorter than one slice (or not a multiple of it)
        # are honored rather than truncated to zero wait.
        deadline = time.monotonic() + interval
        while not self.asked_to_stop:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(poll_interval, remaining))
429+
407430
def fetch_and_run_one_task(self, fl_ctx) -> (float, bool):
408431
"""Fetches and runs a task.
409432

nvflare/private/fed/server/server_runner.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,13 @@ def __init__(self, config: ServerRunnerConfig, job_id: str, engine: ServerEngine
9999
self.status = "init"
100100
self.turn_to_cold = False
101101

102+
self._register_aux_message_handler(engine)
103+
104+
def _register_aux_message_handler(self, engine):
    """Register this runner's job heartbeat handler with the given engine.

    Args:
        engine: object exposing ``register_aux_message_handler``; the handler
            is registered under the ``ReservedTopic.JOB_HEART_BEAT`` topic.
    """
    heartbeat_topic = ReservedTopic.JOB_HEART_BEAT
    heartbeat_handler = self._handle_job_heartbeat
    engine.register_aux_message_handler(topic=heartbeat_topic, message_handle_func=heartbeat_handler)
108+
102109
def _execute_run(self):
103110
while self.current_wf_index < len(self.config.workflows):
104111
wf = self.config.workflows[self.current_wf_index]
@@ -489,6 +496,10 @@ def process_submission(self, client: Client, task_name: str, task_id: str, resul
489496
"Error processing client result by {}: {}".format(self.current_wf.id, secure_format_exception(e)),
490497
)
491498

499+
def _handle_job_heartbeat(self, topic: str, request: Shareable, fl_ctx: FLContext) -> Shareable:
    """Acknowledge a client job heartbeat aux request.

    Logs that a heartbeat arrived and answers with an OK reply. Neither
    ``topic`` nor ``request`` is inspected.

    Args:
        topic: topic the aux request was sent on (unused).
        request: the incoming heartbeat request (unused).
        fl_ctx: context passed to the logging call.

    Returns:
        A reply built with ``make_reply(ReturnCode.OK)``.
    """
    self.log_info(fl_ctx, "received client job_heartbeat aux request")
    ok_reply = make_reply(ReturnCode.OK)
    return ok_reply
502+
492503
def abort(self, fl_ctx: FLContext, turn_to_cold: bool = False):
493504
self.status = "done"
494505
self.abort_signal.trigger(value=True)

0 commit comments

Comments
 (0)