Skip to content

Commit 4327d7c

Browse files
yhwen authored and IsaacYangSLA committed
added handle for the empty return code file.
1 parent 8cb6ab8 commit 4327d7c

File tree

3 files changed

+13
-9
lines changed

3 files changed

+13
-9
lines changed

nvflare/private/fed/client/client_executor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ def _wait_child_process_finish(self, client, job_id, allocated_resource, token,
427427
if child_process:
428428
child_process.wait()
429429

430-
return_code = get_return_code(child_process, job_id, workspace)
430+
return_code = get_return_code(child_process, job_id, workspace, self.logger)
431431

432432
self.logger.info(f"run ({job_id}): child worker process finished with RC {return_code}")
433433
if return_code in [ProcessExitCode.UNSAFE_COMPONENT, ProcessExitCode.CONFIG_ERROR]:

nvflare/private/fed/server/server_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ def wait_for_complete(self, workspace, job_id, process):
211211
break
212212
time.sleep(0.1)
213213
with self.lock:
214-
return_code = get_return_code(process, job_id, workspace)
214+
return_code = get_return_code(process, job_id, workspace, self.logger)
215215
# if process exit but with Execution exception
216216
if return_code and return_code != 0:
217217
self.logger.info(f"Job: {job_id} child process exit with return code {return_code}")

nvflare/private/fed/utils/fed_utils.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -317,16 +317,20 @@ def get_target_names(targets):
317317
return target_names
318318

319319

320-
def get_return_code(process, job_id, workspace):
320+
def get_return_code(process, job_id, workspace, logger):
321321
run_dir = os.path.join(workspace, job_id)
322322
rc_file = os.path.join(run_dir, FLMetaKey.PROCESS_RC_FILE)
323-
try:
324-
if os.path.exists(rc_file):
323+
if os.path.exists(rc_file):
324+
try:
325325
with open(rc_file, "r") as f:
326326
return_code = int(f.readline())
327327
os.remove(rc_file)
328-
else:
328+
except Exception:
329+
logger.warning(
330+
f"Could not get the return_code from {rc_file} of the job:{job_id}, "
331+
f"Return the RC from the process:{process.pid}"
332+
)
329333
return_code = process.poll()
330-
return return_code
331-
except Exception:
332-
raise RuntimeError(f"Could not get the return_code of the {job_id} execution, process_id:{process.pid}")
334+
else:
335+
return_code = process.poll()
336+
return return_code

0 commit comments

Comments
 (0)