NVIDIA
diff --git a/examples/hello-world/hello-lr/job.py b/examples/hello-world/hello-lr/job.py
Lines changed: 2 additions & 2 deletions
diff --git a/nvflare/recipe/__init__.py b/nvflare/recipe/__init__.py
Lines changed: 2 additions & 2 deletions
diff --git a/nvflare/recipe/poc_env.py b/nvflare/recipe/poc_env.py
Lines changed: 23 additions & 51 deletions
diff --git a/nvflare/recipe/prod_env.py b/nvflare/recipe/prod_env.py
Lines changed: 27 additions & 40 deletions
diff --git a/nvflare/recipe/run.py b/nvflare/recipe/run.py
Lines changed: 7 additions & 101 deletions
@@ -17,7 +17,7 @@
 from nvflare.app_common.np.recipes.lr.fedavg import FedAvgLrRecipe
 from nvflare.recipe import SimEnv
 
-# from nvflare.recipe import POCEnv
+# from nvflare.recipe import PocEnv
 
 
 def define_parser():
@@ -45,7 +45,7 @@ def main():
         train_args=f"--data_root {data_root}",
     )
     env = SimEnv(num_clients=n_clients, num_threads=n_clients)
-    # env = POCEnv(num_clients=n_clients)
+    # env = PocEnv(num_clients=n_clients)
     run = recipe.execute(env)
     w = run.get_result()
     print("result location =", w)
 
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .poc_env import POCEnv
+from .poc_env import PocEnv
 from .prod_env import ProdEnv
 from .run import Run
 from .sim_env import SimEnv
 from .utils import add_experiment_tracking
 
-__all__ = ["SimEnv", "POCEnv", "ProdEnv", "Run", "add_experiment_tracking"]
+__all__ = ["SimEnv", "PocEnv", "ProdEnv", "Run", "add_experiment_tracking"]
@@ -14,14 +14,13 @@
 
 import os
 import shutil
-import tempfile
 import time
 from typing import Optional
 
 from pydantic import BaseModel, conint, model_validator
 
-from nvflare.fuel.flare_api.flare_api import new_secure_session
 from nvflare.job_config.api import FedJob
+from nvflare.recipe.spec import ExecEnv
 from nvflare.tool.poc.poc_commands import (
     _clean_poc,
     _start_poc,
@@ -34,7 +33,7 @@
 )
 from nvflare.tool.poc.service_constants import FlareServiceConstants as SC
 
-from .spec import ExecEnv, ExecEnvType
+from .session_mgr import SessionManager
 
 STOP_POC_TIMEOUT = 10
 SERVICE_START_TIMEOUT = 3
@@ -70,7 +69,7 @@ def check_client_configuration(self):
         return self
 
 
-class POCEnv(ExecEnv):
+class PocEnv(ExecEnv):
     """Proof of Concept execution environment for local testing and development.
 
     This environment sets up a POC deployment on a single machine with multiple
@@ -123,18 +122,7 @@ def __init__(
         self.project_conf_path = v.project_conf_path
         self.docker_image = v.docker_image
         self.username = v.username
-
-    def get_env_info(self) -> dict:
-        return {
-            "env_type": ExecEnvType.POC,
-            "startup_kit_location": self._get_admin_startup_kit_path(),
-            "num_clients": self.num_clients,
-            "gpu_ids": self.gpu_ids,
-            "use_he": self.use_he,
-            "docker_image": self.docker_image,
-            "project_conf_path": self.project_conf_path,
-            "username": self.username,
-        }
+        self._session_manager = None  # Lazy initialization
 
     def deploy(self, job: FedJob):
         """Deploy a FedJob to the POC environment.
@@ -170,12 +158,8 @@ def deploy(self, job: FedJob):
         # Give services time to start up
         time.sleep(SERVICE_START_TIMEOUT)
 
-        # Submit job using Flare API like ProdEnv
-        with tempfile.TemporaryDirectory() as temp_dir:
-            job.export_job(temp_dir)
-            job_path = os.path.join(temp_dir, job.name)
-
-            return self._submit_and_monitor_job(job_path, job.name)
+        # Submit job using SessionManager
+        return self._get_session_manager().submit_job(job)
 
     def _check_poc_running(self) -> bool:
         try:
@@ -225,37 +209,14 @@ def stop(self, clean_poc: bool = False):
         print(f"Removing POC workspace: {self.poc_workspace}")
         shutil.rmtree(self.poc_workspace, ignore_errors=True)
 
-    def _submit_and_monitor_job(self, job_path: str, job_name: str) -> str:
-        """Submit and monitor job via Flare API using a single session.
+    def get_job_status(self, job_id: str) -> Optional[str]:
+        return self._get_session_manager().get_job_status(job_id)
 
-        Args:
-            job_path: Path to the exported job directory.
-            job_name: Name of the job for logging.
+    def abort_job(self, job_id: str) -> None:
+        self._get_session_manager().abort_job(job_id)
 
-        Returns:
-            str: Job ID returned by the system.
-        """
-        sess = None
-        try:
-            # Get the admin startup kit path for POC
-            admin_dir = self._get_admin_startup_kit_path()
-
-            # Create secure session with POC admin (reuse for both submit and monitor)
-            sess = new_secure_session(
-                username=self.username,
-                startup_kit_location=admin_dir,
-            )
-
-            # Submit the job
-            job_id = sess.submit_job(job_path)
-
-            return job_id
-        except Exception as e:
-            raise RuntimeError(f"Failed to submit/monitor job via Flare API: {e}")
-
-        finally:
-            if sess:
-                sess.close()
+    def get_job_result(self, job_id: str, timeout: float = 0.0) -> Optional[str]:
+        return self._get_session_manager().get_job_result(job_id, timeout)
 
     def _get_admin_startup_kit_path(self) -> str:
         """Get the path to the admin startup kit for POC.
@@ -279,3 +240,14 @@ def _get_admin_startup_kit_path(self) -> str:
 
         except Exception as e:
             raise RuntimeError(f"Failed to locate admin startup kit: {e}")
+
+    def _get_session_manager(self):
+        """Return the cached SessionManager; on first call build it from username, startup kit and timeout."""
+        if self._session_manager is None:
+            session_params = {
+                "username": self.username,
+                "startup_kit_location": self._get_admin_startup_kit_path(),
+                "timeout": self.get_extra_prop("login_timeout", 10),
+            }
+            self._session_manager = SessionManager(session_params)
+        return self._session_manager
@@ -13,32 +13,18 @@
 # limitations under the License.
 
 import os.path
-import tempfile
+from typing import Optional
 
 from pydantic import BaseModel, PositiveFloat, model_validator
 
-from nvflare.fuel.flare_api.flare_api import Session, new_secure_session
 from nvflare.job_config.api import FedJob
+from nvflare.recipe.spec import ExecEnv
 
-from .spec import ExecEnv, ExecEnvType
+from .session_mgr import SessionManager
 
 DEFAULT_ADMIN_USER = "admin@nvidia.com"
 
 
-def status_monitor_cb(session: Session, job_id: str, job_meta, *cb_args, **cb_kwargs) -> bool:
-    if job_meta["status"] == "RUNNING":
-        if cb_kwargs["cb_run_counter"]["count"] < 3 or cb_kwargs["cb_run_counter"]["count"] % 15 == 0:
-            print(job_meta)
-        else:
-            # avoid printing job_meta repeatedly to save space on the screen and not overwhelm the user
-            print(".", end="")
-    else:
-        print("\n" + str(job_meta))
-
-    cb_kwargs["cb_run_counter"]["count"] += 1
-    return True
-
-
 # Internal — not part of the public API
 class _ProdEnvValidator(BaseModel):
     startup_kit_location: str
@@ -81,30 +67,31 @@ def __init__(
         self.startup_kit_location = v.startup_kit_location
         self.login_timeout = v.login_timeout
         self.username = v.username
+        self._session_manager = None  # Lazy initialization
+
+    def get_job_status(self, job_id: str) -> Optional[str]:
+        return self._get_session_manager().get_job_status(job_id)
+
+    def abort_job(self, job_id: str) -> None:
+        self._get_session_manager().abort_job(job_id)
+
+    def get_job_result(self, job_id: str, timeout: float = 0.0) -> Optional[str]:
+        return self._get_session_manager().get_job_result(job_id, timeout)
 
     def deploy(self, job: FedJob):
-        sess = None
+        """Submit ``job`` via the SessionManager; raise RuntimeError (chained) on any failure."""
         try:
-            sess = new_secure_session(
-                username=self.username, startup_kit_location=self.startup_kit_location, timeout=self.login_timeout
-            )
-            with tempfile.TemporaryDirectory() as temp_dir:
-                job.export_job(temp_dir)
-                job_path = os.path.join(temp_dir, job.name)
-                job_id = sess.submit_job(job_path)
-                print(f"Submitted job '{job.name}' with ID: {job_id}")
-
-            return job_id
+            return self._get_session_manager().submit_job(job)
         except Exception as e:
-            raise RuntimeError(f"Failed to submit/monitor job via Flare API: {e}")
-        finally:
-            if sess:
-                sess.close()
-
-    def get_env_info(self) -> dict:
-        return {
-            "env_type": ExecEnvType.PROD,
-            "startup_kit_location": self.startup_kit_location,
-            "login_timeout": self.login_timeout,
-            "username": self.username,
-        }
+            raise RuntimeError(f"Failed to submit job via Flare API: {e}") from e
+
+    def _get_session_manager(self):
+        """Return the cached SessionManager; on first call build it from username, startup kit and timeout."""
+        if self._session_manager is None:
+            session_params = {
+                "username": self.username,
+                "startup_kit_location": self.startup_kit_location,
+                "timeout": self.login_timeout,
+            }
+            self._session_manager = SessionManager(session_params)
+        return self._session_manager
@@ -12,110 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-from contextlib import contextmanager
-from typing import Generator, Optional
+from typing import Optional
 
-from nvflare.fuel.flare_api.api_spec import MonitorReturnCode
-from nvflare.fuel.flare_api.flare_api import Session, new_secure_session
-
-
-def _cb_with_print(session: Session, job_id: str, job_meta, *cb_args, **cb_kwargs) -> bool:
-    """Callback to print job meta."""
-    # cb_run_counter is a dictionary that is passed to the callback and is used to keep track of the number of times the callback has been called
-    if cb_kwargs["cb_run_counter"]["count"] == 0:
-        print("Job ID: ", job_id)
-        print("Job Meta: ", job_meta)
-
-    if job_meta["status"] == "RUNNING":
-        print(".", end="")
-    else:
-        print("\n" + str(job_meta))
-
-    cb_kwargs["cb_run_counter"]["count"] += 1
-    return True
+from nvflare.recipe.spec import ExecEnv
 
 
 class Run:
-    def __init__(self, env_info: dict, job_id: str):
-        self.env_info = env_info
+    def __init__(self, exec_env: ExecEnv, job_id: str):
+        self.exec_env = exec_env
         self.job_id = job_id
-        self.handlers = {
-            "sim": self._get_sim_result,
-            "poc": self._get_prod_result,
-            "prod": self._get_prod_result,
-        }
 
     def get_job_id(self) -> str:
         return self.job_id
 
-    def _is_sim_env(self) -> bool:
-        """Check if this is a simulation environment."""
-        return self.env_info.get("env_type") == "sim"
-
-    def _get_session_params(self) -> dict:
-        """Get session parameters from env_info."""
-        return {
-            "startup_kit_location": self.env_info.get("startup_kit_location"),
-            "username": self.env_info.get("username"),
-            "timeout": self.env_info.get("login_timeout", 10),
-        }
-
-    @contextmanager
-    def _secure_session(self) -> Generator:
-        """Context manager for secure session handling."""
-        sess = None
-        try:
-            sess = new_secure_session(**self._get_session_params())
-            yield sess
-        except Exception as e:
-            raise RuntimeError(f"Failed to create/use session: {e}")
-        finally:
-            if sess:
-                sess.close()
-
     def get_status(self) -> Optional[str]:
         """Get the status of the run.
 
         Returns:
             Optional[str]: The status of the run, or None if called in a simulation environment.
         """
-        if self._is_sim_env():
-            print(
-                "get_status is not supported in a simulation environment, please check the log inside the workspace returned by get_result()"
-            )
-            return None
-
-        with self._secure_session() as sess:
-            return sess.get_job_status(self.job_id)
-
-    def _get_sim_result(self, **kwargs) -> str:
-        workspace_root = self.env_info.get("workspace_root")
-        if workspace_root is None:
-            raise RuntimeError("Simulation workspace_root is None - SimEnv may not be properly initialized")
-        return os.path.join(workspace_root, self.job_id)
-
-    def _get_prod_result(self, timeout: float = 0.0) -> Optional[str]:
-        with self._secure_session() as sess:
-            cb_run_counter = {"count": 0}
-            rc = sess.monitor_job(self.job_id, timeout=timeout, cb=_cb_with_print, cb_run_counter=cb_run_counter)
-            print(f"job monitor done: {rc=}")
-            if rc == MonitorReturnCode.JOB_FINISHED:
-                return sess.download_job_result(self.job_id)
-            elif rc == MonitorReturnCode.TIMEOUT:
-                print(
-                    f"Job {self.job_id} did not complete within {timeout} seconds. "
-                    "Job is still running. Try calling get_result() again with a longer timeout."
-                )
-                return None
-            elif rc == MonitorReturnCode.ENDED_BY_CB:
-                print(
-                    "Job monitoring was stopped early by callback. "
-                    "Result may not be available yet. Check job status and try again."
-                )
-                return None
-            else:
-                raise RuntimeError(f"Unexpected monitor return code: {rc}")
+        return self.exec_env.get_job_status(self.job_id)
 
     def get_result(self, timeout: float = 0.0) -> Optional[str]:
         """Get the result workspace of the run.
@@ -127,18 +43,8 @@ def get_result(self, timeout: float = 0.0) -> Optional[str]:
         Returns:
             Optional[str]: The result workspace path if job completed, None if still running or stopped early.
         """
-        env_type = self.env_info.get("env_type")
-        return self.handlers[env_type](timeout=timeout)
+        return self.exec_env.get_job_result(self.job_id, timeout=timeout)
 
     def abort(self):
         """Abort the running job."""
-        if self._is_sim_env():
-            print("abort is not supported in a simulation environment, it will always run to completion.")
-            return
-
-        try:
-            with self._secure_session() as sess:
-                msg = sess.abort_job(self.job_id)
-                print(f"Job {self.job_id} aborted successfully with message: {msg}")
-        except Exception as e:
-            print(f"Failed to abort job {self.job_id}: {e}")
+        self.exec_env.abort_job(self.job_id)