Enhance recipe experiment tracking (#3655)

YuanTingHsieh · Copilot · web-flow · commit d66d14e8f019 · 2025-09-04T17:00:20.000-07:00
For MLflowReceiver, it is useful to include the job name in the run name.

The "receiver" argument in recipe.job.to_server(receiver, "receiver") is
hardcoded for now to be consistent with BaseFedJob; this also needs to be
fixed in the next release.

### Description
MLflowReceiver:
- If no run name is provided, include the job name in the default run
name.

### Types of changes
<!--- Put an `x` in all the boxes that apply, and remove the not
applicable items -->
- [x] Non-breaking change (fix or new feature that would not break
existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing
functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Quick tests passed locally by running `./runtest.sh`.
- [ ] In-line docstrings updated.
- [ ] Documentation updated.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/examples/hello-world/hello-tf/client.py b/examples/hello-world/hello-tf/client.py
@@ -16,12 +16,14 @@
 from model import Net
 
 import nvflare.client as flare
+from nvflare.client.tracking import SummaryWriter
 
 WEIGHTS_PATH = "./tf_model.weights.h5"
 
 
 def main():
     flare.init()
+    writer = SummaryWriter()
 
     sys_info = flare.system_info()
     print(f"system info is: {sys_info}", flush=True)
@@ -69,6 +71,7 @@ def main():
         print(
             f"Accuracy of the received model on round {input_model.current_round} on the test images: {test_global_acc * 100} %"
         )
+        writer.add_scalar(tag="local_acc", scalar=test_global_acc)
 
         # training
         model.fit(train_images, train_labels, epochs=1, validation_data=(test_images, test_labels))
diff --git a/nvflare/app_opt/tracking/mlflow/mlflow_receiver.py b/nvflare/app_opt/tracking/mlflow/mlflow_receiver.py
@@ -23,8 +23,9 @@
 
 from nvflare.apis.analytix import ANALYTIC_EVENT_TYPE, AnalyticsData, AnalyticsDataType, LogWriterName, TrackConst
 from nvflare.apis.dxo import from_shareable
-from nvflare.apis.fl_constant import ProcessType
+from nvflare.apis.fl_constant import ProcessType, ReservedKey
 from nvflare.apis.fl_context import FLContext
+from nvflare.apis.job_def import JobMetaKey
 from nvflare.apis.shareable import Shareable
 from nvflare.app_common.widgets.streaming import AnalyticsReceiver
 
@@ -41,6 +42,14 @@ def get_current_time_millis():
     return int(round(time.time() * 1000))
 
 
+def _get_job_name_from_fl_ctx(fl_ctx: FLContext, default=None):
+    # TODO: it might be good to have a function in fl_context to get the job name
+    job_meta = fl_ctx.get_prop(ReservedKey.JOB_META)
+    if job_meta and isinstance(job_meta, dict):
+        return job_meta.get(JobMetaKey.JOB_NAME, default)
+    return default
+
+
 class MLflowReceiver(AnalyticsReceiver):
     def __init__(
         self,
@@ -73,14 +82,15 @@ def __init__(
                 less delay. Keep in mind that reducing the buffer_flush_time will potentially cause high
                 traffic to the MLflow tracking server, which in some cases can actually cause more latency.
         """
+        if not isinstance(tracking_uri, (str, type(None))):
+            raise ValueError("tracking_uri needs to be either None or str")
         if events is None:
             events = ["fed." + ANALYTIC_EVENT_TYPE]
         super().__init__(events=events)
         self.artifact_location = artifact_location if artifact_location is not None else "artifacts"
 
         self.kw_args = kw_args if kw_args else {}
         self.tracking_uri = tracking_uri
-        self.mlflow = mlflow
         self.mlflow_clients: Dict[str, MlflowClient] = {}
         self.experiment_id = None
         self.run_ids = {}
@@ -164,8 +174,9 @@ def _mlflow_setup(self, art_full_path, experiment_name, experiment_tags, site_na
                 )
 
                 job_id_tag = self._get_job_id_tag(fl_ctx)
+                job_name = _get_job_name_from_fl_ctx(fl_ctx)
 
-                run_name = self._get_run_name(self.kw_args, site_name, job_id_tag)
+                run_name = self._get_run_name(self.kw_args, site_name, job_id_tag, job_name)
                 tags = self._get_run_tags(self.kw_args, job_id_tag, run_name)
                 run = mlflow_client.create_run(experiment_id=self.experiment_id, run_name=run_name, tags=tags)
                 self.run_ids[site_name] = run.info.run_id
@@ -179,9 +190,10 @@ def _init_buffer(self, site_names: List[str]):
                 AnalyticsDataType.TAGS: [],
             }
 
-    def _get_run_name(self, kwargs: dict, site_name: str, job_id_tag: str):
+    def _get_run_name(self, kwargs: dict, site_name: str, job_id_tag: str, job_name: str):
         run_name = kwargs.get(TrackConst.RUN_NAME, DEFAULT_RUN_NAME)
-        return f"{site_name}-{job_id_tag[:6]}-{run_name}"
+        job_name_str = job_name if job_name is not None else "unknown_job"
+        return f"{site_name}-{job_id_tag[:6]}-{job_name_str}-{run_name}"
 
     def _get_run_tags(self, kwargs, job_id_tag: str, run_name: str):
         run_tags = self._get_tags(TrackConst.RUN_TAGS, kwargs=kwargs)
diff --git a/nvflare/recipe/utils.py b/nvflare/recipe/utils.py
@@ -56,4 +56,4 @@ def add_experiment_tracking(recipe: Recipe, tracking_type: str, tracking_config:
     module = importlib.import_module(TRACKING_REGISTRY[tracking_type]["receiver_module"])
     receiver_class = getattr(module, TRACKING_REGISTRY[tracking_type]["receiver_class"])
     receiver = receiver_class(**tracking_config)
-    recipe.job.to_server(receiver)
+    recipe.job.to_server(receiver, "receiver")