
Commit 5912555: Draft of validation script
1 parent 41560fc

8 files changed: 435 additions, 100 deletions

examples/weather/temporal_interpolation/README.md

Lines changed: 2 additions & 1 deletion

@@ -69,7 +69,8 @@ To train a temporal interpolation model, ensure you have the following:
   containing a 1D array with length equal to the number of variables in the dataset,
   with each value giving the mean (for `global_means.npy`) or standard deviation (for
   `global_stds.npy`) of the corresponding variable.
-* A JSON file with metadata about the contents of the HDF5 files. Refer to [data sample](https://github.com/NVIDIA/physicsnemo/blob/main/examples/weather/temporal_interpolation/data.json)
+* A JSON file with metadata about the contents of the HDF5 files. Refer to the [data
+  sample](https://github.com/NVIDIA/physicsnemo/blob/main/examples/weather/temporal_interpolation/data/data.json)
   for an example describing the dataset used to train the original model.
 * Optional: NetCDF4 files containing the orography and land-sea mask for the grid
   contained in the data. These should contain a variable of the same shape as the data.
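
Not part of the commit, but for orientation: a minimal sketch of how the per-variable statistics described in the README excerpt above could be applied for normalization. The file paths and the channel-first sample layout are assumptions, not taken from the repository.

import numpy as np

# Hypothetical paths; the README only specifies the file names and that each
# file holds a 1D array with one value per variable.
means = np.load("stats/global_means.npy")  # shape: (num_variables,)
stds = np.load("stats/global_stds.npy")    # shape: (num_variables,)

def normalize(sample: np.ndarray) -> np.ndarray:
    # Assumes variables along the leading axis, e.g. (num_variables, lat, lon).
    return (sample - means[:, None, None]) / stds[:, None, None]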

examples/weather/temporal_interpolation/config/train_interp.yaml

Lines changed: 4 additions & 3 deletions

@@ -46,9 +46,10 @@ training:
   samples_per_epoch: 50000 # number of samples per "epoch"
   load_epoch: "latest" # int, null or "latest"; "latest" loads the most recent checkpoint in checkpoint_dir
   checkpoint_dir: "/checkpoints/fcinterp/" # location where checkpoints are saved
-  optimizer_params:
-    lr: 5e-4 # learning rate
-    betas: [0.9, 0.95] # beta parameters for Adam
+
+optimizer_params:
+  lr: 5e-4 # learning rate
+  betas: [0.9, 0.95] # beta parameters for Adam
 
 logging:
   mlflow:
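
Judging by the new cfg.get("optimizer_params", {}) lookup in train.py below, the optimizer_params block now sits at the top level of the config rather than under training:. A minimal sketch of reading it with OmegaConf; the path is the config from this example and the snippet is illustrative, not part of the commit.

from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/weather/temporal_interpolation/config/train_interp.yaml")
opt_params = (
    OmegaConf.to_container(cfg.optimizer_params) if "optimizer_params" in cfg else {}
)
print(opt_params)  # expected along the lines of {'lr': 0.0005, 'betas': [0.9, 0.95]}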

examples/weather/temporal_interpolation/config/train_interp_lite.yaml

Lines changed: 4 additions & 3 deletions

@@ -50,9 +50,10 @@ training:
   samples_per_epoch: 50 # number of samples per "epoch"
   load_epoch: "latest" # int, null or "latest"; "latest" loads the most recent checkpoint in checkpoint_dir
   checkpoint_dir: "/checkpoints/fcinterp/" # location where checkpoints are saved
-  optimizer_params:
-    lr: 5e-4 # learning rate
-    betas: [0.9, 0.95] # beta parameters for Adam
+
+optimizer_params:
+  lr: 5e-4 # learning rate
+  betas: [0.9, 0.95] # beta parameters for Adam
 
 logging:
   mlflow:

examples/weather/temporal_interpolation/datapipe/climate_interp.py

Lines changed: 0 additions & 3 deletions

@@ -70,9 +70,6 @@ def __call__(
 
         # Shuffle before the next epoch starts
        if self.shuffle and sample_info.epoch_idx != self.last_epoch:
-            # All workers use the same rng seed so the resulting
-            # indices are the same across workers
-            # np.random.default_rng(seed=sample_info.epoch_idx).shuffle(self.indices)
             print("Shuffling indices")
             np.random.shuffle(self.indices)
             self.last_epoch = sample_info.epoch_idx
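
The deleted comment referred to seeding the RNG with the epoch index so that every data-loading worker produces the same permutation; the commit drops that commented-out variant and keeps plain np.random.shuffle. For reference, a standalone sketch of the epoch-seeded idea (not part of the commit; function name is hypothetical):

import numpy as np

def shuffle_in_sync(indices: np.ndarray, epoch_idx: int) -> np.ndarray:
    # Seeding with the epoch index gives every worker the same permutation
    # for that epoch without any inter-worker communication.
    np.random.default_rng(seed=epoch_idx).shuffle(indices)
    return indices

indices = np.arange(10)
print(shuffle_in_sync(indices.copy(), epoch_idx=3))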

examples/weather/temporal_interpolation/train.py

Lines changed: 88 additions & 12 deletions

@@ -16,6 +16,8 @@
 
 import os
 import datetime
+from typing import Any
+import warnings
 
 import hydra
 from omegaconf import OmegaConf
@@ -36,6 +38,11 @@
 from utils import distribute, loss
 from utils.trainer import Trainer
 
+try:
+    from apex.optimizers import FusedAdam
+except ImportError:
+    warnings.warn("Apex is not installed, defaulting to PyTorch optimizers.")
+
 
 def setup_datapipes(
     *,
@@ -182,6 +189,10 @@ def setup_model(
 
     Parameters
     ----------
+    num_variables : int
+        Number of atmospheric variables in the model.
+    num_auxiliaries : int
+        Number of auxiliary input channels.
     model_cfg : dict or None, optional
         Model configuration dict.
@@ -213,17 +224,70 @@ def setup_model(
     return model
 
 
+def setup_optimizer(
+    model: torch.nn.Module,
+    max_epoch: int,
+    opt_cls: type[torch.optim.Optimizer] | None = None,
+    opt_params: dict | None = None,
+    scheduler_cls: type[torch.optim.lr_scheduler.LRScheduler] | None = None,
+    scheduler_params: dict[str, Any] | None = None,
+) -> tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LRScheduler]:
+    """Setup optimizer.
+
+    Parameters
+    ----------
+    model : torch.nn.Module
+        Model that optimizer is applied to.
+    max_epoch : int
+        Maximum number of training epochs (used for scheduler setup).
+    opt_cls : type[torch.optim.Optimizer] or None, optional
+        Optimizer class. When None, will setup apex.optimizers.FusedAdam
+        if available, otherwise PyTorch Adam.
+    opt_params : dict or None, optional
+        Dict of parameters (e.g. learning rate) to pass to optimizer.
+    scheduler_cls : type[torch.optim.lr_scheduler.LRScheduler] or None, optional
+        Scheduler class. When None, will setup CosineAnnealingLR.
+    scheduler_params : dict[str, Any] or None, optional
+        Dict of parameters to pass to scheduler.
+
+    Returns
+    -------
+    tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LRScheduler]
+        The initialized optimizer and learning rate scheduler.
+    """
+
+    opt_kwargs = {"lr": 0.0005}
+    if opt_params is not None:
+        opt_kwargs.update(opt_params)
+    if opt_cls is None:
+        try:
+            opt_cls = FusedAdam
+        except NameError:  # in case we don't have apex
+            opt_cls = torch.optim.Adam
+
+    scheduler_kwargs = {}
+    if scheduler_cls is None:
+        scheduler_cls = torch.optim.lr_scheduler.CosineAnnealingLR
+        scheduler_kwargs["T_max"] = max_epoch
+    if scheduler_params is not None:
+        scheduler_kwargs.update(scheduler_params)
+
+    optimizer = opt_cls(model.parameters(), **opt_kwargs)
+    scheduler = scheduler_cls(optimizer, **scheduler_kwargs)
+    return (optimizer, scheduler)
+
+
 @torch.no_grad()
 def input_output_from_batch_data(
-    batch: dict[str, torch.Tensor], time_scale: float = 6 * 3600.0
+    batch: list[dict[str, torch.Tensor]], time_scale: float = 6 * 3600.0
 ) -> tuple[tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
     """
     Convert the datapipe output dict to model input and output batches.
 
     Parameters
     ----------
-    batch : dict[str, torch.Tensor]
-        The data dict returned by the datapipe.
+    batch : list[dict[str, torch.Tensor]]
+        The list of data dicts returned by the datapipe.
     time_scale : float, optional
         Number of seconds between the interpolation endpoints (default 6 hours).
@@ -235,16 +299,17 @@ def input_output_from_batch_data(
     batch = batch[0]
     # Concatenate all input variables to a single tensor
     atmos_vars = batch["state_seq-atmos"]
-    cos_zenith = batch["cos_zenith-atmos"].squeeze(dim=2)
 
-    sincos_latlon = batch["latlon"]
-    geop = batch["geopotential"]
-    lsm = batch["land_sea_mask"]
-
-    atmos_vars_in = torch.cat(
-        [atmos_vars[:, 0], atmos_vars[:, 1], cos_zenith, sincos_latlon, geop, lsm],
-        dim=1,
-    )
+    atmos_vars_in = [atmos_vars[:, 0], atmos_vars[:, 1]]
+    if "cos_zenith-atmos" in batch:
+        atmos_vars_in = atmos_vars_in + [batch["cos_zenith-atmos"].squeeze(dim=2)]
+    if "latlon" in batch:
+        atmos_vars_in = atmos_vars_in + [batch["latlon"]]
+    if "geopotential" in batch:
+        atmos_vars_in = atmos_vars_in + [batch["geopotential"]]
+    if "land_sea_mask" in batch:
+        atmos_vars_in = atmos_vars_in + [batch["land_sea_mask"]]
+    atmos_vars_in = torch.cat(atmos_vars_in, dim=1)
 
     atmos_vars_out = atmos_vars[:, 2]
 
@@ -286,6 +351,15 @@ def setup_trainer(**cfg: dict) -> Trainer:
     )
     (model, dist_manager) = distribute.distribute_model(model)
 
+    # Setup optimizer and learning rate scheduler
+    (optimizer, scheduler) = setup_optimizer(
+        model,
+        cfg["training"].get("max_epoch", 1),
+        opt_params=cfg.get("optimizer_params", {}),
+        scheduler_params=cfg.get("scheduler_params", {}),
+    )
+
+    # Initialize mlflow
     mlflow_cfg = cfg.get("logging", {}).get("mlflow", {})
     if mlflow_cfg.pop("use_mlflow", False):
         initialize_mlflow(**mlflow_cfg)
@@ -334,6 +408,8 @@ def setup_trainer(**cfg: dict) -> Trainer:
         train_datapipe=train_datapipe,
         valid_datapipe=valid_datapipe,
         input_output_from_batch_data=input_output_from_batch_data,
+        optimizer=optimizer,
+        scheduler=scheduler,
         use_wandb=use_wandb,
         **cfg["training"],
     )
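
The main behavioral change in input_output_from_batch_data is that the auxiliary channels (cosine zenith angle, lat/lon encodings, geopotential, land-sea mask) are now appended only if the datapipe actually provides them. A self-contained illustration of that pattern with synthetic tensors; the shapes are made up and only the concatenation logic mirrors the diff above.

import torch

def concat_available(batch: dict[str, torch.Tensor]) -> torch.Tensor:
    # Always use the two endpoint states, then append whichever auxiliary
    # inputs are present in the batch, mirroring the new conditional logic.
    atmos_vars = batch["state_seq-atmos"]
    parts = [atmos_vars[:, 0], atmos_vars[:, 1]]
    for key in ("cos_zenith-atmos", "latlon", "geopotential", "land_sea_mask"):
        if key in batch:
            aux = batch[key]
            parts.append(aux.squeeze(dim=2) if key == "cos_zenith-atmos" else aux)
    return torch.cat(parts, dim=1)

batch = {
    "state_seq-atmos": torch.randn(2, 3, 4, 32, 64),  # (batch, time, vars, lat, lon)
    "land_sea_mask": torch.randn(2, 1, 32, 64),       # only one auxiliary provided
}
print(concat_available(batch).shape)  # torch.Size([2, 9, 32, 64])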

examples/weather/temporal_interpolation/utils/trainer.py

Lines changed: 9 additions & 78 deletions

@@ -16,7 +16,6 @@
 
 from collections.abc import Callable, Sequence
 from typing import Any, Literal
-import warnings
 import time
 
 import torch
@@ -29,11 +28,6 @@
 from physicsnemo.launch.logging import LaunchLogger, PythonLogger
 from physicsnemo.launch.utils import load_checkpoint, save_checkpoint
 
-try:
-    from apex.optimizers import FusedAdam
-except ImportError:
-    warnings.warn("Apex is not installed, defaulting to PyTorch optimizers.")
-
 
 class Trainer:
     """Training loop.
@@ -52,18 +46,13 @@ class Trainer:
         ClimateDatapipe providing validation data.
     samples_per_epoch : int
         Number of samples to draw from the datapipe per 'epoch'.
+    optimizer : torch.optim.Optimizer
+        Optimizer used for training.
+    scheduler : torch.optim.lr_scheduler.LRScheduler
+        Learning rate scheduler.
     input_output_from_batch_data : Callable, optional
         Function that converts datapipe outputs to training batches.
         If not provided, will try to use outputs as-is.
-    optimizer : type[torch.optim.Optimizer] or None, optional
-        Optimizer class used for training. When None, will setup
-        apex.optimizers.FusedAdam if available, otherwise PyTorch Adam.
-    optimizer_params : dict[str, Any] or None, optional
-        Dict of parameters (e.g. learning rate) to pass to optimizer.
-    scheduler : type[torch.optim.lr_scheduler.LRScheduler] or None, optional
-        Learning rate scheduler class. When None, will setup CosineAnnealingLR.
-    scheduler_params : dict[str, Any] or None, optional
-        Dict of parameters to pass to LR scheduler.
     max_epoch : int, optional
         The last training epoch.
     load_epoch : int, "latest", or None, optional
@@ -90,11 +79,9 @@ def __init__(
         train_datapipe: ClimateDatapipe,
         valid_datapipe: ClimateDatapipe,
         samples_per_epoch: int,
+        optimizer: torch.optim.Optimizer,
+        scheduler: torch.optim.lr_scheduler.LRScheduler,
         input_output_from_batch_data: Callable = lambda x: x,
-        optimizer: type[torch.optim.Optimizer] | None = None,
-        optimizer_params: dict[str, Any] | None = None,
-        scheduler: type[torch.optim.lr_scheduler.LRScheduler] | None = None,
-        scheduler_params: dict[str, Any] | None = None,
         max_epoch: int = 1,
         load_epoch: int | Literal["latest"] | None = "latest",
         checkpoint_every: int = 1,
@@ -110,13 +97,8 @@ def __init__(
         self.valid_datapipe = valid_datapipe
         self.max_epoch = max_epoch
         self.input_output_from_batch_data = input_output_from_batch_data
-        self.optimizer, self.lr_scheduler = self.setup_optimizer(
-            model,
-            opt_cls=optimizer,
-            opt_params=optimizer_params,
-            scheduler_cls=scheduler,
-            scheduler_params=scheduler_params,
-        )
+        self.optimizer = optimizer
+        self.lr_scheduler = scheduler
         self.validation_callbacks = validation_callbacks
         self.device = self.dist_manager.device
         self.logger = PythonLogger()
@@ -309,57 +291,6 @@ def validate_on_epoch(self) -> torch.Tensor:
         model.train()
         return loss_epoch / num_examples
 
-    def setup_optimizer(
-        self,
-        model: torch.nn.Module,
-        opt_cls: type[torch.optim.Optimizer] | None = None,
-        opt_params: dict | None = None,
-        scheduler_cls: type[torch.optim.lr_scheduler.LRScheduler] | None = None,
-        scheduler_params: dict[str, Any] | None = None,
-    ) -> tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LRScheduler]:
-        """Setup optimizer.
-
-        Parameters
-        ----------
-        model : torch.nn.Module
-            Model that optimizer is applied to.
-        opt_cls : type[torch.optim.Optimizer] or None, optional
-            Optimizer class. When None, will setup apex.optimizers.FusedAdam
-            if available, otherwise PyTorch Adam.
-        opt_params : dict or None, optional
-            Dict of parameters (e.g. learning rate) to pass to optimizer.
-        scheduler_cls : type[torch.optim.lr_scheduler.LRScheduler] or None, optional
-            Scheduler class. When None, will setup CosineAnnealingLR.
-        scheduler_params : dict[str, Any] or None, optional
-            Dict of parameters to pass to scheduler.
-
-        Returns
-        -------
-        tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LRScheduler]
-            The initialized optimizer and learning rate scheduler.
-        """
-
-        opt_kwargs = {"lr": 0.0005}
-        if opt_params is not None:
-            opt_kwargs.update(opt_params)
-
-        if opt_cls is None:
-            try:
-                opt_cls = FusedAdam
-            except NameError:  # in case we don't have apex
-                opt_cls = torch.optim.Adam
-
-        scheduler_kwargs = {}
-        if scheduler_cls is None:
-            scheduler_cls = torch.optim.lr_scheduler.CosineAnnealingLR
-            scheduler_kwargs["T_max"] = self.max_epoch
-        if scheduler_params is not None:
-            scheduler_kwargs.update(scheduler_params)
-
-        optimizer = opt_cls(model.parameters(), **opt_kwargs)
-        scheduler = scheduler_cls(optimizer, **scheduler_kwargs)
-        return (optimizer, scheduler)
-
     def load_checkpoint(self, epoch: int | None = None) -> int:
         """Try to load model state from a checkpoint.
@@ -377,7 +308,7 @@ def load_checkpoint(self, epoch: int | None = None) -> int:
         """
         if self.checkpoint_dir is None:
             raise ValueError("checkpoint_dir must be set in order to load checkpoints.")
-        metadata = {"total_samples_trained": self.total_samples_trained}
+        metadata = {}
         self.epoch = load_checkpoint(
             self.checkpoint_dir,
             models=self.model,
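
With this refactor the Trainer no longer constructs its own optimizer; callers build the optimizer and scheduler first (for example via the new setup_optimizer in train.py) and pass the instances in. A minimal, self-contained equivalent of the default behavior, using plain Adam in place of the optional apex FusedAdam and the values that appear in the configs above:

import torch

model = torch.nn.Linear(8, 8)  # stand-in for the interpolation model
max_epoch = 100

optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, betas=(0.9, 0.95))
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_epoch)

# trainer = Trainer(..., optimizer=optimizer, scheduler=scheduler, ...)
for epoch in range(max_epoch):
    # ... one epoch of optimizer.step() calls would go here ...
    scheduler.step()  # cosine-anneals the learning rate over max_epoch epochs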
