Commit 541b254

log total tokens in esm2 recipe (#1309)
When comparing THD and BSHD runs, it's helpful to have a view of the total batch size in terms of number of tokens. This change logs the summed unpadded tokens per batch to wandb.

Signed-off-by: Peter St. John <[email protected]>
1 parent: a4d56c0 · commit: 541b254

File tree

1 file changed: +2 additions, 0 deletions


bionemo-recipes/recipes/esm2_native_te/perf_logger.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -58,6 +58,7 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig):
             "train/step_time": torchmetrics.MeanMetric(),
             "train/tokens_per_second": torchmetrics.MeanMetric(),
             "train/unpadded_tokens_per_second": torchmetrics.MeanMetric(),
+            "train/total_unpadded_tokens_per_batch": torchmetrics.SumMetric(),
             "train/perplexity": torchmetrics.text.Perplexity(ignore_index=-100),
             "train/gpu_memory_allocated_max_gb": torchmetrics.MaxMetric(),
             "train/gpu_memory_allocated_mean_gb": torchmetrics.MeanMetric(),
@@ -103,6 +104,7 @@ def log_step(
         self.metrics["train/step_time"].update(step_time)
         self.metrics["train/tokens_per_second"].update(num_tokens / step_time)
         self.metrics["train/unpadded_tokens_per_second"].update(num_unpadded_tokens / step_time)
+        self.metrics["train/total_unpadded_tokens_per_batch"].update(num_unpadded_tokens / self.logging_frequency)
 
         # Handle sequence packing for torchmetrics calculation.
         if outputs.logits.dim() < 3:
```
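
Note the division by `self.logging_frequency`: a minimal sketch of why this works is below, assuming the metric is computed and reset once every `logging_frequency` steps (as wandb-style periodic logging typically does). Under that assumption, summing `num_unpadded_tokens / logging_frequency` over the window yields the average unpadded tokens per batch for that logging interval. The window size and token counts here are illustrative, not taken from the recipe.

```python
import torchmetrics

# Hypothetical logging window; in the recipe this would be self.logging_frequency.
logging_frequency = 4
metric = torchmetrics.SumMetric()

# One update per training step, each contributing tokens / window_size.
for num_unpadded_tokens in [1000, 1200, 900, 1100]:
    metric.update(num_unpadded_tokens / logging_frequency)

# Sum of per-step contributions == mean tokens per batch over the window.
print(metric.compute())  # tensor(1050.) == (1000 + 1200 + 900 + 1100) / 4
metric.reset()           # start a fresh window for the next logging interval
```

Using a `SumMetric` this way (rather than a `MeanMetric` on raw token counts) keeps the reported value interpretable as a per-batch token count even though the metric object itself accumulates a sum.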
