
Commit 548bb02

Fix tests to work with use_stateful_dataloader config
- Disable resume_from_checkpoint in convergence tests (test_train.py)
  - These tests don't need checkpointing, just convergence validation
  - Prevents a NoneType error when use_stateful_dataloader=false
- Enable use_stateful_dataloader in checkpointing tests (test_train_two_gpu.py)
  - Required for checkpoint save/resume functionality
  - Ensures dataloader state is preserved across checkpoints
- Add use_stateful_dataloader to the scheduler resume test (test_distributed_checkpointing.py)
  - Needed for phase 2 resume to work correctly

All 26 tests now pass.

Signed-off-by: Savitha Srinivasan <[email protected]>
1 parent 827440b commit 548bb02
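Background on the flag: the name suggests that enabling dataset.use_stateful_dataloader swaps in torchdata's StatefulDataLoader, which can serialize and restore its iteration position, while disabling it falls back to a plain DataLoader that cannot. The sketch below is a minimal illustration of that save/resume mechanism and of why resume must be skipped when the flag is off; it uses a toy dataset and is an assumption for illustration, not the recipe's actual checkpoint code.

# Minimal sketch (illustrative only) of the mechanism the flag presumably toggles.
from torch.utils.data import DataLoader
from torchdata.stateful_dataloader import StatefulDataLoader

dataset = list(range(100))  # stand-in for the tokenized dataset
use_stateful_dataloader = True

if use_stateful_dataloader:
    loader = StatefulDataLoader(dataset, batch_size=4)
else:
    loader = DataLoader(dataset, batch_size=4)  # plain loader: no state_dict()

it = iter(loader)
for _ in range(5):  # pretend we trained for five steps
    batch = next(it)

# Checkpoint save: only the stateful loader can report its position.
dataloader_state = loader.state_dict() if use_stateful_dataloader else None

# Checkpoint resume: guard against a missing state. Attempting to resume
# without it is the failure mode the commit message works around by turning
# resume off in the convergence tests.
resumed_loader = StatefulDataLoader(dataset, batch_size=4)
if dataloader_state is not None:
    resumed_loader.load_state_dict(dataloader_state)  # continues from batch 5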

File tree

3 files changed: +52 -41 lines changed

bionemo-recipes/recipes/llama3/tests/test_distributed_checkpointing.py

Lines changed: 1 addition & 0 deletions
@@ -737,6 +737,7 @@ def test_scheduler_resume_two_gpu(recipe_path, tmp_path):
         "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
         "lr_scheduler_kwargs.num_warmup_steps=20",
         "lr_scheduler_kwargs.num_training_steps=100",
+        "dataset.use_stateful_dataloader=true",  # Enable for checkpoint testing
     ]

     result2 = subprocess.run(cmd_phase2, check=False, capture_output=True, text=True, env=env)

bionemo-recipes/recipes/llama3/tests/test_train.py

Lines changed: 18 additions & 15 deletions
@@ -18,7 +18,6 @@
 import pytest
 import torch
 from hydra import compose, initialize_config_dir
-
 from train_ddp import main as main_ddp
 from train_fsdp2 import main as main_fsdp2

@@ -34,7 +33,7 @@ def set_seed():

 def test_sanity_convergence_ddp(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that DDP training converges on mock genomic data.
-
+
     This test validates:
     - The train_ddp.py script runs end-to-end without errors
     - Model, optimizer, and dataloader integrate correctly
@@ -49,19 +48,20 @@ def test_sanity_convergence_ddp(tmp_path, recipe_path, mock_genomic_parquet):
                 f"+wandb_init_args.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_ddp(sanity_config)
-
+
     # For genomic Causal LM, we expect convergence to < 5.0 on the small test dataset
     # The model should learn to predict simple patterns in the mock data
     assert final_loss < 5.0, f"Final loss {final_loss} is too high, expected < 5.0"


 def test_sanity_convergence_fsdp2(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that FSDP2 training converges on mock genomic data.
-
+
     This test validates:
     - The train_fsdp2.py script runs end-to-end without errors
     - FSDP2 wrapping and sharding work correctly
@@ -76,18 +76,19 @@ def test_sanity_convergence_fsdp2(tmp_path, recipe_path, mock_genomic_parquet):
                 f"+wandb_init_args.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_fsdp2(sanity_config)
-
+
     # FSDP2 should achieve similar convergence to DDP
     assert final_loss < 5.0, f"Final loss {final_loss} is too high, expected < 5.0"


 def test_sanity_convergence_ddp_non_streaming_dataset(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that DDP training works with non-streaming dataset.
-
+
     This test validates:
     - The dataloader works correctly with streaming=False
     - Map-style dataset integration works
@@ -102,18 +103,19 @@ def test_sanity_convergence_ddp_non_streaming_dataset(tmp_path, recipe_path, moc
                 f"checkpoint.ckpt_dir={tmp_path}",
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
                 "dataset.load_dataset_kwargs.streaming=False",
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_ddp(sanity_config)
-
+
     # Non-streaming mode should converge just as well as streaming
     assert final_loss < 5.0, f"Final loss {final_loss} is too high, expected < 5.0"


 def test_sanity_convergence_fsdp2_non_streaming_dataset(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that FSDP2 training works with non-streaming dataset.
-
+
     This test validates:
     - FSDP2 works correctly with map-style datasets
     - Non-streaming mode doesn't break FSDP2 sharding
@@ -128,18 +130,19 @@ def test_sanity_convergence_fsdp2_non_streaming_dataset(tmp_path, recipe_path, m
                 f"checkpoint.ckpt_dir={tmp_path}",
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
                 "dataset.load_dataset_kwargs.streaming=False",
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_fsdp2(sanity_config)
-
+
     # Non-streaming mode should converge just as well as streaming
     assert final_loss < 5.0, f"Final loss {final_loss} is too high, expected < 5.0"


 def test_sanity_ddp_with_lazy_tokenization(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that DDP training works with lazy tokenization enabled.
-
+
     This test validates:
     - Lazy tokenization (one-to-one mapping) works correctly
     - Training can run with lazy tokenization
@@ -155,19 +158,20 @@ def test_sanity_ddp_with_lazy_tokenization(tmp_path, recipe_path, mock_genomic_p
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
                 "dataset.use_lazy_tokenization=True",
                 "num_train_steps=10",  # Just verify it runs, don't test convergence
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_ddp(sanity_config)
-
+
     # Just check that training runs without errors
     # We don't check convergence because lazy tokenization produces different windowing
     assert final_loss is not None, "Training should complete and return a loss value"


 def test_sanity_fsdp2_with_lazy_tokenization(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that FSDP2 training works with lazy tokenization enabled.
-
+
     This test validates:
     - Lazy tokenization works with FSDP2
     - FSDP2 sharding doesn't break with lazy tokenization
@@ -183,12 +187,11 @@ def test_sanity_fsdp2_with_lazy_tokenization(tmp_path, recipe_path, mock_genomic
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
                 "dataset.use_lazy_tokenization=True",
                 "num_train_steps=10",  # Just verify it runs, don't test convergence
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_fsdp2(sanity_config)
-
+
     # Just check that training runs without errors
     assert final_loss is not None, "Training should complete and return a loss value"
-
-
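For reference, the dotted overrides added in this commit are ordinary Hydra overrides, and the single-GPU tests above apply them when composing the config in-process. A rough sketch of that composition is below; the config directory path is hypothetical, while the config name and override keys are taken from the diffs.

# Rough sketch of in-process config composition with the overrides used above.
from hydra import compose, initialize_config_dir

with initialize_config_dir(config_dir="/abs/path/to/recipes/llama3/hydra_config", version_base=None):
    cfg = compose(
        config_name="L0_sanity",
        overrides=[
            "checkpoint.resume_from_checkpoint=false",  # fresh run, no resume attempt
            "dataset.use_stateful_dataloader=true",  # stateful loader for save/resume
        ],
    )

# The training entrypoints then read the composed values, e.g.
# cfg.dataset.use_stateful_dataloader.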

bionemo-recipes/recipes/llama3/tests/test_train_two_gpu.py

Lines changed: 33 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,11 @@
4040

4141
def run_train_cmd(cmd, recipe_path):
4242
"""Run a training command and check for errors.
43-
43+
4444
Args:
4545
cmd: List of command arguments to run
4646
recipe_path: Path to the recipe directory (working directory for command)
47-
47+
4848
Raises:
4949
pytest.fail: If command returns non-zero exit code
5050
"""
@@ -67,23 +67,25 @@ def run_train_cmd(cmd, recipe_path):
6767
@requires_multi_gpu
6868
def test_multi_gpu_train_ddp(tmp_path, recipe_path):
6969
"""Test DDP training on 2 GPUs.
70-
70+
7171
This test validates:
7272
- DDP launches successfully with 2 processes
7373
- Both GPUs are utilized
7474
- Training completes without errors
7575
- Gradient synchronization works across GPUs
76-
76+
7777
The test runs only 4 training steps for speed.
7878
"""
7979
run_train_cmd(
8080
[
8181
"torchrun",
82-
"--nproc_per_node", "2", # 2 processes = 2 GPUs
83-
"--standalone", # Single node mode
82+
"--nproc_per_node",
83+
"2", # 2 processes = 2 GPUs
84+
"--standalone", # Single node mode
8485
"train_ddp.py",
85-
"--config-name", "L0_sanity",
86-
"num_train_steps=4", # Just 4 steps for speed
86+
"--config-name",
87+
"L0_sanity",
88+
"num_train_steps=4", # Just 4 steps for speed
8789
],
8890
recipe_path,
8991
)
@@ -92,32 +94,34 @@ def test_multi_gpu_train_ddp(tmp_path, recipe_path):
9294
@requires_multi_gpu
9395
def test_multi_gpu_train_fsdp2(tmp_path, recipe_path):
9496
"""Test FSDP2 training on 2 GPUs.
95-
97+
9698
This test validates:
9799
- FSDP2 launches successfully with 2 processes
98100
- Model sharding works across 2 GPUs
99101
- Training completes without errors
100102
- Parameter gathering/scattering works correctly
101-
103+
102104
The test runs only 4 training steps for speed.
103105
"""
104106
run_train_cmd(
105107
[
106108
"torchrun",
107-
"--nproc_per_node", "2", # 2 processes = 2 GPUs
108-
"--standalone", # Single node mode
109+
"--nproc_per_node",
110+
"2", # 2 processes = 2 GPUs
111+
"--standalone", # Single node mode
109112
"train_fsdp2.py",
110-
"--config-name", "L0_sanity",
111-
"num_train_steps=4", # Just 4 steps for speed
113+
"--config-name",
114+
"L0_sanity",
115+
"num_train_steps=4", # Just 4 steps for speed
112116
],
113117
recipe_path,
114118
)
115119

116120

117-
@requires_multi_gpu
121+
@requires_multi_gpu
118122
def test_multi_gpu_train_ddp_with_checkpointing(tmp_path, recipe_path):
119123
"""Test DDP training on 2 GPUs with checkpoint saving.
120-
124+
121125
This test validates:
122126
- DDP can save checkpoints with multiple processes
123127
- Checkpoint files are created correctly
@@ -126,17 +130,20 @@ def test_multi_gpu_train_ddp_with_checkpointing(tmp_path, recipe_path):
126130
run_train_cmd(
127131
[
128132
"torchrun",
129-
"--nproc_per_node", "2",
133+
"--nproc_per_node",
134+
"2",
130135
"--standalone",
131136
"train_ddp.py",
132-
"--config-name", "L0_sanity",
137+
"--config-name",
138+
"L0_sanity",
133139
"num_train_steps=10",
134140
f"checkpoint.ckpt_dir={tmp_path}",
135141
"checkpoint.save_every_n_steps=5",
142+
"dataset.use_stateful_dataloader=true", # Enable for checkpoint testing
136143
],
137144
recipe_path,
138145
)
139-
146+
140147
# Verify checkpoint was created
141148
ckpt_dir = tmp_path / "train_ddp"
142149
assert ckpt_dir.exists(), f"Checkpoint directory not created: {ckpt_dir}"
@@ -146,7 +153,7 @@ def test_multi_gpu_train_ddp_with_checkpointing(tmp_path, recipe_path):
146153
@requires_multi_gpu
147154
def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path):
148155
"""Test FSDP2 training on 2 GPUs with checkpoint saving.
149-
156+
150157
This test validates:
151158
- FSDP2 can save checkpoints with multiple processes
152159
- Sharded checkpoints are created correctly
@@ -155,21 +162,21 @@ def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path):
155162
run_train_cmd(
156163
[
157164
"torchrun",
158-
"--nproc_per_node", "2",
165+
"--nproc_per_node",
166+
"2",
159167
"--standalone",
160168
"train_fsdp2.py",
161-
"--config-name", "L0_sanity",
169+
"--config-name",
170+
"L0_sanity",
162171
"num_train_steps=10",
163172
f"checkpoint.ckpt_dir={tmp_path}",
164173
"checkpoint.save_every_n_steps=5",
174+
"dataset.use_stateful_dataloader=true", # Enable for checkpoint testing
165175
],
166176
recipe_path,
167177
)
168-
178+
169179
# Verify checkpoint was created
170180
ckpt_dir = tmp_path / "train_fsdp2"
171181
assert ckpt_dir.exists(), f"Checkpoint directory not created: {ckpt_dir}"
172182
assert (ckpt_dir / "step_5").exists(), "Checkpoint at step 5 not found"
173-
174-
175-
