
Commit 548bb02

Fix tests to work with use_stateful_dataloader config
- Disable resume_from_checkpoint in convergence tests (test_train.py)
  - These tests don't need checkpointing, just convergence validation
  - Prevents a NoneType error when use_stateful_dataloader=false
- Enable use_stateful_dataloader in checkpointing tests (test_train_two_gpu.py)
  - Required for checkpoint save/resume functionality
  - Ensures dataloader state is preserved across checkpoints
- Add use_stateful_dataloader to the scheduler resume test (test_distributed_checkpointing.py)
  - Needed for phase 2 resume to work correctly

All 26 tests now pass.

Signed-off-by: Savitha Srinivasan <[email protected]>
1 parent 827440b commit 548bb02
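Background on the flag: the name suggests that enabling dataset.use_stateful_dataloader swaps in torchdata's StatefulDataLoader, which can serialize and restore its iteration position, while disabling it falls back to a plain DataLoader that cannot. The sketch below is a minimal illustration of that save/resume mechanism and of why resume must be skipped when the flag is off; it uses a toy dataset and is an assumption for illustration, not the recipe's actual checkpoint code.

# Minimal sketch (illustrative only) of the mechanism the flag presumably toggles.
from torch.utils.data import DataLoader
from torchdata.stateful_dataloader import StatefulDataLoader

dataset = list(range(100))  # stand-in for the tokenized dataset
use_stateful_dataloader = True

if use_stateful_dataloader:
    loader = StatefulDataLoader(dataset, batch_size=4)
else:
    loader = DataLoader(dataset, batch_size=4)  # plain loader: no state_dict()

it = iter(loader)
for _ in range(5):  # pretend we trained for five steps
    batch = next(it)

# Checkpoint save: only the stateful loader can report its position.
dataloader_state = loader.state_dict() if use_stateful_dataloader else None

# Checkpoint resume: guard against a missing state. Attempting to resume
# without it is the failure mode the commit message works around by turning
# resume off in the convergence tests.
resumed_loader = StatefulDataLoader(dataset, batch_size=4)
if dataloader_state is not None:
    resumed_loader.load_state_dict(dataloader_state)  # continues from batch 5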

File tree

3 files changed: +52 -41 lines changed

bionemo-recipes/recipes/llama3/tests/test_distributed_checkpointing.py

Lines changed: 1 addition & 0 deletions
@@ -737,6 +737,7 @@ def test_scheduler_resume_two_gpu(recipe_path, tmp_path):
         "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
         "lr_scheduler_kwargs.num_warmup_steps=20",
         "lr_scheduler_kwargs.num_training_steps=100",
+        "dataset.use_stateful_dataloader=true",  # Enable for checkpoint testing
     ]

     result2 = subprocess.run(cmd_phase2, check=False, capture_output=True, text=True, env=env)

bionemo-recipes/recipes/llama3/tests/test_train.py

Lines changed: 18 additions & 15 deletions
@@ -18,7 +18,6 @@
 import pytest
 import torch
 from hydra import compose, initialize_config_dir
-
 from train_ddp import main as main_ddp
 from train_fsdp2 import main as main_fsdp2

@@ -34,7 +33,7 @@ def set_seed():

 def test_sanity_convergence_ddp(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that DDP training converges on mock genomic data.
-
+
     This test validates:
     - The train_ddp.py script runs end-to-end without errors
     - Model, optimizer, and dataloader integrate correctly
@@ -49,19 +48,20 @@ def test_sanity_convergence_ddp(tmp_path, recipe_path, mock_genomic_parquet):
                 f"+wandb_init_args.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_ddp(sanity_config)
-
+
     # For genomic Causal LM, we expect convergence to < 5.0 on the small test dataset
     # The model should learn to predict simple patterns in the mock data
     assert final_loss < 5.0, f"Final loss {final_loss} is too high, expected < 5.0"


 def test_sanity_convergence_fsdp2(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that FSDP2 training converges on mock genomic data.
-
+
     This test validates:
     - The train_fsdp2.py script runs end-to-end without errors
     - FSDP2 wrapping and sharding work correctly
@@ -76,18 +76,19 @@ def test_sanity_convergence_fsdp2(tmp_path, recipe_path, mock_genomic_parquet):
                 f"+wandb_init_args.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_fsdp2(sanity_config)
-
+
     # FSDP2 should achieve similar convergence to DDP
     assert final_loss < 5.0, f"Final loss {final_loss} is too high, expected < 5.0"


 def test_sanity_convergence_ddp_non_streaming_dataset(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that DDP training works with non-streaming dataset.
-
+
     This test validates:
     - The dataloader works correctly with streaming=False
     - Map-style dataset integration works
@@ -102,18 +103,19 @@ def test_sanity_convergence_ddp_non_streaming_dataset(tmp_path, recipe_path, moc
                 f"checkpoint.ckpt_dir={tmp_path}",
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
                 "dataset.load_dataset_kwargs.streaming=False",
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_ddp(sanity_config)
-
+
     # Non-streaming mode should converge just as well as streaming
     assert final_loss < 5.0, f"Final loss {final_loss} is too high, expected < 5.0"


 def test_sanity_convergence_fsdp2_non_streaming_dataset(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that FSDP2 training works with non-streaming dataset.
-
+
     This test validates:
     - FSDP2 works correctly with map-style datasets
     - Non-streaming mode doesn't break FSDP2 sharding
@@ -128,18 +130,19 @@ def test_sanity_convergence_fsdp2_non_streaming_dataset(tmp_path, recipe_path, m
                 f"checkpoint.ckpt_dir={tmp_path}",
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
                 "dataset.load_dataset_kwargs.streaming=False",
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_fsdp2(sanity_config)
-
+
     # Non-streaming mode should converge just as well as streaming
     assert final_loss < 5.0, f"Final loss {final_loss} is too high, expected < 5.0"


 def test_sanity_ddp_with_lazy_tokenization(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that DDP training works with lazy tokenization enabled.
-
+
     This test validates:
     - Lazy tokenization (one-to-one mapping) works correctly
     - Training can run with lazy tokenization
@@ -155,19 +158,20 @@ def test_sanity_ddp_with_lazy_tokenization(tmp_path, recipe_path, mock_genomic_p
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
                 "dataset.use_lazy_tokenization=True",
                 "num_train_steps=10",  # Just verify it runs, don't test convergence
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_ddp(sanity_config)
-
+
     # Just check that training runs without errors
     # We don't check convergence because lazy tokenization produces different windowing
     assert final_loss is not None, "Training should complete and return a loss value"


 def test_sanity_fsdp2_with_lazy_tokenization(tmp_path, recipe_path, mock_genomic_parquet):
     """Test that FSDP2 training works with lazy tokenization enabled.
-
+
     This test validates:
     - Lazy tokenization works with FSDP2
     - FSDP2 sharding doesn't break with lazy tokenization
@@ -183,12 +187,11 @@ def test_sanity_fsdp2_with_lazy_tokenization(tmp_path, recipe_path, mock_genomic
                 f"dataset.load_dataset_kwargs.data_files={mock_genomic_parquet}",
                 "dataset.use_lazy_tokenization=True",
                 "num_train_steps=10",  # Just verify it runs, don't test convergence
+                "checkpoint.resume_from_checkpoint=false",  # Don't try to resume - fresh training
             ],
         )

     final_loss = main_fsdp2(sanity_config)
-
+
     # Just check that training runs without errors
     assert final_loss is not None, "Training should complete and return a loss value"
-
-
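For reference, the dotted overrides added in this commit are ordinary Hydra overrides, and the single-GPU tests above apply them when composing the config in-process. A rough sketch of that composition is below; the config directory path is hypothetical, while the config name and override keys are taken from the diffs.

# Rough sketch of in-process config composition with the overrides used above.
from hydra import compose, initialize_config_dir

with initialize_config_dir(config_dir="/abs/path/to/recipes/llama3/hydra_config", version_base=None):
    cfg = compose(
        config_name="L0_sanity",
        overrides=[
            "checkpoint.resume_from_checkpoint=false",  # fresh run, no resume attempt
            "dataset.use_stateful_dataloader=true",  # stateful loader for save/resume
        ],
    )

# The training entrypoints then read the composed values, e.g.
# cfg.dataset.use_stateful_dataloader.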

bionemo-recipes/recipes/llama3/tests/test_train_two_gpu.py

Lines changed: 33 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,11 @@
4040

4141
def run_train_cmd(cmd, recipe_path):
4242
"""Run a training command and check for errors.
43-
43+
4444
Args:
4545
cmd: List of command arguments to run
4646
recipe_path: Path to the recipe directory (working directory for command)
47-
47+
4848
Raises:
4949
pytest.fail: If command returns non-zero exit code
5050
"""
@@ -67,23 +67,25 @@ def run_train_cmd(cmd, recipe_path):
6767
@requires_multi_gpu
6868
def test_multi_gpu_train_ddp(tmp_path, recipe_path):
6969
"""Test DDP training on 2 GPUs.
70-
70+
7171
This test validates:
7272
- DDP launches successfully with 2 processes
7373
- Both GPUs are utilized
7474
- Training completes without errors
7575
- Gradient synchronization works across GPUs
76-
76+
7777
The test runs only 4 training steps for speed.
7878
"""
7979
run_train_cmd(
8080
[
8181
"torchrun",
82-
"--nproc_per_node", "2", # 2 processes = 2 GPUs
83-
"--standalone", # Single node mode
82+
"--nproc_per_node",
83+
"2", # 2 processes = 2 GPUs
84+
"--standalone", # Single node mode
8485
"train_ddp.py",
85-
"--config-name", "L0_sanity",
86-
"num_train_steps=4", # Just 4 steps for speed
86+
"--config-name",
87+
"L0_sanity",
88+
"num_train_steps=4", # Just 4 steps for speed
8789
],
8890
recipe_path,
8991
)
@@ -92,32 +94,34 @@ def test_multi_gpu_train_ddp(tmp_path, recipe_path):
9294
@requires_multi_gpu
9395
def test_multi_gpu_train_fsdp2(tmp_path, recipe_path):
9496
"""Test FSDP2 training on 2 GPUs.
95-
97+
9698
This test validates:
9799
- FSDP2 launches successfully with 2 processes
98100
- Model sharding works across 2 GPUs
99101
- Training completes without errors
100102
- Parameter gathering/scattering works correctly
101-
103+
102104
The test runs only 4 training steps for speed.
103105
"""
104106
run_train_cmd(
105107
[
106108
"torchrun",
107-
"--nproc_per_node", "2", # 2 processes = 2 GPUs
108-
"--standalone", # Single node mode
109+
"--nproc_per_node",
110+
"2", # 2 processes = 2 GPUs
111+
"--standalone", # Single node mode
109112
"train_fsdp2.py",
110-
"--config-name", "L0_sanity",
111-
"num_train_steps=4", # Just 4 steps for speed
113+
"--config-name",
114+
"L0_sanity",
115+
"num_train_steps=4", # Just 4 steps for speed
112116
],
113117
recipe_path,
114118
)
115119

116120

117-
@requires_multi_gpu
121+
@requires_multi_gpu
118122
def test_multi_gpu_train_ddp_with_checkpointing(tmp_path, recipe_path):
119123
"""Test DDP training on 2 GPUs with checkpoint saving.
120-
124+
121125
This test validates:
122126
- DDP can save checkpoints with multiple processes
123127
- Checkpoint files are created correctly
@@ -126,17 +130,20 @@ def test_multi_gpu_train_ddp_with_checkpointing(tmp_path, recipe_path):
126130
run_train_cmd(
127131
[
128132
"torchrun",
129-
"--nproc_per_node", "2",
133+
"--nproc_per_node",
134+
"2",
130135
"--standalone",
131136
"train_ddp.py",
132-
"--config-name", "L0_sanity",
137+
"--config-name",
138+
"L0_sanity",
133139
"num_train_steps=10",
134140
f"checkpoint.ckpt_dir={tmp_path}",
135141
"checkpoint.save_every_n_steps=5",
142+
"dataset.use_stateful_dataloader=true", # Enable for checkpoint testing
136143
],
137144
recipe_path,
138145
)
139-
146+
140147
# Verify checkpoint was created
141148
ckpt_dir = tmp_path / "train_ddp"
142149
assert ckpt_dir.exists(), f"Checkpoint directory not created: {ckpt_dir}"
@@ -146,7 +153,7 @@ def test_multi_gpu_train_ddp_with_checkpointing(tmp_path, recipe_path):
146153
@requires_multi_gpu
147154
def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path):
148155
"""Test FSDP2 training on 2 GPUs with checkpoint saving.
149-
156+
150157
This test validates:
151158
- FSDP2 can save checkpoints with multiple processes
152159
- Sharded checkpoints are created correctly
@@ -155,21 +162,21 @@ def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path):
155162
run_train_cmd(
156163
[
157164
"torchrun",
158-
"--nproc_per_node", "2",
165+
"--nproc_per_node",
166+
"2",
159167
"--standalone",
160168
"train_fsdp2.py",
161-
"--config-name", "L0_sanity",
169+
"--config-name",
170+
"L0_sanity",
162171
"num_train_steps=10",
163172
f"checkpoint.ckpt_dir={tmp_path}",
164173
"checkpoint.save_every_n_steps=5",
174+
"dataset.use_stateful_dataloader=true", # Enable for checkpoint testing
165175
],
166176
recipe_path,
167177
)
168-
178+
169179
# Verify checkpoint was created
170180
ckpt_dir = tmp_path / "train_fsdp2"
171181
assert ckpt_dir.exists(), f"Checkpoint directory not created: {ckpt_dir}"
172182
assert (ckpt_dir / "step_5").exists(), "Checkpoint at step 5 not found"
173-
174-
175-
