
Commit 0b2d0b4

savitha-eng committed
Training scripts, tests, and config for llama3; very similar to ESM2 native te
Signed-off-by: savitha-eng <[email protected]>
1 parent 5c0316e commit 0b2d0b4

File tree

10 files changed: +1599 −0 lines changed


bionemo-recipes/recipes/llama3/checkpoint.py

Lines changed: 566 additions & 0 deletions (large diff not rendered here)
Lines changed: 81 additions & 0 deletions
# @package _global_

# Convergence test configuration for DDP with tiny Llama model (~10M params)
# Tests that the model can overfit on a small genomic dataset (a 2MB parquet subset here)
# Single-GPU version

defaults:
  - defaults
  - _self_

# Use tiny Llama config for fast convergence testing
model_tag: /workspaces/bionemo-framework/bionemo-recipes/recipes/llama3/tiny_llama_config

# Training steps - enough to see convergence on small dataset
num_train_steps: 1000

# Dataset configuration - use 2MB subset
dataset:
  tokenizer_path: /workspaces/bionemo-framework/bionemo-recipes/models/llama3/nucleotide_fast_tokenizer
  micro_batch_size: 1  # Conservative for single GPU
  num_workers: 2
  max_seq_length: 8192  # Full Llama3 context length
  stride: 400  # 400bp overlap for 8K context
  buffer_size: 10_000  # Smaller buffer for faster iteration
  use_lazy_tokenization: true
  load_dataset_kwargs:
    path: "parquet"
    data_files: "/workspaces/bionemo-framework/data/genomic_sequences_2mb.parquet"
    split: "train"
    streaming: true  # Use streaming to avoid loading entire dataset into memory

# Optimizer - higher LR for faster convergence on small model
adamw_kwargs:
  lr: 5e-4  # Higher than default for faster convergence
  fused: true
  betas: [0.9, 0.98]
  eps: 1e-8
  weight_decay: 0.01

# Learning rate scheduler
lr_scheduler_kwargs:
  num_warmup_steps: 100  # Quick warmup (10% of training)
  num_training_steps: 1000

# Checkpoint configuration - disabled for fast convergence testing
checkpoint:
  ckpt_dir: null  # No checkpoints
  save_final_model: false  # Don't save final model
  resume_from_checkpoint: false  # Start fresh for convergence test
  save_every_n_steps: null  # No intermediate checkpoints

# Logging - frequent logging to track convergence
logger:
  frequency: 10  # Log every 10 steps

# WandB configuration
wandb_init_args:
  project: "llama3-genomic-convergence"
  name: "tiny-llama-ddp-convergence-test"
  mode: "online"  # Online mode for real-time dashboard
  tags:
    - convergence-test
    - ddp
    - tiny-model
    - 10M-params
    - single-gpu
    - 8192-context

# Meta device and torch compile
use_meta_device: false
use_torch_compile: false  # Disable for debugging

# FP8 configuration - disabled for convergence testing
fp8_config:
  enabled: false
  fp8_recipe: transformer_engine.common.recipe.DelayedScaling
  fp8_format: "HYBRID"
  fp8_recipe_kwargs: {}
  fp8_model_init_kwargs:
    enabled: false
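For reference, a minimal sketch of how a Hydra entrypoint might consume the adamw_kwargs and lr_scheduler_kwargs blocks above. The config_path, config_name, script layout, and the choice of a linear warmup/decay schedule are assumptions for illustration, not the recipe's actual training script.

# Hypothetical sketch: building the optimizer and LR scheduler from the Hydra config above.
import hydra
import torch
from omegaconf import DictConfig, OmegaConf
from transformers import AutoConfig, AutoModelForCausalLM, get_linear_schedule_with_warmup


@hydra.main(config_path="hydra_config", config_name="convergence_ddp", version_base="1.2")  # names are assumed
def main(cfg: DictConfig) -> None:
    # model_tag points at a local config directory, so build the model from its config.
    model_config = AutoConfig.from_pretrained(cfg.model_tag)
    model = AutoModelForCausalLM.from_config(model_config).cuda()

    # adamw_kwargs maps one-to-one onto torch.optim.AdamW keyword arguments.
    optimizer = torch.optim.AdamW(
        model.parameters(), **OmegaConf.to_container(cfg.adamw_kwargs, resolve=True)
    )

    # The recipe may use a different schedule; a linear warmup/decay scheduler stands in here.
    scheduler = get_linear_schedule_with_warmup(optimizer, **cfg.lr_scheduler_kwargs)

    for _ in range(cfg.num_train_steps):
        ...  # forward/backward elided; see the recipe's training script
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()


if __name__ == "__main__":
    main()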
Lines changed: 81 additions & 0 deletions
# @package _global_

# Convergence test configuration for FSDP2 with tiny Llama model (~10M params)
# Tests that the model can overfit on a small genomic dataset (a 2MB parquet subset here)
# Works with single GPU (no sharding) or multi-GPU (sharded)

defaults:
  - defaults
  - _self_

# Use tiny Llama config for fast convergence testing
model_tag: /workspaces/bionemo-framework/bionemo-recipes/recipes/llama3/tiny_llama_config

# Training steps - enough to see convergence on small dataset
num_train_steps: 1000

# Dataset configuration - use 2MB subset
dataset:
  tokenizer_path: /workspaces/bionemo-framework/bionemo-recipes/models/llama3/nucleotide_fast_tokenizer
  micro_batch_size: 1  # Conservative for single GPU
  num_workers: 2
  max_seq_length: 8192  # Full Llama3 context length
  stride: 400  # 400bp overlap for 8K context
  buffer_size: 10_000  # Smaller buffer for faster iteration
  use_lazy_tokenization: true
  load_dataset_kwargs:
    path: "parquet"
    data_files: "/workspaces/bionemo-framework/data/genomic_sequences_2mb.parquet"
    split: "train"
    streaming: true  # Use streaming to avoid loading entire dataset into memory

# Optimizer - higher LR for faster convergence on small model
adamw_kwargs:
  lr: 5e-4  # Higher than default for faster convergence
  fused: true
  betas: [0.9, 0.98]
  eps: 1e-8
  weight_decay: 0.01

# Learning rate scheduler
lr_scheduler_kwargs:
  num_warmup_steps: 100  # Quick warmup (10% of training)
  num_training_steps: 1000

# Checkpoint configuration - disabled for fast convergence testing
checkpoint:
  ckpt_dir: null  # No checkpoints
  save_final_model: false  # Don't save final model
  resume_from_checkpoint: false  # Start fresh for convergence test
  save_every_n_steps: null  # No intermediate checkpoints

# Logging - frequent logging to track convergence
logger:
  frequency: 10  # Log every 10 steps

# WandB configuration
wandb_init_args:
  project: "llama3-genomic-convergence"
  name: "tiny-llama-fsdp2-convergence-test"
  mode: "online"  # Online mode for real-time dashboard
  tags:
    - convergence-test
    - fsdp2
    - tiny-model
    - 10M-params
    - single-node
    - 8192-context

# Meta device and torch compile
use_meta_device: false
use_torch_compile: false  # Disable for debugging

# FP8 configuration - disabled for convergence testing
fp8_config:
  enabled: false
  fp8_recipe: transformer_engine.common.recipe.DelayedScaling
  fp8_format: "HYBRID"
  fp8_recipe_kwargs: {}
  fp8_model_init_kwargs:
    enabled: false
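As a rough sketch of how the dataset block above might be consumed (the recipe's actual dataloader module is not shown in this diff): load_dataset_kwargs unpacks directly into datasets.load_dataset, and buffer_size sizes the streaming shuffle buffer. The seed and the final peek at a record are illustrative only; column names depend on the parquet schema.

# Illustrative sketch: streaming parquet load with a shuffle buffer, per the dataset block above.
from datasets import load_dataset

# streaming=True returns an IterableDataset, so the parquet file is never fully loaded into memory.
ds = load_dataset(
    path="parquet",
    data_files="/workspaces/bionemo-framework/data/genomic_sequences_2mb.parquet",
    split="train",
    streaming=True,
)

# IterableDataset.shuffle() keeps a bounded in-memory buffer; buffer_size mirrors dataset.buffer_size above.
ds = ds.shuffle(seed=42, buffer_size=10_000)

# Peek at the first record to see the raw column layout.
print(next(iter(ds)))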
Lines changed: 44 additions & 0 deletions
defaults:
  - defaults
  - _self_

# Training config
model_tag: ./small_llama_config  # Use small Llama config for testing (4 layers, 2048 hidden)
num_train_steps: 250

# We want this on in CI/CD to validate that the script runs successfully with torch.compile.
use_torch_compile: false  # Disable for faster startup during testing

dataset:
  tokenizer_path: /workspaces/bionemo-framework/bionemo-recipes/models/llama3/nucleotide_fast_tokenizer
  micro_batch_size: 1  # Small batch size for limited GPU memory
  num_workers: 1
  max_seq_length: 1024  # Smaller window for testing
  stride: 100  # Smaller stride for testing
  buffer_size: 10_000  # Smaller buffer for testing
  use_lazy_tokenization: true
  load_dataset_kwargs:
    path: "parquet"
    split: "train"
    data_files: "test_genomic_sequences.parquet"  # Use local test file for now

# WandB config
wandb_init_args:
  name: "llama3_8B_genomic_sanity"
  mode: "offline"

# Learning rate scheduler config
lr_scheduler_kwargs:
  num_warmup_steps: 10  # Shorter warmup for quick testing
  num_training_steps: 250  # Match num_train_steps

checkpoint:
  ckpt_dir: null
  resume_from_checkpoint: true
  save_every_n_steps: 50
  save_final_model: false

logger:
  frequency: 1
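The max_seq_length and stride settings above correspond to overlapping-window tokenization. Below is a hedged sketch of how that looks with the Hugging Face fast tokenizer shipped in this recipe; the recipe's own preprocessing may differ (for example when use_lazy_tokenization is enabled), and the input string is made up.

# Sketch: overlapping-window tokenization driven by max_seq_length and stride from the config above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "/workspaces/bionemo-framework/bionemo-recipes/models/llama3/nucleotide_fast_tokenizer"
)

long_sequence = "ACGT" * 2000  # made-up genomic string, longer than one window

windows = tokenizer(
    long_sequence,
    max_length=1024,                 # dataset.max_seq_length in this sanity config
    stride=100,                      # dataset.stride: tokens of overlap between consecutive windows
    truncation=True,
    return_overflowing_tokens=True,  # emit every window instead of only the first
)

print(len(windows["input_ids"]), "windows of up to 1024 tokens each")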
Lines changed: 78 additions & 0 deletions
# Training config
model_tag: ???  # E.g., meta-llama/Meta-Llama-3-8B or a local path
num_train_steps: ???

# TODO: Once BIONEMO-2583 and BIONEMO-2719 are fixed, enable this by default and simplify training scripts to remove the
# meta-device conditional.
use_meta_device: false

# Whether to wrap the model in torch.compile. Note, this is currently not supported with mfsdp (BIONEMO-2977).
# We leave this off by default since we don't see much of a performance improvement with TE layers.
use_torch_compile: false

dataset:
  tokenizer_path: /workspaces/bionemo-framework/bionemo-recipes/models/llama3/nucleotide_fast_tokenizer
  micro_batch_size: ???
  num_workers: 1
  max_seq_length: 8192  # Window size for genomic sequences
  stride: 200  # Overlap for windowing
  buffer_size: 500_000  # Shuffle buffer size
  use_lazy_tokenization: true
  load_dataset_kwargs:
    path: "parquet"
    split: "train"
    streaming: True

# WandB config
wandb_init_args:
  name: ???

# mFSDP config
fully_shard_kwargs:
  zero_dp_strategy: "optim_grads_params"
  calculate_per_token_loss: false
  init_model_with_meta_device: ${use_meta_device}
  check_for_nan_in_grad: true
  grad_reduce_in_fp32: false
  preserve_fp32_weights: true
  overlap_grad_reduce: true
  overlap_param_gather: true
  sync_model_each_microbatch: true
  average_in_collective: false

# TransformerEngine FP8 config. See
# https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html for more information on
# supported formats.
fp8_config:
  enabled: false
  fp8_recipe: transformer_engine.common.recipe.DelayedScaling
  fp8_format: "HYBRID"
  fp8_recipe_kwargs: {}
  fp8_model_init_kwargs:
    enabled: false  # If this is set to true, fp8_config.enabled must also be set to true.

# Optimizer config
adamw_kwargs:
  lr: 4e-4
  fused: true
  betas: [0.9, 0.98]
  eps: 1e-8
  weight_decay: 0.01

# Learning rate scheduler config
lr_scheduler_kwargs:
  num_warmup_steps: 2_000
  num_training_steps: 500_000

# Checkpoint config
checkpoint:
  ckpt_dir: ???
  save_final_model: true
  resume_from_checkpoint: true
  save_every_n_steps: 50

logger:
  frequency: 100
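For context on the fp8_config block, here is a rough sketch of how the dotted fp8_recipe path and fp8_format string can be turned into a TransformerEngine recipe and applied around the forward pass. It illustrates the TransformerEngine API, not necessarily how the recipe's training script wires it up; the te.Linear layer and random input are stand-ins.

# Sketch: resolving fp8_config into a TransformerEngine autocast context (assumes a CUDA GPU with FP8 support).
import torch
import transformer_engine.pytorch as te
from hydra.utils import get_class
from transformer_engine.common.recipe import Format

recipe_cls = get_class("transformer_engine.common.recipe.DelayedScaling")  # fp8_config.fp8_recipe
fp8_recipe = recipe_cls(fp8_format=Format.HYBRID)  # fp8_config.fp8_format; fp8_recipe_kwargs would be unpacked here too

# fp8_model_init_kwargs controls whether parameters are allocated directly in FP8 (off by default above).
with te.fp8_model_init(enabled=False):
    layer = te.Linear(256, 256).cuda()  # stand-in for the recipe's TE-based Llama layers

x = torch.randn(16, 256, device="cuda")

# fp8_config.enabled gates this context; the HYBRID format uses E4M3 in the forward pass and E5M2 for gradients.
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out = layer(x)
out.sum().backward()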
