
Commit 1de5e9d
1 parent: f7fdd1e

starting work for swqa

Signed-off-by: Peter St. John <[email protected]>

updated multistage dockerfile
Signed-off-by: Peter St. John <[email protected]>

revert to single stage image
Signed-off-by: Peter St. John <[email protected]>

add flash-attn
Signed-off-by: Peter St. John <[email protected]>

update readme with local dataset download
Signed-off-by: Peter St. John <[email protected]>

update readme with local dataset download
Signed-off-by: Peter St. John <[email protected]>

update readme with local dataset download
Signed-off-by: Peter St. John <[email protected]>

File tree: 10 files changed (+129 −5 lines)

.devcontainer/recipes/devcontainer.json

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@
       "ms-toolsai.jupyter",
       "eamodio.gitlens",
       "tamasfe.even-better-toml",
-      "streetsidesoftware.code-spell-checker"
+      "streetsidesoftware.code-spell-checker", "ms-azuretools.vscode-docker",
     ]
   }
 }

bionemo-recipes/recipes/esm2_accelerate_te/.dockerignore

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@ __pycache__
 .pytest_cache
 .ruff.toml
 .dockerignore
+.venv/

bionemo-recipes/recipes/esm2_native_te/.dockerignore

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@ __pycache__
 .pytest_cache
 .ruff.toml
 .dockerignore
+.venv/

bionemo-recipes/recipes/esm2_native_te/Dockerfile

Lines changed: 1 addition & 2 deletions
@@ -1,8 +1,7 @@
 # syntax=docker/dockerfile:1.4
 FROM nvcr.io/nvidia/pytorch:25.10-py3
 
-RUN --mount=type=secret,id=netrc,target=/root/.netrc \
-    --mount=type=cache,target=/root/.cache/pip \
+RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=requirements.txt,target=/requirements.txt \
     PIP_CONSTRAINT= pip install -r /requirements.txt
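With the `.netrc` secret mount gone, the image now builds without any build-time secrets; the plain build command documented in the README change below is all that is needed:

```bash
docker build -t esm2_native_te .
```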

bionemo-recipes/recipes/esm2_native_te/Dockerfile.cuda

Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+# An example, minimal Dockerfile to install dependencies in a fresh python environment with CUDA support. This image
+# ends up with two copies of CUDA libraries; the first is the one installed by the base image, and the second is brought
+# in when we pip install torch.
+
+FROM nvcr.io/nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04
+
+ENV UV_LINK_MODE=copy
+SHELL ["/bin/bash", "-c"]
+
+# Install torch, transformer-engine, and flash-attn
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=from=ghcr.io/astral-sh/uv,source=/uv,target=/bin/uv \
+    <<EOF
+uv venv --python 3.12 --seed /workspace/.venv
+source /workspace/.venv/bin/activate
+uv pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu130
+uv pip install wheel packaging psutil
+pip install --no-build-isolation "flash-attn>=2.1.1,<=2.8.1"
+pip install --no-build-isolation transformer-engine[pytorch]==2.9.0
+EOF
+
+# Install recipe-specific dependencies
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements.txt,target=/requirements.txt \
+    --mount=from=ghcr.io/astral-sh/uv,source=/uv,target=/bin/uv \
+    source /workspace/.venv/bin/activate && uv pip install -r /requirements.txt
+
+ENV PATH="/workspace/.venv/bin:$PATH"
+WORKDIR /workspace/bionemo
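As a quick sanity check that the image assembled by this Dockerfile exposes the intended stack, the following hypothetical one-liner (not part of the commit) imports the key packages after building with the command the README adds below:

```bash
docker build -t esm2_native_te_cuda -f Dockerfile.cuda .
docker run --rm --gpus all esm2_native_te_cuda \
  python -c "import torch, flash_attn, transformer_engine.pytorch; print(torch.__version__, torch.cuda.is_available(), flash_attn.__version__)"
```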

bionemo-recipes/recipes/esm2_native_te/README.md

Lines changed: 74 additions & 1 deletion
@@ -27,11 +27,50 @@ bionemo-framework repository. You can download a zipped directory of this folder
 \[1\]: Requires [compute capability](https://developer.nvidia.com/cuda-gpus) 9.0 and above (Hopper+) <br/>
 \[2\]: Requires [compute capability](https://developer.nvidia.com/cuda-gpus) 10.0 and 10.3 (Blackwell), 12.0 support pending <br/>
 
+### Installing Dependencies
+
+The easiest way to get started with this recipe is to use the provided Dockerfile, which is based on the latest NVIDIA
+PyTorch image and provides optimized versions of PyTorch and TransformerEngine. To build the container, run:
+
+```bash
+docker build -t esm2_native_te .
+```
+
+To run the container, run:
+
+```bash
+docker run -it --gpus all --network host --ipc=host --rm -v ${PWD}:/workspace/bionemo esm2_native_te /bin/bash
+```
+
+Alternatively, the dependencies can be installed manually in an environment with CUDA support. See
+[Dockerfile.cuda](Dockerfile.cuda) for the process of installing dependencies in a fresh Python environment (e.g., for
+CUDA 13.0):
+
+```bash
+uv venv --python 3.12 --seed /workspace/.venv
+source /workspace/.venv/bin/activate
+uv pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu130
+uv pip install wheel packaging psutil
+pip install --no-build-isolation "flash-attn>=2.1.1,<=2.8.1"
+pip install --no-build-isolation transformer-engine[pytorch]==2.9.0
+uv pip install -r requirements.txt
+```
+
+To build and run the CUDA base container, run:
+
+```bash
+docker build -t esm2_native_te_cuda -f Dockerfile.cuda .
+docker run -it --gpus all --network host --ipc=host --rm -v ${PWD}:/workspace/bionemo esm2_native_te_cuda /bin/bash -c "pytest -v ."
+```
+
 ### Performance Benchmarks
 
 ![Performance Benchmarks](../../../docs/docs/assets/images/esm2/esm2_native_te_benchmarks.svg)
 
-Note: "compiled" refers to `torch.compile`. "fa2" is [FlashAttention2](https://github.com/Dao-AILab/flash-attention). Recently, we measured 2800 tokens/second/GPU training speed on H100 with HuggingFace Transformers's ESM-2 implementation of THD sequence packing, however we have not been able to make this configuration work on Blackwell and this work is still in progress.
+Note: "compiled" refers to `torch.compile`. "fa2" is [FlashAttention2](https://github.com/Dao-AILab/flash-attention).
+Recently, we measured a training speed of 2,800 tokens/second/GPU on H100 with Hugging Face Transformers' ESM-2
+implementation of THD sequence packing; however, we have not yet been able to make this configuration work on Blackwell,
+and this work is still in progress.
 
 ### Distributed Training
 
@@ -97,6 +136,40 @@ model tag:
 python train_fsdp2.py --config-name L0_sanity model_tag=facebook/esm2_t6_8M_UR50D
 ```
 
+## Downloading Pre-Training Data For Offline Training
+
+An example pre-training dataset for ESM-2 is available in the
+[`nvidia/esm2_uniref_pretraining_data`](https://huggingface.co/datasets/nvidia/esm2_uniref_pretraining_data) Hugging
+Face dataset. This dataset can be [streamed](https://huggingface.co/docs/datasets/en/stream) from the Hugging Face Hub via:
+
+```python
+>>> from datasets import load_dataset
+>>> dataset = load_dataset('nvidia/esm2_uniref_pretraining_data', split='train', streaming=True)
+>>> print(next(iter(dataset)))
+{'sequence': 'MSPRRTGGARPPGPCTPCGPRPRCPSRRSAAARPAPSAAPARRARPGRRPGCRPGTDCPGTARRPGGGP...',
+ 'ur50_id': 'UniRef50_A0A081XN86',
+ 'ur90_id': 'UniRef90_UPI002FBE17D9'}
+```
+
+For large-scale training, the dataset should be downloaded locally via the [huggingface
+CLI](https://huggingface.co/docs/huggingface_hub/guides/download#download-from-the-cli), with appropriate values set for
+the `HF_HOME` and `HF_TOKEN` environment variables. If the CLI is not already installed, install it with
+`uv tool install huggingface_hub`.
+
+```bash
+export HF_TOKEN=<your_huggingface_token>
+hf download nvidia/esm2_uniref_pretraining_data --repo-type dataset --local-dir /path/to/download/directory
+# Test to ensure the dataset can be loaded correctly
+python -c "import datasets; datasets.load_dataset('/path/to/download/directory', split='train', streaming=True)"
+```
+
+Pass the downloaded dataset directory to the training script via the `dataset.load_dataset_kwargs.path` configuration parameter:
+
+```bash
+HF_DATASETS_OFFLINE=1 python train_fsdp2.py --config-name L0_sanity \
+  dataset.load_dataset_kwargs.path=/path/to/download/directory
+```
+
 ## Saving and Loading Checkpoints
 
 To enable checkpoint saving, ensure that `checkpoint.ckpt_dir` is set to a writable directory. Checkpointing frequency is

bionemo-recipes/recipes/esm2_native_te/hydra_config/defaults.yaml

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ checkpoint:
   ckpt_dir: ???
   save_final_model: true
   resume_from_checkpoint: true
-  save_every_n_steps: 50
+  save_every_n_steps: 1_000
 
 logger:
   frequency: 100
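The default checkpoint interval moves from every 50 steps to every 1,000 steps. For short experiments it can still be tightened per run using the same Hydra override syntax used elsewhere in this recipe (the value below is illustrative):

```bash
python train_fsdp2.py --config-name L0_sanity checkpoint.save_every_n_steps=50
```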

bionemo-recipes/recipes/esm2_native_te/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 datasets
 hydra-core
 megatron-fsdp
+pytest
 torch
 torchao!=0.14.0
 torchdata
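With `pytest` now part of the pinned requirements, the test suite can be run inside the container; the README's CUDA-image smoke test added in this commit does exactly that:

```bash
docker run -it --gpus all --network host --ipc=host --rm -v ${PWD}:/workspace/bionemo esm2_native_te_cuda /bin/bash -c "pytest -v ."
```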

bionemo-recipes/recipes/esm2_native_te/tests/test_distributed_checkpointing.py

Lines changed: 17 additions & 0 deletions
@@ -73,6 +73,7 @@ def test_checkpoint_save_and_load_single_process_ddp(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=10",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=false",  # Start fresh
@@ -121,6 +122,7 @@ def test_checkpoint_save_and_load_single_process_ddp(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=15",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
@@ -205,6 +207,7 @@ def test_checkpoint_save_and_load_two_processes_ddp(recipe_path, tmp_path):
         "checkpoint.save_every_n_steps=5",
         "checkpoint.resume_from_checkpoint=false",  # Start fresh
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result1 = subprocess.run(cmd_phase1, check=False, capture_output=True, text=True, env=env)
@@ -268,6 +271,7 @@
         "checkpoint.save_every_n_steps=5",
         "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result2 = subprocess.run(cmd_phase2, check=False, capture_output=True, text=True, env=env)
@@ -346,6 +350,7 @@ def test_checkpoint_save_and_load_single_process_mfsdp(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=10",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=false",  # Start fresh
@@ -390,6 +395,7 @@
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=15",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
@@ -457,6 +463,7 @@ def test_checkpoint_save_and_load_two_processes_mfsdp(recipe_path, tmp_path):
         "checkpoint.save_every_n_steps=5",
         "checkpoint.resume_from_checkpoint=false",  # Start fresh
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result1 = subprocess.run(cmd_phase1, check=False, capture_output=True, text=True, env=env)
@@ -503,6 +510,7 @@
         "checkpoint.save_every_n_steps=5",
         "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result2 = subprocess.run(cmd_phase2, check=False, capture_output=True, text=True, env=env)
@@ -559,6 +567,7 @@ def test_checkpoint_save_and_load_single_process_fsdp2(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=10",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=false",  # Start fresh
@@ -668,6 +677,7 @@ def test_checkpoint_save_and_load_two_processes_fsdp2(recipe_path, tmp_path):
         "num_train_steps=10",
         "checkpoint.save_every_n_steps=5",
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result1 = subprocess.run(cmd_phase1, check=False, capture_output=True, text=True, env=env)
@@ -714,6 +724,7 @@
         "checkpoint.save_every_n_steps=5",
         "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result2 = subprocess.run(cmd_phase2, check=False, capture_output=True, text=True, env=env)
@@ -797,6 +808,7 @@ def test_final_model_save_mfsdp(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=3",
             "checkpoint.save_final_model=true",
         ],
@@ -831,6 +843,7 @@ def test_final_model_save_fsdp2(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "checkpoint.save_final_model=true",
             "num_train_steps=3",
         ],
@@ -874,6 +887,7 @@ def test_scheduler_resume_single_gpu(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=10",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=false",  # Start fresh, don't look for checkpoints
@@ -891,6 +905,7 @@
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=15",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
@@ -951,6 +966,7 @@ def test_scheduler_resume_two_gpu(recipe_path, tmp_path):
         "checkpoint.resume_from_checkpoint=false",  # Start fresh, don't look for checkpoints
         "lr_scheduler_kwargs.num_warmup_steps=20",
         "lr_scheduler_kwargs.num_training_steps=100",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result1 = subprocess.run(cmd_phase1, check=False, capture_output=True, text=True, env=env)
@@ -974,6 +990,7 @@
         "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
         "lr_scheduler_kwargs.num_warmup_steps=20",
         "lr_scheduler_kwargs.num_training_steps=100",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result2 = subprocess.run(cmd_phase2, check=False, capture_output=True, text=True, env=env)
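Each of these hunks adds the same `hydra.run.dir={tmp_path}` override so that Hydra's run directory (the `.hydra/` config snapshot and log files) is created under pytest's temporary directory instead of the repository checkout. The equivalent override on a manual run looks like this (paths and step counts illustrative):

```bash
python train_fsdp2.py --config-name L0_sanity \
  checkpoint.ckpt_dir=/tmp/esm2_ckpt \
  hydra.run.dir=/tmp/esm2_hydra_outputs \
  num_train_steps=10 checkpoint.save_every_n_steps=5
```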

bionemo-recipes/recipes/geneformer_native_te_mfsdp_fp8/.dockerignore

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@ __pycache__
 wandb
 .**
 *.sqsh
+.venv/
