Commit 6bfe76e

add flash-attn

Signed-off-by: Peter St. John <[email protected]>
1 parent 01a36d3

File tree

3 files changed: 35 additions, 7 deletions
Lines changed: 16 additions & 6 deletions
```diff
@@ -1,24 +1,34 @@
-# An example, minimal Dockerfile to install dependencies in a fresh python environment with CUDA support.
+# An example, minimal Dockerfile to install dependencies in a fresh python environment with CUDA support. This image
+# ends up with two copies of CUDA libraries; the first is the one installed by the base image, and the second is brought
+# in when we pip install torch.
 
 FROM nvcr.io/nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04
 
 ENV UV_LINK_MODE=copy
 SHELL ["/bin/bash", "-c"]
 
-RUN mkdir -p /workspace && chown -R ubuntu:ubuntu /workspace
-USER ubuntu
+# Install torch, transformer-engine, and flash-attn
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements.txt,target=/requirements.txt \
     --mount=from=ghcr.io/astral-sh/uv,source=/uv,target=/bin/uv \
     <<EOF
 uv venv --python 3.12 --seed /workspace/.venv
 source /workspace/.venv/bin/activate
 uv pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu130
-uv pip install wheel packaging
+uv pip install wheel packaging psutil
+pip install --no-build-isolation flash-attn
 pip install --no-build-isolation transformer-engine[pytorch]==2.9.0
-uv pip install -r /requirements.txt
 EOF
 
+# Install recipe-specific dependencies
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements.txt,target=/requirements.txt \
+    --mount=from=ghcr.io/astral-sh/uv,source=/uv,target=/bin/uv \
+    uv pip install -r /requirements.txt
+
+USER ubuntu
+RUN chown -R ubuntu:ubuntu /workspace
+
 ENV PATH="/workspace/.venv/bin:$PATH"
 WORKDIR /workspace/bionemo
```
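To sanity-check the new build stages, one can build the image and import both GPU packages. A minimal sketch, assuming the recipe directory as the build context and a hypothetical `esm2-te` tag (neither is part of this commit):

```bash
# Build from the directory containing the Dockerfile and requirements.txt.
docker build -t esm2-te .

# Confirm flash-attn and transformer-engine import against the bundled torch.
docker run --rm --gpus all esm2-te \
    python -c "import flash_attn, transformer_engine.pytorch; print(flash_attn.__version__)"
```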

bionemo-recipes/recipes/esm2_native_te/README.md

Lines changed: 2 additions & 1 deletion
````diff
@@ -50,7 +50,8 @@ CUDA 13.0):
 uv venv --python 3.12 --seed /workspace/.venv
 source /workspace/.venv/bin/activate
 uv pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu130
-uv pip install wheel packaging
+uv pip install wheel packaging psutil
+pip install --no-build-isolation flash-attn
 pip install --no-build-isolation transformer-engine[pytorch]==2.9.0
 uv pip install -r /requirements.txt
 ```
````
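The install order above matters: flash-attn builds from source, and its build setup imports torch, packaging, and psutil, which is why those are installed first and why `--no-build-isolation` makes pip reuse the active environment rather than a clean build sandbox. A quick functional check after installing, sketched here with flash-attn's public `flash_attn_func` API (requires a CUDA GPU; shapes are illustrative):

```bash
python -c "
import torch
from flash_attn import flash_attn_func

# (batch, seqlen, nheads, headdim); flash-attn requires fp16/bf16 tensors on CUDA.
q = torch.randn(1, 128, 8, 64, dtype=torch.bfloat16, device='cuda')
print(flash_attn_func(q, q, q, causal=True).shape)
"
```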

bionemo-recipes/recipes/esm2_native_te/tests/test_distributed_checkpointing.py

Lines changed: 17 additions & 0 deletions
```diff
@@ -73,6 +73,7 @@ def test_checkpoint_save_and_load_single_process_ddp(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=10",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=false",  # Start fresh
@@ -121,6 +122,7 @@ def test_checkpoint_save_and_load_single_process_ddp(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=15",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
@@ -205,6 +207,7 @@ def test_checkpoint_save_and_load_two_processes_ddp(recipe_path, tmp_path):
         "checkpoint.save_every_n_steps=5",
         "checkpoint.resume_from_checkpoint=false",  # Start fresh
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result1 = subprocess.run(cmd_phase1, check=False, capture_output=True, text=True, env=env)
@@ -268,6 +271,7 @@ def test_checkpoint_save_and_load_two_processes_ddp(recipe_path, tmp_path):
         "checkpoint.save_every_n_steps=5",
         "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result2 = subprocess.run(cmd_phase2, check=False, capture_output=True, text=True, env=env)
@@ -346,6 +350,7 @@ def test_checkpoint_save_and_load_single_process_mfsdp(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=10",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=false",  # Start fresh
@@ -390,6 +395,7 @@ def test_checkpoint_save_and_load_single_process_mfsdp(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=15",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
@@ -457,6 +463,7 @@ def test_checkpoint_save_and_load_two_processes_mfsdp(recipe_path, tmp_path):
         "checkpoint.save_every_n_steps=5",
         "checkpoint.resume_from_checkpoint=false",  # Start fresh
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result1 = subprocess.run(cmd_phase1, check=False, capture_output=True, text=True, env=env)
@@ -503,6 +510,7 @@ def test_checkpoint_save_and_load_two_processes_mfsdp(recipe_path, tmp_path):
         "checkpoint.save_every_n_steps=5",
         "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result2 = subprocess.run(cmd_phase2, check=False, capture_output=True, text=True, env=env)
@@ -559,6 +567,7 @@ def test_checkpoint_save_and_load_single_process_fsdp2(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=10",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=false",  # Start fresh
@@ -668,6 +677,7 @@ def test_checkpoint_save_and_load_two_processes_fsdp2(recipe_path, tmp_path):
         "num_train_steps=10",
         "checkpoint.save_every_n_steps=5",
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result1 = subprocess.run(cmd_phase1, check=False, capture_output=True, text=True, env=env)
@@ -714,6 +724,7 @@ def test_checkpoint_save_and_load_two_processes_fsdp2(recipe_path, tmp_path):
         "checkpoint.save_every_n_steps=5",
         "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
         "dataset.use_stateful_dataloader=true",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result2 = subprocess.run(cmd_phase2, check=False, capture_output=True, text=True, env=env)
@@ -797,6 +808,7 @@ def test_final_model_save_mfsdp(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=3",
             "checkpoint.save_final_model=true",
         ],
@@ -831,6 +843,7 @@ def test_final_model_save_fsdp2(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "checkpoint.save_final_model=true",
             "num_train_steps=3",
         ],
@@ -874,6 +887,7 @@ def test_scheduler_resume_single_gpu(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=10",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=false",  # Start fresh, don't look for checkpoints
@@ -891,6 +905,7 @@ def test_scheduler_resume_single_gpu(recipe_path, tmp_path):
         overrides=[
             f"checkpoint.ckpt_dir={temp_dir}",
             f"+wandb_init_args.dir={tmp_path}",
+            f"hydra.run.dir={tmp_path}",
             "num_train_steps=15",
             "checkpoint.save_every_n_steps=5",
             "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
@@ -951,6 +966,7 @@ def test_scheduler_resume_two_gpu(recipe_path, tmp_path):
         "checkpoint.resume_from_checkpoint=false",  # Start fresh, don't look for checkpoints
         "lr_scheduler_kwargs.num_warmup_steps=20",
         "lr_scheduler_kwargs.num_training_steps=100",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result1 = subprocess.run(cmd_phase1, check=False, capture_output=True, text=True, env=env)
@@ -974,6 +990,7 @@ def test_scheduler_resume_two_gpu(recipe_path, tmp_path):
         "checkpoint.resume_from_checkpoint=true",  # Resume from checkpoint
         "lr_scheduler_kwargs.num_warmup_steps=20",
         "lr_scheduler_kwargs.num_training_steps=100",
+        f"hydra.run.dir={tmp_path}",
     ]
 
     result2 = subprocess.run(cmd_phase2, check=False, capture_output=True, text=True, env=env)
```
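Every hunk above adds the same `hydra.run.dir={tmp_path}` override: by default Hydra creates an `outputs/<date>/<time>` run directory under the current working directory, so without the override each test run leaves output directories behind in the repo. Pinning the run dir to pytest's `tmp_path` keeps those artifacts in the per-test temporary directory. The shell equivalent of the overrides, sketched against a hypothetical `train.py` entry point (only the config keys come from the diff):

```bash
RUN_DIR=$(mktemp -d)
torchrun --nproc_per_node=2 train.py \
    checkpoint.ckpt_dir="$RUN_DIR/ckpts" \
    +wandb_init_args.dir="$RUN_DIR" \
    hydra.run.dir="$RUN_DIR" \
    num_train_steps=10 \
    checkpoint.save_every_n_steps=5 \
    checkpoint.resume_from_checkpoint=false
```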
