 from transformers.models.esm.modeling_esm import EsmForMaskedLM  # noqa: F401

 from checkpoint import load_checkpoint_fsdp2, save_checkpoint_fsdp2, save_final_model_fsdp2, should_save_checkpoint
-from dataset import create_bshd_dataloader, create_cp_dataloader, create_thd_dataloader
+from dataset import create_cp_dataloader
 from distributed_config import DistributedConfig
 from perf_logger import PerfLogger
 from scheduler import get_linear_schedule_with_warmup
@@ -65,8 +65,6 @@ def main(args: DictConfig) -> float | None:  # noqa: C901
     # Calculate DDP size (number of data parallel replicas)
     ddp_size = dist_config.world_size // args.cp_size

-
-
     # Create a device mesh for DDP and CP.
     # The mesh is organized as [CP_dimension, DDP_dimension] where:
     # - DDP dimension: number of data parallel replicas (world_size // cp_size)
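(Aside: a minimal sketch of how a 2D DDP × CP device mesh like the one described in the comment above can be built with PyTorch's `init_device_mesh`. The helper name, the dimension order, and the sub-mesh usage are illustrative assumptions, not taken from this script.)

```python
# Illustrative sketch only; names and dimension order are assumptions.
from torch.distributed.device_mesh import init_device_mesh


def build_ddp_cp_mesh(world_size: int, cp_size: int):
    """Build a 2D mesh with a data-parallel axis and a context-parallel axis."""
    ddp_size = world_size // cp_size  # number of data-parallel replicas
    # Requires torch.distributed to be initialized with world_size == ddp_size * cp_size.
    mesh = init_device_mesh("cuda", (ddp_size, cp_size), mesh_dim_names=("ddp", "cp"))
    # 1D sub-meshes for the data-parallel and context-parallel process groups.
    return mesh["ddp"], mesh["cp"]
```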
@@ -97,7 +95,9 @@ def main(args: DictConfig) -> float | None:  # noqa: C901
     )

     # Create an empty ESM-2 model with a masked language model head, e.g. "nvidia/esm2_t6_8M_UR50D".
-    config = AutoConfig.from_pretrained(args.model_tag, trust_remote_code=True, token_dropout=False, dtype=torch.bfloat16)
+    config = AutoConfig.from_pretrained(
+        args.model_tag, trust_remote_code=True, token_dropout=False, dtype=torch.bfloat16
+    )
     # If we're using sequence packing with TE layers, we need to pass the `attn_input_format` argument.
     if args.use_sequence_packing:
         config.attn_input_format = "thd"
@@ -136,7 +136,6 @@ def main(args: DictConfig) -> float | None:  # noqa: C901
     for module in model.modules():
         if hasattr(module, "reset_parameters"):
             module.reset_parameters()
-

     # Context Parallelism requires THD Sequence Packing.
     assert args.use_sequence_packing, "Context Parallelism requires THD Sequence Packing."
@@ -148,7 +147,6 @@ def main(args: DictConfig) -> float | None:  # noqa: C901
         cp_rank=cp_rank,
         **args.dataset,
     )
-

     if args.use_torch_compile:
         # If we're using torch.compile, we need to do this before loading the checkpoint to ensure key consistency.
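(Aside on the ordering in that last comment: `torch.compile` wraps a module in an `OptimizedModule`, and in current PyTorch releases the wrapper's state-dict keys typically carry an `_orig_mod.` prefix, which is why compiling before restoring the checkpoint keeps the keys consistent. A minimal sketch under that assumption, using plain `state_dict`/`load_state_dict` rather than this repo's `load_checkpoint_fsdp2`:)

```python
# Sketch of the key-consistency issue; assumes torch.compile prefixes
# state-dict keys with "_orig_mod." via its OptimizedModule wrapper.
import torch
import torch.nn as nn

model = nn.Linear(4, 4)
compiled = torch.compile(model)  # compilation itself is lazy; only the wrapper is created here

print(list(model.state_dict())[0])     # expected: "weight"
print(list(compiled.state_dict())[0])  # expected: "_orig_mod.weight"

# A checkpoint saved from the compiled wrapper loads cleanly only into another
# compiled wrapper, so compile first and then restore the checkpoint.
ckpt = compiled.state_dict()
compiled.load_state_dict(ckpt)
```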