# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-Apache2
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
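

# NOTE: the pytest fixture that provides `tokenizer` is defined outside this
# excerpt. The expected IDs below (A=65, C=67, G=71, T=84; BOS=2, EOS=0, PAD=1)
# imply a byte-level scheme in which each base maps to its ASCII code. A
# hypothetical minimal stand-in for the encode path (illustration only, not
# the real tokenizer under test):
class _AsciiEncodeSketch:
    bos_token_id = 2
    eos_token_id = 0
    pad_token_id = 1

    def encode(self, sequence, add_special_tokens=True):
        ids = [ord(base) for base in sequence]  # "A" -> 65, "C" -> 67, ...
        if add_special_tokens:
            ids = [self.bos_token_id, *ids, self.eos_token_id]
        return ids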


def test_tokenizer_encode_simple_sequences(tokenizer):
    """Test encoding simple repeated-character sequences."""
    sequence = "AAAA"
    encoded = tokenizer.encode(sequence, add_special_tokens=True)

    # Expected: BOS + AAAA + EOS = [2, 65, 65, 65, 65, 0]
    expected = [2, 65, 65, 65, 65, 0]
    assert encoded == expected

    sequence = "C"
    encoded = tokenizer.encode(sequence, add_special_tokens=True)

    # Expected: BOS + C + EOS = [2, 67, 0]
    expected = [2, 67, 0]
    assert encoded == expected

    sequence = "G" * 20
    encoded = tokenizer.encode(sequence, add_special_tokens=True)
    expected = [2] + [71] * 20 + [0]
    assert encoded == expected


def test_tokenizer_encode_without_special_tokens(tokenizer):
    """Test encoding without BOS/EOS tokens."""
    sequence = "TTTT"
    encoded = tokenizer.encode(sequence, add_special_tokens=False)

    # Expected: just the Ts (T=84)
    expected = [84, 84, 84, 84]
    assert encoded == expected


def test_tokenizer_roundtrip_encode_decode(tokenizer):
    sequence = "ATCGATCG"
    encoded = tokenizer.encode(sequence, add_special_tokens=True)
    decoded = tokenizer.decode(encoded, skip_special_tokens=True)

    # Decoded may have spaces between tokens, so compare without spaces
    assert sequence == decoded.replace(" ", "")


def test_tokenizer_padding_to_longest(tokenizer):
    """Test padding pads to the longest sequence in the batch."""
    batch = tokenizer(["AAAA", "TTTTTTTT"], padding=True, add_special_tokens=True, return_tensors="pt")

    # AAAA → [2, 65, 65, 65, 65, 0] = 6 tokens
    # TTTTTTTT → [2, 84, 84, 84, 84, 84, 84, 84, 84, 0] = 10 tokens
    # Should pad to 10
    assert batch["input_ids"].shape == torch.Size([2, 10])

    # First sequence should have padding (PAD=1)
    assert batch["input_ids"][0, 6].item() == 1  # First padding position
    assert batch["input_ids"][0, 9].item() == 1  # Last padding position

    # Attention mask: 1 for real tokens, 0 for padding
    assert batch["attention_mask"][0, 5].item() == 1  # Last real token
    assert batch["attention_mask"][0, 6].item() == 0  # First padding
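    # For reference, the padded first row is then expected to be
    # [2, 65, 65, 65, 65, 0, 1, 1, 1, 1]: BOS, four As, EOS, four PADs.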


def test_tokenizer_attention_mask_correct(tokenizer):
    """Test attention mask is 1 for real tokens, 0 for padding."""
    batch = tokenizer(["GG", "GGGGGG"], padding=True, add_special_tokens=True, return_tensors="pt")

    # GG → 4 tokens (BOS + GG + EOS)
    # GGGGGG → 8 tokens (BOS + GGGGGG + EOS)
    # Padded to 8 tokens

    # First sequence: 4 real + 4 padding
    expected_mask_0 = [1, 1, 1, 1, 0, 0, 0, 0]
    assert batch["attention_mask"][0].tolist() == expected_mask_0

    # Second sequence: all real
    expected_mask_1 = [1, 1, 1, 1, 1, 1, 1, 1]
    assert batch["attention_mask"][1].tolist() == expected_mask_1


def test_tokenizer_mixed_nucleotides(tokenizer):
    """Test all standard nucleotides encode correctly."""
    sequence = "ATCGGTC"
    encoded = tokenizer.encode(sequence, add_special_tokens=False)

    # A=65, T=84, C=67, G=71
    # ATCGGTC = A, T, C, G, G, T, C
    expected = [65, 84, 67, 71, 71, 84, 67]
    assert encoded == expected


def test_10kbp_sequence_creates_expected_window_count(tokenizer):
    """Test 10kbp sequence creates correct number of windows with seq_length=1000, stride=800.

    Verifies windowing math: 10000bp with seq_length=1000, stride=800.
    """
    sequence = "A" * 10000  # 10kbp

    result = tokenizer(
        sequence,
        max_length=1000,
        stride=800,  # 800 token overlap
        truncation=True,
        return_overflowing_tokens=True,
        add_special_tokens=True,
    )

    # Hardcoded expectation based on input data:
    # 10000bp with 1000 token windows and 800 token stride
    # Step forward = 1000 - 800 = 200 tokens per window
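    # In general, n_windows ≈ ceil((total_tokens - window) / step) + 1, so a
    # smaller step (i.e. more overlap) yields more windows over the same input;
    # the exact count also depends on how BOS/EOS are accounted per window.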


def test_overlapping_windows_creates_more_samples(tokenizer):
    """Test overlapping stride creates more windows than less overlapping."""
    sequence = "ATCG" * 2500  # 10kbp

    result_more_overlap = tokenizer(
        sequence,
        max_length=1000,
        # ... (stride and truncation arguments not shown in this excerpt)
        return_overflowing_tokens=True,
        add_special_tokens=True,
    )

    result_less_overlap = tokenizer(
        sequence,
        max_length=1000,
        # ... (stride and truncation arguments not shown in this excerpt)
        return_overflowing_tokens=True,
        add_special_tokens=True,
    )

    # Hardcoded expectations
    assert len(result_more_overlap["input_ids"]) == 47  # With more overlap (smaller step)
    assert len(result_less_overlap["input_ids"]) == 20  # With less overlap (larger step)
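    # 47 > 20 follows directly from the window arithmetic above: the first call
    # uses a smaller step (window minus overlap), so covering the same 10kbp
    # input requires more windows.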


def test_production_window_length_creates_expected_samples(tokenizer):
    """Test production settings (8192 window, 200 overlap) create correct number of windows."""
    sequence = "A" * 50000  # 50kbp sequence

    result = tokenizer(
        sequence,
        max_length=8192,
        stride=200,  # 200 token overlap
        truncation=True,
        return_overflowing_tokens=True,
        add_special_tokens=True,
    )

    # Hardcoded expectation with production settings:
    # 50000bp with 8192 window and 200 stride (overlap)
    # Step forward = 8192 - 200 = 7992 tokens per window
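    # By that arithmetic, the 50000 - 8192 = 41808 tokens beyond the first
    # window require ceil(41808 / 7992) = 6 further windows.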


def test_short_sequences_dont_overflow(tokenizer):
    """Test that short sequences (< max_length) don't create overflow windows."""
    sequence = "ATCG" * 100  # 400bp

    result = tokenizer(
        sequence,
        max_length=1000,
        # ... (stride argument not shown in this excerpt)
        truncation=True,
        return_overflowing_tokens=True,
        add_special_tokens=True,
    )

    # Sequence is shorter than max_length, should only create 1 window
    assert len(result["input_ids"]) == 1
    # Length should be 400bp + BOS + EOS = 402 tokens


def test_bos_eos_in_overlapping_windows(tokenizer):
    """Test that BOS/EOS tokens are added to every overlapping window.

    Verifies that when using return_overflowing_tokens with add_special_tokens=True,
    each window gets its own BOS and EOS tokens, treating each as an independent sequence.
    This matches the behavior needed for causal language modeling training.
    """
    # Use a short genomic sequence that will produce exactly 2 overlapping windows
    # With max_length=7 and stride=4, sequence of 8bp should give 2 windows
    sequence = "ATCGATCG"  # 8bp

    result = tokenizer(
        sequence,
        max_length=7,  # BOS + 5 content + EOS = 7 tokens total
        stride=4,  # Overlap of 4 tokens between windows
        truncation=True,
        return_overflowing_tokens=True,
        add_special_tokens=True,
    )

    # Should produce at least 2 windows
    num_windows = len(result["input_ids"])
    assert num_windows >= 2, f"Should produce at least 2 overlapping windows, got {num_windows}"

    first_window = result["input_ids"][0]
    second_window = result["input_ids"][1]

    # Verify both windows have BOS at start and EOS at end
    assert first_window[0] == tokenizer.bos_token_id
    assert first_window[-1] == tokenizer.eos_token_id
    assert second_window[0] == tokenizer.bos_token_id
    assert second_window[-1] == tokenizer.eos_token_id

    # Verify windows are actually overlapping by checking they share some content
    first_content = set(first_window[1:-1])
    second_content = set(second_window[1:-1])
    assert len(first_content & second_content) > 0
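    # Note: the set-based check above is a loose proxy for overlap. Any two
    # windows drawn from the same ACGT alphabet share token values, so it
    # cannot distinguish true positional overlap from mere alphabet reuse.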