
Commit ce56fce

Jonathan Mitchell authored and committed
Context Parallel Squashed MR for recipes
- removed models dir

Signed-off-by: Jonathan Mitchell <[email protected]>
1 parent e21115d commit ce56fce

File tree

9 files changed: +1457, -12 lines


bionemo-recipes/models/esm2/tests/test_cp.py

Lines changed: 387 additions & 0 deletions
Large diffs are not rendered by default.

bionemo-recipes/recipes/esm2_native_te/README.md

Lines changed: 22 additions & 1 deletion
@@ -17,7 +17,7 @@ bionemo-framework repository. You can download a zipped directory of this folder

| Model | BF16 | FP8<sup>[1]</sup> | THD Input Format | FP8 with THD Input Format | MXFP8<sup>[2]</sup> | Context Parallelism |
| ----------------------------------------- | ---- | ----------------- | ---------------- | ------------------------- | ------------------- | ------------------- |
-| [ESM-2](../../models/esm2/README.md) |||||| 🚧 |
+| [ESM-2](../../models/esm2/README.md) |||||| ✅ |
| [AMPLIFY](../../models/amplify/README.md) ||| 🚧 ||| 🚧 |

✅: Supported <br/>
@@ -88,6 +88,27 @@ python train_fsdp2.py --config-name L0_sanity \
    use_sequence_packing=true
```

### Context Parallelism

We provide a training script [train_ddp_cp](./train_ddp_cp.py) and a sample config [L0_sanity_cp](./hydra_config/L0_sanity_cp.yaml) that use context parallelism.

In the config, the `cp_size` argument sets the size of each context parallel group. When paired with Distributed Data Parallelism (DDP), the number of DDP groups (each with its own dataloader) is `world_size // cp_size`, and the ranks within each DDP group form one context parallel group of size `cp_size`.

For example, if a user has 8 processes and sets `cp_size=2`, they will have 4 DDP groups, each containing a CP group of 2 ranks. During dataloading we make no assumptions about whether the data pipeline is deterministic. We simply give each DDP group its own unique data and then select the relevant CP shard for each rank of its CP group.
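
As a rough sketch of the resulting layout (illustrative only; the actual group setup lives in `train_ddp_cp.py` and may differ), the relationship between DDP replicas and CP groups can be pictured as a 2-D device mesh:

```python
from torch.distributed.device_mesh import init_device_mesh

# Illustrative sketch: assumes this runs under torchrun with 8 GPUs and that
# cp_size evenly divides the world size.
world_size = 8
cp_size = 2
dp_size = world_size // cp_size  # 4 DDP groups, each with its own dataloader

# Rows of the mesh are DDP replicas; columns are the CP ranks inside one replica.
mesh = init_device_mesh("cuda", (dp_size, cp_size), mesh_dim_names=("dp", "cp"))

dp_group = mesh.get_group("dp")  # gradients are averaged across this group
cp_group = mesh.get_group("cp")  # ring attention / dual chunk swapping use this group
```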

For example, suppose we have 2 DDP groups, each with a CP group of 2 ranks. Each DDP group has its own dataloader: DP0 for DDP group 0 and DP1 for DDP group 1. CP works by running ring attention, which expects the tokens of each sequence to be laid out across the devices of a CP group in a particular way. This implementation uses [Dual Chunk Swapping](https://github.com/NVIDIA/TransformerEngine/blob/1df4a69f761672f633d40ea3605327087d1ea737/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py#L3714-L3770). If DP0 outputs the sequence `1 2 3 4 5 6 7 8` and DP1 outputs `9 10 11 12 13 14 15 16`, then when these batches run through the `CPAwareDataloader` defined in [dataset.py](./dataset.py), the dataloader creates CP shards from each DP group's batch as follows:

```
    |   DP0   |      DP1       |
CP0 | 1,2,7,8 | 9, 10, 15, 16  |
CP1 | 3,4,5,6 | 11, 12, 13, 14 |
```

You may look at these shards and wonder why they are laid out this way. We did. The reason is that each sequence is sharded across CP ranks using slices: the full input sequence (such as `1 2 3 4 5 6 7 8`) is sliced into `2 * cp_size` equal chunks. CP0 then takes the first and last chunk, while CP1 takes the middle chunks of each sequence. (In general, CP rank `r` takes chunk `r` and chunk `2 * cp_size - 1 - r`.)

In this example we only show one sequence, but it is important to note that the slicing is applied to every sequence: if a second sequence is present, it is sliced in the same manner, with CP0 taking the first and last chunk of every sequence and CP1 the middle chunks.
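
To make the slicing concrete, the standalone snippet below (illustrative, not part of the recipe) reproduces the CP0/CP1 shards in the table above for `cp_size=2`:

```python
import torch

cp_size = 2
seq = torch.arange(1, 9)        # tokens 1..8 from dataloader DP0
num_chunks = 2 * cp_size        # each sequence is cut into 2 * cp_size equal chunks
chunks = seq.chunk(num_chunks)  # (1,2), (3,4), (5,6), (7,8)

for cp_rank in range(cp_size):
    # CP rank r takes chunk r and its mirror chunk (num_chunks - 1 - r).
    shard = torch.cat([chunks[cp_rank], chunks[num_chunks - 1 - cp_rank]])
    print(f"CP{cp_rank}: {shard.tolist()}")

# CP0: [1, 2, 7, 8]
# CP1: [3, 4, 5, 6]
```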

### Comparing Against the HF Transformers Reference Implementation

To launch training with the ESM-2 model as implemented in HF Transformers, pass a `facebook/esm2` checkpoint as the

bionemo-recipes/recipes/esm2_native_te/collator.py

Lines changed: 163 additions & 1 deletion
@@ -20,12 +20,14 @@

import logging
from dataclasses import dataclass
-from typing import Any
+from typing import Any, Callable, Optional

import datasets
import torch
from transformers import DataCollatorForLanguageModeling, DefaultDataCollator, PreTrainedTokenizerBase

from transformer_engine.pytorch.attention.dot_product_attention.context_parallel import pad_thd_sequences_for_cp


logger = logging.getLogger(__name__)

@@ -38,6 +40,7 @@ class MLMDataCollatorWithFlattening:
    2. Then applying MLM masking to the flattened sequence
    3. Providing Flash Attention metadata (cu_seq_lens) for sequence boundary awareness
    4. Optionally padding the total sequence length to be divisible by a specified number
    5. Optionally padding each individual sequence to be divisible by a specified number

    The result is a THD-format batch optimized for Flash Attention with sequence packing,
    eliminating the need for traditional attention masks while maintaining proper sequence
@@ -62,6 +65,9 @@ class MLMDataCollatorWithFlattening:
        seed (int | None): Random seed for reproducible masking. Defaults to None.
        pad_to_multiple_of (int | None): If set, pads the total sequence length to be divisible
            by this number by adding a mock sequence at the end. Defaults to None.
        pad_sequences_to_be_divisible_by (int | None): If set, pads each individual sequence to be
            divisible by this number by appending padding tokens whose labels are set to -100.
            Defaults to None. This is used by context parallelism.

    Example:
        >>> from transformers import AutoTokenizer
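
As a usage sketch (hedged: it assumes the constructor defaults shown in this diff and an illustrative ESM-2 tokenizer checkpoint), `pad_sequences_to_be_divisible_by` would typically be set to `2 * cp_size` so that every packed sequence splits evenly into the dual chunks used by context parallelism:

```python
from transformers import AutoTokenizer

cp_size = 2
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")  # illustrative checkpoint

collator = MLMDataCollatorWithFlattening(
    tokenizer=tokenizer,
    pad_sequences_to_be_divisible_by=2 * cp_size,
)

batch = collator([{"input_ids": tokenizer(seq)["input_ids"]} for seq in ("MKTAYIAKQR", "GAVLIPFW")])

# Every entry of cu_seq_lens_q_padded is now a multiple of 2 * cp_size, so each
# packed sequence can later be cut into 2 * cp_size equal slices.
print(batch["cu_seq_lens_q_padded"])
```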
@@ -111,6 +117,7 @@ def __init__(
        return_position_ids: bool = False,
        bshd_equivalent: bool = False,
        bshd_pad_to_multiple_of: int | None = None,
        pad_sequences_to_be_divisible_by: int | None = None,
    ):
        """Initialize the MLMDataCollatorWithFlattening.
@@ -129,6 +136,9 @@ def __init__(
                collator, at the expense of additional computation time. Defaults to False.
            bshd_pad_to_multiple_of (int | None): For the bshd_equivalent mode, mimics padding that would be done by the
                BSHD collator. Defaults to None.
            pad_sequences_to_be_divisible_by (int | None): If set, pads each individual sequence to be
                divisible by this number by appending padding tokens whose labels are set to -100.
                Defaults to None. This is used by context parallelism.
        """
        self.mlm_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
@@ -145,6 +155,10 @@ def __init__(
        self.return_position_ids = return_position_ids
        self.bshd_equivalent = bshd_equivalent
        self.bshd_pad_to_multiple_of = bshd_pad_to_multiple_of
        self.pad_sequences_to_be_divisible_by = pad_sequences_to_be_divisible_by

        if self.pad_sequences_to_be_divisible_by is not None and self.pad_to_multiple_of is not None:
            raise ValueError("pad_sequences_to_be_divisible_by and pad_to_multiple_of cannot be used together")

        if bshd_pad_to_multiple_of is not None and not bshd_equivalent:
            raise ValueError("bshd_pad_to_multiple_of can only be used when bshd_equivalent is True")
@@ -227,6 +241,23 @@ def __call__(self, features, return_tensors=None):
        if self.pad_to_multiple_of is not None:
            batch = self._pad_batch_to_multiple_of(batch)

        elif self.pad_sequences_to_be_divisible_by is not None:
            input_ids_padded, labels_padded, cu_seqlens_padded = pad_thd_sequences_for_cp(
                batch["input_ids"],
                batch["labels"],
                batch["cu_seq_lens_q"],
                self.pad_sequences_to_be_divisible_by,
                padding_token_id=int(self.mlm_collator.tokenizer.pad_token_id),
                padding_label_id=-100,
            )
            batch["input_ids"] = input_ids_padded.unsqueeze(0)
            batch["labels"] = labels_padded.unsqueeze(0)
            batch["cu_seq_lens_q_padded"] = cu_seqlens_padded.to(torch.int32)
            batch["cu_seq_lens_k_padded"] = cu_seqlens_padded.to(torch.int32)

        return batch

    def bshd_compatible_call(self, features, return_tensors=None):
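
For intuition, the snippet below is a toy, self-contained illustration of what this per-sequence padding produces (it is not the TransformerEngine implementation; the real code path calls `pad_thd_sequences_for_cp`, imported above):

```python
import torch


def pad_each_sequence_to_multiple(input_ids, labels, cu_seqlens, multiple, pad_id, pad_label=-100):
    """Toy illustration: pad every packed sequence up to the next multiple of `multiple`."""
    ids_out, labels_out, cu_out = [], [], [0]
    for start, end in zip(cu_seqlens[:-1].tolist(), cu_seqlens[1:].tolist()):
        length = end - start
        padded_len = -(-length // multiple) * multiple  # ceil to the next multiple
        pad = padded_len - length
        ids_out.append(torch.cat([input_ids[start:end], torch.full((pad,), pad_id)]))
        labels_out.append(torch.cat([labels[start:end], torch.full((pad,), pad_label)]))
        cu_out.append(cu_out[-1] + padded_len)
    return torch.cat(ids_out), torch.cat(labels_out), torch.tensor(cu_out, dtype=torch.int32)


# Two packed sequences of lengths 5 and 3, padded so each divides by 4 (2 * cp_size for cp_size=2).
ids = torch.arange(1, 9)
labels = torch.full((8,), -100)
cu = torch.tensor([0, 5, 8])
ids_p, labels_p, cu_p = pad_each_sequence_to_multiple(ids, labels, cu, multiple=4, pad_id=0)
print(cu_p.tolist())  # [0, 8, 12]: per-sequence lengths go from 5 -> 8 and 3 -> 4
```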
@@ -269,6 +300,36 @@ def _pad_batch_to_multiple_of(self, batch):
        )


class MLMDataCollatorWithFlatteningCPAware:
    """A collator that is aware of context parallelism."""

    def __init__(self, collator: MLMDataCollatorWithFlattening, cp_world_size: int):
        self.collator = collator
        self.cp_world_size = cp_world_size

    def __call__(self, features):
        batch = self.collator(features)

        combined_batch = []
        for cp_rank in range(self.cp_world_size):
            input_ids_sharded, labels_sharded = split_batch_by_cp_rank(
                cu_seqlens_padded=batch["cu_seq_lens_q_padded"],
                input_ids_padded=batch["input_ids"],
                labels_padded=batch["labels"],
                qvk_format="thd",
                cp_rank=cp_rank,
                cp_world_size=self.cp_world_size,
            )
            batch_shard = dict(batch)
            batch_shard["input_ids"] = input_ids_sharded
            batch_shard["labels"] = labels_sharded
            # Now determine the max sequence length, rounded up to a multiple of 64.
            seqlens_q = batch_shard["cu_seq_lens_q_padded"][1:] - batch_shard["cu_seq_lens_q_padded"][:-1]
            batch_shard["max_length_q"] = int((seqlens_q.max().item() + 63) // 64 * 64)  # TODO(@jomitchell): Not sure if I need this anymore.
            batch_shard["max_length_k"] = batch_shard["max_length_q"]
            combined_batch.append(batch_shard)

        return combined_batch  # [<cp_rank_0_shard>, <cp_rank_1_shard>, ..., <cp_rank_n_shard>]
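
A hypothetical wiring sketch follows (the actual wiring lives in `dataset.py` and `train_ddp_cp.py`, and this assumes the collator defaults shown in this diff): the wrapper produces one shard per CP rank, and rank `r` keeps `shards[r]`.

```python
from transformers import AutoTokenizer

cp_world_size = 2
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")  # illustrative checkpoint

base_collator = MLMDataCollatorWithFlattening(
    tokenizer=tokenizer,
    pad_sequences_to_be_divisible_by=2 * cp_world_size,
)
cp_collator = MLMDataCollatorWithFlatteningCPAware(base_collator, cp_world_size=cp_world_size)

features = [{"input_ids": tokenizer(seq)["input_ids"]} for seq in ("MKTAYIAKQR", "GAVLIPFW")]
shards = cp_collator(features)

# One dict per CP rank; rank r would feed shards[r] to the TE model on its GPU.
assert len(shards) == cp_world_size
print(shards[0]["input_ids"].shape, shards[1]["input_ids"].shape)
```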

@dataclass
class DataCollatorWithFlattening(DefaultDataCollator):
    """Data collator for sequence packing with flash attentions cu_seqlens-style attention.
@@ -441,3 +502,104 @@ def _pt_pad_to_multiple_of(batch: dict[str, Any], pad_to_multiple_of: int, token
    )

    return batch


# TODO(@jomitchell): Once this gets merged: https://github.com/NVIDIA/TransformerEngine/pull/2387
# we can replace this with the one in TransformerEngine.
def split_batch_by_cp_rank(
    cu_seqlens_padded: torch.Tensor,
    input_ids_padded: torch.Tensor,
    labels_padded: torch.Tensor,
    cp_group: Optional[torch.distributed.ProcessGroup] = None,
    qvk_format: str = "thd",
    cp_rank: Optional[int] = None,
    cp_world_size: Optional[int] = None,
):
    """Slice a THD-format batch along the sequence dimension into chunks that are parallelized across GPUs in a context parallel group.

    This function is intended for use in self attention. It will not work for cross attention because
    it does not handle the case where the sequence lengths of the query and key are different.
    This version works with variable-length sequences using cumulative sequence lengths.

    Args:
        cp_rank: Optional manual CP rank index. When provided, the function shards tensors as if it
            were executing on that rank without querying `torch.distributed.get_rank`.
    """
    if qvk_format not in ["thd", "bshd", "sbhd"]:
        raise ValueError(f"Unsupported qvk_format: {qvk_format}!")
    if qvk_format == "thd":
        # Get context parallel size and rank
        if cp_world_size > 1:
            if cp_rank is None:
                cp_rank = torch.distributed.get_rank(group=cp_group)
            elif not (0 <= cp_rank < cp_world_size):
                raise ValueError(f"cp_rank must be in [0, {cp_world_size}), but received {cp_rank}.")

        # Calculate the chunk sizes for each sequence
        total_slices_of_any_sequence = 2 * cp_world_size
        slice_sizes = (cu_seqlens_padded[1:] - cu_seqlens_padded[:-1]) // total_slices_of_any_sequence

        # Process each tensor directly instead of using a keys_to_change loop
        def process_tensor(val):
            if val is None:
                return val
            # Determine which dimension is the sequence dimension.
            # Ensure cu_seqlens_padded[-1] is a Python int, not a 0-dim tensor.
            if isinstance(cu_seqlens_padded[-1], torch.Tensor):
                seq_len_val = cu_seqlens_padded[-1].item()
            else:
                seq_len_val = cu_seqlens_padded[-1]

            # Handle 1D tensors (like position_ids that don't have a batch dimension)
            if val.ndim == 1:
                if val.shape[0] == seq_len_val:
                    current_seq_dim = 0
                else:
                    raise ValueError(
                        "1D tensor shape doesn't match expected sequence length. Make sure the"
                        " inputs are in THD format and padded correctly."
                    )
            elif val.ndim >= 2:
                if val.shape[1] == seq_len_val:
                    current_seq_dim = 1
                elif val.shape[0] == seq_len_val:
                    current_seq_dim = 0
                else:
                    raise ValueError("Make sure the inputs are in THD format and padded correctly.")
            else:
                raise ValueError("Tensor must be at least 1D")

            # On this particular rank, for each sequence, take two slices: one from the beginning
            # and one from the end (the dual-chunk layout).
            cp_rank_slices = []
            for slice_size, seq_start in zip(slice_sizes, cu_seqlens_padded[:-1]):
                # 1st segment
                cp_rank_slices.append(
                    torch.arange(
                        seq_start + (cp_rank * slice_size),
                        seq_start + ((cp_rank + 1) * slice_size),
                        device=val.device,
                    )
                )
                # 2nd segment
                cp_rank_slices.append(
                    torch.arange(
                        seq_start + ((total_slices_of_any_sequence - cp_rank - 1) * slice_size),
                        seq_start + ((total_slices_of_any_sequence - cp_rank) * slice_size),
                        device=val.device,
                    )
                )

            return val.index_select(current_seq_dim, torch.cat(cp_rank_slices))

        # Process each tensor directly
        input_ids_padded = process_tensor(input_ids_padded)
        labels_padded = process_tensor(labels_padded)
    else:
        raise ValueError(f"Support not implemented yet for qvk_format: {qvk_format}!")

    return input_ids_padded, labels_padded
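
A quick illustrative sanity check with a single packed sequence of 8 tokens and `cp_world_size=2` reproduces the dual-chunk layout documented in the recipe README:

```python
import torch

input_ids = torch.arange(1, 9)  # tokens 1..8 in THD format, one sequence
labels = torch.arange(1, 9)
cu_seqlens_padded = torch.tensor([0, 8])

for rank in range(2):
    ids, _ = split_batch_by_cp_rank(
        cu_seqlens_padded=cu_seqlens_padded,
        input_ids_padded=input_ids,
        labels_padded=labels,
        qvk_format="thd",
        cp_rank=rank,
        cp_world_size=2,
    )
    print(f"CP{rank}: {ids.tolist()}")

# CP0: [1, 2, 7, 8]
# CP1: [3, 4, 5, 6]
```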
