
Commit 17ae12c

Fix fp8 inference with sequence parallelism
1 parent 9a79a45 commit 17ae12c

File tree

megatron/core/fp8_utils.py
tests/unit_tests/inference/engines/test_dynamic_engine.py

2 files changed: +85 -3 lines


megatron/core/fp8_utils.py

Lines changed: 49 additions & 0 deletions
@@ -10,6 +10,12 @@
 import torch
 
 from megatron.core.enums import Fp8Recipe
+from megatron.core.tensor_parallel import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+    gather_from_sequence_parallel_region,
+    reduce_scatter_to_sequence_parallel_region,
+)
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import get_te_version, is_te_min_version
 
@@ -112,6 +118,27 @@ def get_fp8_align_size(fp8_recipe: Fp8Recipe) -> int:
     return 16
 
 
+def is_column_parallel_linear(module):
+    """Returns whether the given module is a ColumnParallelLinear layer."""
+    if HAVE_TE and (
+        isinstance(module, TEColumnParallelLinear)
+        or isinstance(module, TELayerNormColumnParallelLinear)
+    ):
+        return True
+    elif isinstance(module, ColumnParallelLinear):
+        return True
+    return False
+
+
+def is_row_parallel_linear(module):
+    """Returns whether the given module is a RowParallelLinear layer."""
+    if HAVE_TE and isinstance(module, TERowParallelLinear):
+        return True
+    elif isinstance(module, RowParallelLinear):
+        return True
+    return False
+
+
 """
 The code below abstracts the functionalities needed for implementing "--fp8-param-gather" into
 several functions. It provides different implementations for each function based on different
@@ -587,6 +614,18 @@ def padded_forward(input_tensor, *args, **kwargs):
         if not FP8GlobalStateManager.is_fp8_enabled():
             return original_forward(input_tensor, *args, **kwargs)
 
+        # With sequence parallelism we need to all-gather before padding
+        # and reduce-scatter after unpadding
+        if is_sequence_parallel := getattr(module, "sequence_parallel", False):
+            if is_column_parallel_linear(module):
+                input_tensor = gather_from_sequence_parallel_region(
+                    input_tensor, group=module.tp_group
+                )
+
+            # Disable sequence parallelism on the module because we are handling the
+            # all-gather and reduce-scatter externally
+            module.sequence_parallel = False
+
         seq_len, batch_size, hidden_size = input_tensor.shape
         # Reshape to (S, B*H) to pad sequence dimension
         input_2d = input_tensor.reshape(seq_len, -1)
@@ -612,6 +651,16 @@ def padded_forward(input_tensor, *args, **kwargs):
         unpadded_output_2d = _unpad_func(output_2d, [seq_len])
         unpadded_output = unpadded_output_2d.reshape(seq_len, batch_size, output_hidden_size)
 
+        if is_sequence_parallel:
+            # Reduce-scatter after unpadding
+            if is_row_parallel_linear(module):
+                unpadded_output = reduce_scatter_to_sequence_parallel_region(
+                    unpadded_output, group=module.tp_group
+                )
+
+            # Reset sequence parallelism flag on the module
+            module.sequence_parallel = True
+
         if other_outputs:
             return (unpadded_output,) + other_outputs
         else:
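
Taken together, the two padded_forward hunks wrap the existing pad/unpad logic with the sequence-parallel collectives: all-gather the sequence shards before padding for a column-parallel layer, and reduce-scatter after unpadding for a row-parallel layer. Below is a minimal single-process sketch of that data movement, with the collectives emulated by cat/sum/chunk and the FP8 linear replaced by a plain matmul; it is an illustration only, not Megatron's implementation, and the helper pad_seq is invented for the sketch.

import torch

ALIGN = 16   # what get_fp8_align_size() returns in the hunk above
TP = 2       # emulated tensor/sequence-parallel group size


def pad_seq(x: torch.Tensor, align: int) -> torch.Tensor:
    """Zero-pad the sequence dimension (dim 0) up to a multiple of `align`."""
    seq_len = x.shape[0]
    target = -(-seq_len // align) * align
    if target == seq_len:
        return x
    return torch.cat([x, x.new_zeros(target - seq_len, *x.shape[1:])], dim=0)


# Column-parallel layer under sequence parallelism: the input arrives
# sequence-sharded as (S/TP, B, H) per rank, so it must be all-gathered
# (gather_from_sequence_parallel_region) before the sequence dim is padded.
S_PER_RANK, B, H, OUT = 19, 2, 32, 64
shards = [torch.randn(S_PER_RANK, B, H) for _ in range(TP)]
gathered = torch.cat(shards, dim=0)            # emulated all-gather -> (38, B, H)

padded = pad_seq(gathered, ALIGN)              # (48, B, H): FP8 GEMM wants seq % 16 == 0
col_weight = torch.randn(OUT, H)
col_out = padded @ col_weight.t()              # stand-in for the FP8 linear
col_out = col_out[: gathered.shape[0]]         # unpad back to the true length (38)

# Row-parallel layer under sequence parallelism: each rank produces a partial
# result over its input slice; after unpadding, the reduce-scatter
# (reduce_scatter_to_sequence_parallel_region) sums the partials and re-shards
# the sequence dimension. Emulated here with stack/sum + chunk.
partials = [col_out.clone() for _ in range(TP)]   # pretend each rank computed one
reduced = torch.stack(partials).sum(dim=0)
out_shards = list(reduced.chunk(TP, dim=0))       # (19, B, OUT) shard per rank

print(gathered.shape, padded.shape, out_shards[0].shape)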

tests/unit_tests/inference/engines/test_dynamic_engine.py

Lines changed: 36 additions & 3 deletions
@@ -9,6 +9,7 @@
 import pytest
 import torch
 from tqdm import tqdm
+from transformer_engine.pytorch.fp8 import check_fp8_support
 
 from megatron.core import parallel_state
 from megatron.core.inference.contexts.dynamic_context import (
@@ -31,7 +32,10 @@
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
-from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_layer_local_spec,
+    get_gpt_layer_with_transformer_engine_spec,
+)
 from megatron.core.models.gpt.gpt_model import GPTModel
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord
@@ -89,6 +93,8 @@ class DynamicEngineTestConfig:
     # relevant to the test. The tests only check if the required
     # context attributes are set correctly.
 
+    fp8: bool = False
+
     def __post_init__(self):
 
         # Compute max_sequence_length.
@@ -236,7 +242,7 @@ def _build_test_env(cls, test_config):
         transformer_config = TransformerConfig(
             params_dtype=torch.bfloat16,
             num_layers=4,
-            hidden_size=32,
+            hidden_size=128 if test_config.fp8 else 32,
             num_attention_heads=4,
             use_cpu_initialization=True,
             cuda_graph_impl=(
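
An aside on the hidden_size bump in the hunk above: the commit does not say why, but FP8 GEMMs need dimensions aligned to the 16-element boundary that get_fp8_align_size() reports, and with tensor_model_parallel_size=4 each rank only holds hidden_size / 4 features. The quick check below is my inference, not part of the commit.

# Back-of-the-envelope check (inference, not from the commit): per-rank weight
# partitions should stay aligned to the 16-element FP8 boundary, which rules
# out hidden_size=32 at TP=4 but allows hidden_size=128.
FP8_ALIGN = 16
TP = 4

for hidden_size in (32, 128):
    per_rank = hidden_size // TP
    status = "aligned" if per_rank % FP8_ALIGN == 0 else "misaligned"
    print(f"hidden_size={hidden_size}: {per_rank} features per TP rank -> {status}")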
@@ -259,14 +265,21 @@ def _build_test_env(cls, test_config):
             inference_sampling_seed=test_config.random_seed,
             cuda_graph_scope=test_config.cuda_graph_scope,
         )
+        if test_config.fp8:
+            transformer_config.fp8 = "hybrid"
+            transformer_config.fp8_recipe = "tensorwise"
+            # transformer_config.fp8_param = True
+            layer_spec = get_gpt_layer_with_transformer_engine_spec()
+        else:
+            layer_spec = get_gpt_layer_local_spec()
 
         # Requests.
         requests = cls._build_requests(test_config)
 
         # GPT model.
         model = GPTModel(
             config=transformer_config,
-            transformer_layer_spec=get_gpt_layer_local_spec(),
+            transformer_layer_spec=layer_spec,
             vocab_size=test_config.vocab_size,
             max_sequence_length=test_config.max_sequence_length,
             parallel_output=True,
@@ -286,6 +299,7 @@ def _build_test_env(cls, test_config):
             fp32_residual_connection=False,
             params_dtype=transformer_config.params_dtype,
             padded_vocab_size=test_config.vocab_size,
+            fp8="hybrid" if test_config.fp8 else None,
         )
 
         # Inference context.
@@ -799,6 +813,25 @@ def test_parallel_inference(
             materialize_only_last_token_logits=materialize_only_last_token_logits,
         )
 
+    @pytest.mark.internal
+    @pytest.mark.skipif(
+        not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching"
+    )
+    @pytest.mark.parametrize("materialize_only_last_token_logits", [False, True])
+    def test_sequence_parallel_fp8_inference(self, materialize_only_last_token_logits: bool):
+        fp8_available, reason_for_no_fp8 = check_fp8_support()
+        if not fp8_available:
+            pytest.skip(reason_for_no_fp8)
+
+        self._run_test(
+            min_prompt_length=19,
+            max_prompt_length=19,
+            tensor_model_parallel_size=4,
+            sequence_parallel=True,
+            materialize_only_last_token_logits=True,
+            fp8=True,
+        )
+
     @pytest.mark.internal
     @pytest.mark.skipif(
         not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching"
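
For context on what the new test exercises, the sketch below shows roughly what fp8="hybrid" means at the Transformer Engine level, gated the same way the test is gated. te.Linear, fp8_autocast, check_fp8_support, and Format.HYBRID are TE's public API; DelayedScaling is used only as a stand-in recipe and the shapes are illustrative, so treat this as a sketch rather than a reproduction of what Megatron configures internally.

import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format
from transformer_engine.pytorch.fp8 import check_fp8_support

# Skip on hardware without FP8 support, mirroring the test's guard.
fp8_available, reason = check_fp8_support()
if not fp8_available:
    raise SystemExit(f"FP8 not available: {reason}")

# "hybrid" = E4M3 for forward activations/weights, E5M2 for backward gradients.
recipe = DelayedScaling(fp8_format=Format.HYBRID)

layer = te.Linear(128, 128, params_dtype=torch.bfloat16).cuda()
# Sequence length 32 and hidden size 128 are already multiples of 16, so no
# padding is needed in this toy case; the commit's padded_forward handles the
# general (unaligned, sequence-parallel) case.
x = torch.randn(32, 4, 128, dtype=torch.bfloat16, device="cuda")

with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
    y = layer(x)

print(y.shape, y.dtype)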
