
Commit ca03bfe

Modify the alpha in cutedsl to be a device pointer and refactor code
Signed-off-by: Shijie Wang <[email protected]>
1 parent 32d2ad6 commit ca03bfe

3 files changed: +73 −70 lines
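
At a glance: `alpha`, the GEMM output scale, moves from a host-side Python float that was baked into the runner at construction time to a one-element float32 CUDA tensor that is passed alongside the other operands and dereferenced inside the kernel. A minimal before/after sketch condensed from the diffs below (the operand names `a, b, a_sf, b_sf, alpha_tensor` are placeholders, not names from this commit):

    # Before: alpha captured as a host float when the runner is built
    runner = CuteDSLNVFP4BlackwellLinear(alpha, output_dtype)
    out = runner(inputs=[a, b, a_sf, b_sf], tactic=best_tactic)

    # After: alpha travels as a device tensor with the other inputs
    runner = CuteDSLNVFP4BlackwellLinear(output_dtype)
    out = runner(inputs=[a, b, a_sf, b_sf, alpha_tensor], tactic=best_tactic)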

tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py

Lines changed: 32 additions & 27 deletions
@@ -6,8 +6,8 @@
 from tensorrt_llm._utils import get_sm_version
 from tensorrt_llm.math_utils import pad_up

-from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
-                         OptimizationProfile, TunableRunner, TuningConfig)
+from ..autotuner import (ConstraintSpec, DynamicTensorSpec, OptimizationProfile,
+                         TunableRunner, TuningConfig)
 from ..cute_dsl_utils import IS_CUTLASS_DSL_AVAILABLE
 from ..utils import (fp4_scale_infer_shape,
                      get_last_power_of_2_num_tokens_buckets,
@@ -38,16 +38,21 @@ class CuteDSLNVFP4BlackwellLinear(TunableRunner):
         constraint_specs=(ConstraintSpec(2, 0, fp4_scale_infer_shape), ),
     )

-    def __init__(self, alpha: float, output_dtype: torch.dtype):
+    def __init__(self, output_dtype: torch.dtype):
         super().__init__()
-        self.alpha = alpha
+
+        # Validate output dtype (use proper exception instead of assert)
+        if output_dtype != torch.bfloat16:
+            raise ValueError(
+                f"CuteDSL NVFP4 only supports bfloat16 output, got {output_dtype}"
+            )
         self.output_dtype = output_dtype
-        assert output_dtype == torch.bfloat16

+        # Validate SM version at initialization
         if get_sm_version() != 100:
             raise ValueError(
-                f"SM version {get_sm_version()} is not supported for CuteDSLNVFP4BlackwellLinear, it only supports SM 100"
-            )
+                f"SM version {get_sm_version()} is not supported. "
+                f"CuteDSL NVFP4 requires SM 100 (Blackwell).")

     # rewrite the hash function because the value of self.alpha doesn't affect the tactic.
     def __hash__(self):
@@ -147,6 +152,7 @@ def forward(
         self,
         inputs: List[torch.Tensor],
         tactic,
+        **kwargs,
     ) -> torch.Tensor:
         """
         Performs fp8 blockwise gemm operation using CuTe DSL.
@@ -158,7 +164,6 @@
             inputs[2]: Input scale tensor of shape (k//16, m), dtype: fp8.
             inputs[3]: Weight scale tensor of shape (n, k//16), dtype: fp8.
             inputs[4]: Alpha scaling factor. dtype: float32.
-            inputs[5]: Output dtype, expected to be torch.bfloat16.
             tactic: Tiling and cluster strategy, typically a tuple (mma_tiler_mn, cluster_shape_mn).

         Returns:
@@ -176,7 +181,7 @@
             False,
         ]

-        a_tensor, b_tensor, a_sf_tensor, b_sf_tensor = inputs
+        a_tensor, b_tensor, a_sf_tensor, b_sf_tensor, alpha_tensor = inputs
         m, k, n = a_tensor.shape[0], a_tensor.shape[1], b_tensor.shape[0]
         c_tensor = torch.empty(*(m, n),
                                dtype=self.output_dtype,
@@ -204,6 +209,9 @@
             b_sf_tensor, cutlass.Float8E4M3FN, 16)
         c_ptr = self.make_cute_dsl_global_pointer(c_tensor,
                                                   cutlass.BFloat16, 16)
+        # Create pointer to alpha on device
+        alpha_ptr = self.make_cute_dsl_global_pointer(
+            alpha_tensor, cutlass.Float32, 4)

         # get stream
        torch_stream = torch.cuda.current_stream()
@@ -260,7 +268,7 @@
                 kernel_a_sf_ptr,
                 kernel_b_sf_ptr,
                 c_ptr,
-                self.alpha,
+                alpha_ptr,  # Pass alpha as device pointer
                 max_active_clusters,
                 stream,
                 swap_ab,
@@ -285,7 +293,7 @@
                 kernel_a_sf_ptr,
                 kernel_b_sf_ptr,
                 c_ptr,
-                self.alpha,
+                alpha_ptr,  # Pass alpha as device pointer
                 stream,
             )

@@ -302,34 +310,31 @@ def cute_dsl_nvfp4_gemm_blackwell(
     weight: torch.Tensor,
     input_scale: torch.Tensor,
     weight_scale: torch.Tensor,
-    alpha: float,
+    alpha: torch.Tensor,
     output_dtype: torch.dtype,
 ) -> torch.Tensor:
     """CuteDSL-based NVFP4 GEMM optimized for Blackwell.

-    .. deprecated::
-        Use :func:`torch.ops.trtllm.nvfp4_gemm_unified` instead for automatic
-        backend selection among CUTLASS, cuBLASLt, and CuteDSL based on
-        performance profiling.
+    Note:
+        This function is primarily used internally by nvfp4_gemm_unified.
+        Direct usage is discouraged. Consider using nvfp4_gemm_unified instead
+        for automatic backend selection with better performance.
     """
-    from tensorrt_llm.logger import logger
-    logger.warning_once(
-        "cute_dsl_nvfp4_gemm_blackwell is deprecated. Use nvfp4_gemm_unified instead "
-        "for automatic backend selection with better performance.",
-        key="cute_dsl_nvfp4_gemm_blackwell_deprecated")
+    from tensorrt_llm._torch.autotuner import AutoTuner

     tuner = AutoTuner.get()

-    cute_dsl_nvfp4_gemm_blackwell_runner = CuteDSLNVFP4BlackwellLinear(
-        alpha, output_dtype)
+    runner = CuteDSLNVFP4BlackwellLinear(output_dtype)
+
     _, best_tactic = tuner.choose_one(
         "trtllm::cute_dsl_nvfp4_gemm_blackwell",
-        [cute_dsl_nvfp4_gemm_blackwell_runner],
+        [runner],
         CuteDSLNVFP4BlackwellLinear.tuning_config,
         [input, weight, input_scale, weight_scale],
     )
-    return cute_dsl_nvfp4_gemm_blackwell_runner(
-        inputs=[input, weight, input_scale, weight_scale],
+
+    return runner(
+        inputs=[input, weight, input_scale, weight_scale, alpha],
         tactic=best_tactic,
     )

@@ -339,7 +344,7 @@ def _(
     mat_b: torch.Tensor,
     input_scale: torch.Tensor,
     weight_scale: torch.Tensor,
-    alpha: float,
+    alpha: torch.Tensor,  # Match custom op signature
     output_dtype: torch.dtype,
 ):
     # [m, k]
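
From the caller's side, the visible change is that `alpha` must now arrive as a one-element float32 tensor already resident on the GPU. A hedged sketch of the new calling convention (the operand bindings are placeholders; the real op expects packed NVFP4 inputs and FP8 scale factors with the shapes and dtypes documented in forward() above):

    import torch

    # Placeholder operands, produced by an upstream NVFP4 quantization step.
    input = weight = input_scale = weight_scale = ...  # placeholders

    # alpha lives on the device, which avoids a CPU-GPU sync when the scale
    # is itself produced on the GPU.
    alpha = torch.tensor([0.0125], dtype=torch.float32, device="cuda")

    out = cute_dsl_nvfp4_gemm_blackwell(input, weight, input_scale,
                                        weight_scale, alpha,
                                        output_dtype=torch.bfloat16)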

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 8 additions & 14 deletions
@@ -560,14 +560,11 @@ def nvfp4_gemm_cublaslt(
 ) -> torch.Tensor:
     """cuBLASLt-based NVFP4 GEMM with heuristic-based auto-tuning.

-    .. deprecated::
-        Use :func:`nvfp4_gemm_unified` instead for automatic backend selection
-        among CUTLASS, cuBLASLt, and CuteDSL based on performance profiling.
+    Note:
+        This function is primarily used internally by nvfp4_gemm_unified.
+        Direct usage is discouraged. Consider using nvfp4_gemm_unified instead
+        for automatic backend selection with better performance.
     """
-    logger.warning_once(
-        "nvfp4_gemm_cublaslt is deprecated. Use nvfp4_gemm_unified instead "
-        "for automatic backend selection with better performance.",
-        key="nvfp4_gemm_cublaslt_deprecated")
     tuner = AutoTuner.get()

     # Use CublasLt runner with heuristic-based tuning
@@ -616,14 +613,11 @@ def nvfp4_gemm(
 ) -> torch.Tensor:
     """CUTLASS-based NVFP4 GEMM with auto-tuning.

-    .. deprecated::
-        Use :func:`nvfp4_gemm_unified` instead for automatic backend selection
-        among CUTLASS, cuBLASLt, and CuteDSL based on performance profiling.
+    Note:
+        This function is primarily used internally by nvfp4_gemm_unified.
+        Direct usage is discouraged. Consider using nvfp4_gemm_unified instead
+        for automatic backend selection with better performance.
     """
-    logger.warning_once(
-        "nvfp4_gemm is deprecated. Use nvfp4_gemm_unified instead "
-        "for automatic backend selection with better performance.",
-        key="nvfp4_gemm_deprecated")
     tuner = AutoTuner.get()

     # Use Cutlass runner with predefined configs

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py

Lines changed: 33 additions & 29 deletions
@@ -300,7 +300,7 @@ def __call__(
         sfa_tensor: cute.Tensor,
         sfb_tensor: cute.Tensor,
         c_tensor: cute.Tensor,
-        alpha: cutlass.Float32,
+        alpha: cute.Pointer,  # Changed from cutlass.Float32 to device pointer
         max_active_clusters: cutlass.Constexpr,
         stream: cuda.CUstream,
         epilogue_op: cutlass.Constexpr = lambda x: x,
@@ -548,34 +548,37 @@ class SharedStorage:
     # GPU device kernel
     @cute.kernel
     def kernel(
-        self,
-        tiled_mma: cute.TiledMma,
-        tiled_mma_sfb: cute.TiledMma,
-        tma_atom_a: cute.CopyAtom,
-        mA_mkl: cute.Tensor,
-        tma_atom_b: cute.CopyAtom,
-        mB_nkl: cute.Tensor,
-        tma_atom_sfa: cute.CopyAtom,
-        mSFA_mkl: cute.Tensor,
-        tma_atom_sfb: cute.CopyAtom,
-        mSFB_nkl: cute.Tensor,
-        tma_atom_c: Optional[cute.CopyAtom],
-        mC_mnl: cute.Tensor,
-        cluster_layout_vmnk: cute.Layout,
-        cluster_layout_sfb_vmnk: cute.Layout,
-        a_smem_layout_staged: cute.ComposedLayout,
-        b_smem_layout_staged: cute.ComposedLayout,
-        sfa_smem_layout_staged: cute.Layout,
-        sfb_smem_layout_staged: cute.Layout,
-        c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None],
-        epi_tile: cute.Tile,
-        tile_sched_params: utils.PersistentTileSchedulerParams,
-        epilogue_op: cutlass.Constexpr,
-        alpha: cutlass.Float32,
+            self,
+            tiled_mma: cute.TiledMma,
+            tiled_mma_sfb: cute.TiledMma,
+            tma_atom_a: cute.CopyAtom,
+            mA_mkl: cute.Tensor,
+            tma_atom_b: cute.CopyAtom,
+            mB_nkl: cute.Tensor,
+            tma_atom_sfa: cute.CopyAtom,
+            mSFA_mkl: cute.Tensor,
+            tma_atom_sfb: cute.CopyAtom,
+            mSFB_nkl: cute.Tensor,
+            tma_atom_c: Optional[cute.CopyAtom],
+            mC_mnl: cute.Tensor,
+            cluster_layout_vmnk: cute.Layout,
+            cluster_layout_sfb_vmnk: cute.Layout,
+            a_smem_layout_staged: cute.ComposedLayout,
+            b_smem_layout_staged: cute.ComposedLayout,
+            sfa_smem_layout_staged: cute.Layout,
+            sfb_smem_layout_staged: cute.Layout,
+            c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None],
+            epi_tile: cute.Tile,
+            tile_sched_params: utils.PersistentTileSchedulerParams,
+            epilogue_op: cutlass.Constexpr,
+            alpha: cute.Pointer,  # Changed from cutlass.Float32 to device pointer
     ):
         """
         GPU device kernel performing the Persistent batched GEMM computation.
         """
+        alpha_value = alpha.load().to(self.c_dtype)
+
         warp_idx = cute.arch.warp_idx()
         warp_idx = cute.arch.make_warp_uniform(warp_idx)

@@ -1248,6 +1251,7 @@ def kernel(
             #
             subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3])
             num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt
+
             for subtile_idx in cutlass.range(subtile_cnt):
                 #
                 # Load accumulator from tensor memory buffer to register
12591263
# Convert to C type
12601264
#
12611265
acc_vec = tiled_copy_r2s.retile(tTR_rAcc).load()
1262-
acc_vec = epilogue_op(
1263-
alpha.to(self.c_dtype) * acc_vec.to(self.c_dtype))
1266+
acc_vec = epilogue_op(alpha_value *
1267+
acc_vec.to(self.c_dtype))
12641268
tRS_rC.store(acc_vec)
12651269

12661270
#
@@ -1940,7 +1944,7 @@ def __call__(
         a_sf_ptr: cute.Pointer,
         b_sf_ptr: cute.Pointer,
         c_ptr: cute.Pointer,
-        alpha: cutlass.Float32,
+        alpha: cute.Pointer,  # Changed from cutlass.Float32 to device pointer
         max_active_clusters: cutlass.Constexpr,
         current_stream: cuda.CUstream,
         swap_ab: cutlass.Constexpr = False,
@@ -1961,7 +1965,7 @@
             a_sf_ptr (cute.Pointer): Pointer to the scale factor tensor for A.
             b_sf_ptr (cute.Pointer): Pointer to the scale factor tensor for B.
             c_ptr (cute.Pointer): Pointer to the C tensor.
-            alpha (cutlass.Float32): Scaling factor for the GEMM output.
+            alpha (cute.Pointer): Pointer to alpha scaling factor on device (avoids CPU-GPU sync).
             max_active_clusters (cutlass.Constexpr): Maximum number of active
                 clusters.
             current_stream (cuda.CUstream): CUDA stream for the operation.
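
Taken together, the pointer plumbing is symmetric: the host wraps the one-element alpha tensor as a 4-byte-aligned float32 global-memory pointer, and the kernel dereferences it once before the epilogue loop. A condensed view of the two sides, drawn from the hunks above (helper and variable names as in this commit):

    # Host side (cute_dsl_custom_ops.py): wrap the torch tensor as a
    # float32 global-memory pointer with 4-byte alignment.
    alpha_ptr = self.make_cute_dsl_global_pointer(alpha_tensor,
                                                  cutlass.Float32, 4)

    # Device side (kernel): a single global-memory read of the scalar,
    # which is then reused for every epilogue subtile.
    alpha_value = alpha.load().to(self.c_dtype)
    acc_vec = epilogue_op(alpha_value * acc_vec.to(self.c_dtype))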
