
Commit 32e7deb

refactor and fix bug
Signed-off-by: Shijie Wang <[email protected]>
1 parent f2b255e commit 32e7deb

4 files changed: +301 -215 lines changed


tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py

Lines changed: 70 additions & 10 deletions
@@ -3,6 +3,8 @@
 
 import torch
 
+from tensorrt_llm.logger import logger
+
 from ..._utils import get_sm_version
 from ...math_utils import pad_up
 from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
@@ -80,11 +82,48 @@ def get_valid_tactics(
         real_k = k * 2
         batch_size = 1
         sf_vec_size = 16
-        # m,k
+
+        # Fixed layout for FP4: A and B are always K-major
         a_major = "k"
-        # n, k
         b_major = "k"
 
+        # Data types
+        ab_dtype = cutlass.Float4E2M1FN
+        c_dtype = cutlass.BFloat16
+
+        # Early exit: Check K dimension alignment
+        # For K-major layout (A and B tensors), K is the major mode (contiguous dimension).
+        # 16-byte alignment requirement: K must be divisible by 32 for FP4 (128 bits / 4 bits = 32)
+        if real_k % 32 != 0:
+            logger.debug(
+                f"CuteDSL: K={real_k} does not meet 16-byte alignment requirement "
+                f"(K%32={real_k%32}, expected 0). Skipping all tactics.")
+            return []
+
+        # Optimize swap_ab candidates based on M and N alignment
+        # swap_ab=False → C is N-major → requires N%8==0 (BF16: 128 bits / 16 bits = 8)
+        # swap_ab=True → C is M-major → requires M%8==0
+        m_aligned = (m % 8 == 0)
+        n_aligned = (n % 8 == 0)
+
+        if not m_aligned and not n_aligned:
+            logger.debug(
+                f"CuteDSL: Neither M={m} nor N={n} meets 16-byte alignment "
+                f"(M%8={m%8}, N%8={n%8}). No valid C layout. Skipping all tactics."
+            )
+            return []
+
+        # Only test swap_ab values that satisfy alignment
+        swap_ab_candidates = []
+        if n_aligned:
+            swap_ab_candidates.append(False)  # N-major layout
+        if m_aligned:
+            swap_ab_candidates.append(True)  # M-major layout
+
+        logger.debug(
+            f"CuteDSL: M={m}(aligned={m_aligned}), N={n}(aligned={n_aligned}), K={real_k}(aligned=True). "
+            f"Testing swap_ab={swap_ab_candidates}")
+
         # full shamoo
         mma_tiler_mn_candidates = [
             (256, 128),
@@ -105,7 +144,6 @@ def get_valid_tactics(
             (4, 2),
             (4, 4),
         ]
-        swap_ab_candidates = [True, False]
 
         valid_tactics = []
         for swap_ab in swap_ab_candidates:
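For reference, the early-exit checks added in the two hunks above all follow from the same 16-byte (128-bit) alignment requirement on the contiguous (major) dimension: FP4 elements are 4 bits, so the K-major A/B operands need K divisible by 128/4 = 32, while BF16 elements are 16 bits, so the C output needs its major dimension (N for swap_ab=False, M for swap_ab=True) divisible by 128/16 = 8. A minimal standalone sketch of that rule; min_contiguous_elems and cutedsl_shape_ok are illustrative helpers, not TensorRT-LLM APIs:

# Illustrative sketch only: these helpers are not part of the codebase.
def min_contiguous_elems(dtype_bits: int, alignment_bytes: int = 16) -> int:
    """Elements needed so the contiguous (major) dimension is 16-byte aligned."""
    return alignment_bytes * 8 // dtype_bits


FP4_BITS, BF16_BITS = 4, 16


def cutedsl_shape_ok(m: int, n: int, k: int) -> bool:
    # A/B are K-major FP4: K must be a multiple of 128 / 4 = 32.
    if k % min_contiguous_elems(FP4_BITS) != 0:
        return False
    # C is BF16: at least one of the N-major (swap_ab=False) or M-major
    # (swap_ab=True) layouts must be aligned, i.e. a multiple of 128 / 16 = 8.
    return (n % min_contiguous_elems(BF16_BITS) == 0
            or m % min_contiguous_elems(BF16_BITS) == 0)


print(cutedsl_shape_ok(m=128, n=2880, k=7168))  # True
print(cutedsl_shape_ok(m=3, n=5, k=7168))       # False: neither M nor N is a multiple of 8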
@@ -120,11 +158,12 @@ def get_valid_tactics(
                         kernel_m = m
                         kernel_n = n
 
+                    # Use can_implement to check all constraints
                     if Sm100BlockScaledPersistentDenseGemmKernel.can_implement(
-                            cutlass.Float4E2M1FN,  # ab_dtype,
+                            ab_dtype,
                             cutlass.Float8E4M3FN,  # sf_dtype
-                            sf_vec_size,  # sf_vec_size,
-                            cutlass.BFloat16,  # c_dtype,
+                            sf_vec_size,
+                            c_dtype,
                             mma_tiler_mn,
                             cluster_shape_mn,
                             kernel_m,
@@ -138,6 +177,9 @@ def get_valid_tactics(
                         valid_tactics.append(
                             (mma_tiler_mn, cluster_shape_mn, swap_ab))
 
+        logger.debug(
+            f"CuteDSL: Found {len(valid_tactics)} valid tactics for M={m}, N={n}, K={real_k}"
+        )
         return valid_tactics
 
     def make_cute_dsl_global_pointer(self, tensor: torch.Tensor, dtype,
@@ -196,9 +238,27 @@ def forward(
         sf_k = pad_up(real_k // sf_vec_size, 4)
         sf_n = pad_up(n, 128)
 
-        # the scaling tensor is 1D. we need to make sure it has been padded to the correct shape
-        assert a_sf_tensor.shape == (sf_m * sf_k, )
-        assert b_sf_tensor.shape == (sf_n * sf_k, )
+        # Reshape scale factors to CuteDSL's expected format
+        # Input format (from CUTLASS/cuBLASLt): (m*k//16,) and (n*k//16,)
+        # CuteDSL format: (sf_m*sf_k,) and (sf_n*sf_k,)
+        # Note: This is just a view change, no memory copy
+        expected_a_sf_size = sf_m * sf_k
+        expected_b_sf_size = sf_n * sf_k
+
+        if a_sf_tensor.numel() != expected_a_sf_size:
+            raise ValueError(
+                f"CuteDSL: act scale factor size mismatch. "
+                f"Expected {expected_a_sf_size} (sf_m={sf_m} * sf_k={sf_k}), "
+                f"got {a_sf_tensor.numel()} for shape M={m}, K={real_k}")
+        if b_sf_tensor.numel() != expected_b_sf_size:
+            raise ValueError(
+                f"CuteDSL: weight scale factor size mismatch. "
+                f"Expected {expected_b_sf_size} (sf_n={sf_n} * sf_k={sf_k}), "
+                f"got {b_sf_tensor.numel()} for shape N={n}, K={real_k}")
+
+        # Reshape to CuteDSL's expected format (just a view, no copy)
+        a_sf_tensor = a_sf_tensor.reshape(sf_m * sf_k)
+        b_sf_tensor = b_sf_tensor.reshape(sf_n * sf_k)
 
         a_ptr = self.make_cute_dsl_global_pointer(a_tensor,
                                                   cutlass.Float4E2M1FN, 32)
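To make the new size check concrete: with sf_vec_size=16 there is one FP8 scale per 16 elements along K, sf_k pads that count to a multiple of 4, and sf_n pads N to a multiple of 128, so the expected flat sizes fall out of pad_up directly. A small worked example, assuming pad_up(x, y) rounds x up to the nearest multiple of y; sf_m is computed earlier in forward, outside this hunk, so the value used below is only an illustrative assumption:

# Worked example of the expected scale-factor sizes validated above.
# Assumes pad_up(x, y) rounds x up to the nearest multiple of y.
def pad_up(x: int, y: int) -> int:
    return (x + y - 1) // y * y


m, n, real_k = 128, 2880, 7168
sf_vec_size = 16

sf_k = pad_up(real_k // sf_vec_size, 4)  # 7168 // 16 = 448, already a multiple of 4
sf_n = pad_up(n, 128)                    # 2880 -> 2944
sf_m = pad_up(m, 128)                    # illustrative assumption for this example

print(sf_k, sf_n, sf_m)                              # 448 2944 128
print("expected act scale numel:", sf_m * sf_k)      # 128 * 448 = 57344
print("expected weight scale numel:", sf_n * sf_k)   # 2944 * 448 = 1318912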
@@ -328,7 +388,7 @@ def cute_dsl_nvfp4_gemm_blackwell(
         "trtllm::cute_dsl_nvfp4_gemm_blackwell",
         [runner],
         CuteDSLNVFP4BlackwellLinear.tuning_config,
-        [input, weight, input_scale, weight_scale],
+        [input, weight, input_scale, weight_scale, alpha],
     )
 
     return runner(

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 55 additions & 3 deletions
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import List, Mapping, Optional, Tuple
+from typing import List, Mapping, Optional, Tuple, Union
 
 import torch
 import triton  # type: ignore[import]
@@ -707,7 +707,29 @@ def get_valid_tactics(self,
         # Add CuteDSL runner if available
         if backend in ["auto", "cutedsl"]:
             if IS_CUTLASS_DSL_AVAILABLE:
-                tactics.append("cutedsl")
+                # Check if CuteDSL actually supports the current shape
+                from tensorrt_llm._torch.custom_ops.cute_dsl_custom_ops import \
+                    CuteDSLNVFP4BlackwellLinear
+                cutedsl_runner = CuteDSLNVFP4BlackwellLinear(self.output_dtype)
+                cutedsl_tactics = cutedsl_runner.get_valid_tactics(
+                    inputs, profile)
+
+                if cutedsl_tactics:
+                    # CuteDSL supports this shape
+                    tactics.append("cutedsl")
+                elif backend == "cutedsl":
+                    # Explicitly requested CuteDSL but it doesn't support this shape
+                    m, n, k = inputs[0].shape[0], inputs[1].shape[
+                        0], inputs[0].shape[1] * 2
+                    raise ValueError(
+                        f"CuteDSL backend does not support the current shape:\n"
+                        f"  M={m}, N={n}, K={k}\n"
+                        f"CuteDSL requires 16-byte alignment for major (contiguous) dimensions:\n"
+                        f"  - K must be divisible by 32 (FP4 K-major layout): K%32={'0✓' if k % 32 == 0 else str(k%32)+'✗'}\n"
+                        f"  - Or the combination of (M, N, K, tiling, cluster shape) is not supported\n"
+                        f"Please use backend='auto' to automatically select a compatible backend."
+                    )
+                # else: backend='auto' and CuteDSL doesn't support → silently skip
             elif backend == "cutedsl":
                 raise ValueError(
                     "CuteDSL backend is not available. "
@@ -718,11 +740,40 @@ def get_valid_tactics(self,
     def forward(
         self,
         inputs: List[torch.Tensor],
-        tactic: str = "cutlass",
+        tactic: Union[
+            str, int] = "cutlass",  # str: backend name, or int: -1 for fallback
         **kwargs,
     ) -> torch.Tensor:
         act_fp4, weight, act_sf, weight_scale, alpha = inputs
 
+        # Check if a specific backend was requested
+        requested_backend = kwargs.get('backend', 'auto')
+
+        # A specific backend was requested (not 'auto') but we received the fallback tactic.
+        # This can happen on a cache miss, where AutoTuner uses tactic=-1 as the default.
+        if requested_backend != 'auto' and requested_backend != tactic and tactic == -1:
+            # The user explicitly requested a backend, but we are falling back to the default.
+            # Validate that the requested backend supports this shape.
+
+            # Get valid tactics for the requested backend
+            from tensorrt_llm._torch.autotuner import OptimizationProfile
+            valid_tactics = self.get_valid_tactics(inputs,
+                                                   OptimizationProfile(),
+                                                   backend=requested_backend)
+
+            if not valid_tactics or requested_backend not in valid_tactics:
+                # Requested backend doesn't support this shape
+                m, n, k = inputs[0].shape[0], inputs[1].shape[
+                    0], inputs[0].shape[1] * 2
+                raise ValueError(
+                    f"Backend '{requested_backend}' was explicitly requested but does not support the current shape:\n"
+                    f"  M={m}, N={n}, K={k}\n"
+                    f"Please use backend='auto' to automatically select a compatible backend."
+                )
+
+            # The backend supports this shape; use it instead of the fallback
+            tactic = requested_backend
+
         if tactic == "cuda_core":
             # Unswizzle the activation scale factors
             # act_sf is swizzled, need to reverse it for cuda_core_nvfp4_gemm
@@ -844,6 +895,7 @@ def nvfp4_gemm_unified(
     return runner(
         inputs=[act_fp4, weight, act_sf, weight_scale, alpha],
         tactic=best_tactic,
+        backend=backend,
     )
 
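The forward change above guards a corner case: on an AutoTuner cache miss the tactic argument arrives as the fallback value -1 even when the caller pinned a specific backend through the new backend kwarg, so the requested backend must either be re-validated and substituted for the fallback, or rejected with a clear error. A simplified, self-contained sketch of that decision flow; resolve_tactic and the callable it takes are illustrative stand-ins, not the repository's API:

# Simplified sketch of the fallback handling added to forward(); not the repo's API.
from typing import Callable, List, Union


def resolve_tactic(tactic: Union[str, int], requested_backend: str,
                   valid_tactics_fn: Callable[[], List[str]]) -> Union[str, int]:
    # Cache miss: the tuner handed back the fallback tactic (-1) although the
    # caller pinned a specific backend, so re-validate that backend here.
    if requested_backend != "auto" and tactic == -1:
        if requested_backend not in valid_tactics_fn():
            raise ValueError(
                f"Backend '{requested_backend}' was explicitly requested but "
                f"does not support the current shape; use backend='auto'.")
        return requested_backend  # honour the pinned backend instead of -1
    return tactic


print(resolve_tactic(-1, "cutedsl", lambda: ["cutlass", "cutedsl"]))  # cutedsl
print(resolve_tactic("cutlass", "auto", lambda: ["cutlass"]))         # cutlass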

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py

Lines changed: 13 additions & 7 deletions
@@ -300,7 +300,7 @@ def __call__(
         sfa_tensor: cute.Tensor,
         sfb_tensor: cute.Tensor,
         c_tensor: cute.Tensor,
-        alpha: cute.Pointer,  # Changed from cutlass.Float32 to device pointer
+        alpha: cute.Tensor,  # Single-element tensor containing alpha value
         max_active_clusters: cutlass.Constexpr,
         stream: cuda.CUstream,
         epilogue_op: cutlass.Constexpr = lambda x: x,
@@ -571,13 +571,12 @@ def kernel(
         epi_tile: cute.Tile,
         tile_sched_params: utils.PersistentTileSchedulerParams,
         epilogue_op: cutlass.Constexpr,
-        alpha: cute.
-        Pointer,  # Changed from cutlass.Float32 to device pointer
+        alpha: cute.Tensor,  # Single-element tensor containing alpha value
     ):
         """
         GPU device kernel performing the Persistent batched GEMM computation.
         """
-        alpha_value = alpha.load().to(self.c_dtype)
+        alpha_value = alpha[0].to(self.c_dtype)
 
         warp_idx = cute.arch.warp_idx()
         warp_idx = cute.arch.make_warp_uniform(warp_idx)
@@ -1944,7 +1943,8 @@ def __call__(
         a_sf_ptr: cute.Pointer,
         b_sf_ptr: cute.Pointer,
         c_ptr: cute.Pointer,
-        alpha: cute.Pointer,  # Changed from cutlass.Float32 to device pointer
+        alpha: cute.
+        Pointer,  # Device pointer to alpha, will be converted to Tensor
         max_active_clusters: cutlass.Constexpr,
         current_stream: cuda.CUstream,
         swap_ab: cutlass.Constexpr = False,
@@ -1965,7 +1965,7 @@ def __call__(
             a_sf_ptr (cute.Pointer): Pointer to the scale factor tensor for A.
             b_sf_ptr (cute.Pointer): Pointer to the scale factor tensor for B.
             c_ptr (cute.Pointer): Pointer to the C tensor.
-            alpha (cute.Pointer): Pointer to alpha scaling factor on device (avoids CPU-GPU sync).
+            alpha (cute.Pointer): Device pointer to alpha scaling factor (converted to Tensor internally).
             max_active_clusters (cutlass.Constexpr): Maximum number of active
                 clusters.
             current_stream (cuda.CUstream): CUDA stream for the operation.
@@ -2011,11 +2011,17 @@ def __call__(
                 order=(2, 1, 4, 0, 3, 5),
             ))
 
+        # Convert alpha pointer to a single-element cute.Tensor for easier kernel usage
+        # Create a 1D layout with a single element
+        alpha_tensor = cute.make_tensor(alpha,
+                                        layout=cute.make_ordered_layout(
+                                            (1, ), order=(0, )))
+
         Sm100BlockScaledPersistentDenseGemmKernel(
             self.sf_vec_size,
             self.mma_tiler_mn,
             self.cluster_shape_mn,
-        )(a_tensor, b_tensor, sfa_tensor, sfb_tensor, c_tensor, alpha,
+        )(a_tensor, b_tensor, sfa_tensor, sfb_tensor, c_tensor, alpha_tensor,
           max_active_clusters, current_stream, epilogue_op)
 
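The net effect of this file's changes is that alpha stays on the device end to end: the host-side __call__ wraps the device pointer in a single-element cute.Tensor, and the kernel reads alpha[0] instead of pointer.load(), so the scale is never copied back to the host. The same idea in plain PyTorch terms, as a rough analogy rather than the CuTe DSL code itself:

# PyTorch analogy only: keeping alpha as a 1-element CUDA tensor avoids the
# host-device synchronization that alpha.item() (or a Python float) would force.
import torch


def scaled_matmul(a: torch.Tensor, b: torch.Tensor,
                  alpha: torch.Tensor) -> torch.Tensor:
    # alpha remains a (1,) device tensor; the multiply broadcasts on the GPU,
    # so the host never blocks to read the scale value.
    return torch.matmul(a, b) * alpha


if torch.cuda.is_available():
    a = torch.randn(128, 64, device="cuda", dtype=torch.bfloat16)
    b = torch.randn(64, 256, device="cuda", dtype=torch.bfloat16)
    alpha = torch.full((1, ), 0.5, device="cuda", dtype=torch.bfloat16)
    out = scaled_matmul(a, b, alpha)  # launches asynchronously, no sync point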
