Resolve review comments on NVFP4 GEMM backend selection

Wong4j · Wong4j · commit 205d297bba85 · 2025-12-01T06:41:26.000Z
Signed-off-by: Shijie Wang <jaywan@nvidia.com>
diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
@@ -50,8 +50,6 @@ def __init__(self,
                      to_userbuffers: bool = False):
             super().__init__()
 
-
-
             if output_dtype != torch.bfloat16:
                 raise ValueError(
                     f"CuteDSL NVFP4 only supports bfloat16 output, got {output_dtype}"
@@ -242,7 +240,7 @@ def forward(
 
             # Allocate output tensor from UserBuffers or regular CUDA memory
             if self.to_userbuffers:
-                c_tensor, _ = torch.ops.trtllm.create_userbuffers_tensor(
+                c_tensor = torch.ops.trtllm.create_userbuffers_tensor(
                     [m, n], self.output_dtype)
             else:
                 c_tensor = torch.empty(*(m, n),
diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py
@@ -914,9 +914,6 @@ def apply(self, module: Linear, input: torch.Tensor,
             act_fp4, act_sf = torch.ops.trtllm.fp4_quantize(
                 input, module.input_scale, module.scaling_vector_size, False)
 
-        # Backend selection: 'auto' (default) | 'cutlass' | 'cublaslt' | 'cutedsl'
-        backend = getattr(module, 'nvfp4_backend', 'auto')
-
         # Use unified interface - supports CUTLASS, cuBLASLt, CuteDSL
         output = torch.ops.trtllm.nvfp4_gemm(act_fp4,
                                              module.weight,
@@ -925,7 +922,7 @@ def apply(self, module: Linear, input: torch.Tensor,
                                              module.alpha,
                                              module.dtype,
                                              to_userbuffers=False,
-                                             backend=backend)
+                                             backend=module.nvfp4_backend)
         # Take the dim of out_features if padded. Make sure the output is contiguous
         if output.shape[-1] > module.out_features:
             output = output[..., :module.out_features].contiguous()
@@ -2000,6 +1997,12 @@ def __init__(
         fused_weight_shard_indices_mapping: Optional[dict] = None,
         nvfp4_backend: str = "auto",
     ):
+        """
+        Args:
+            nvfp4_backend: Backend selection for NVFP4 GEMM operations.
+                Supported values: "auto", "cutlass", "cublaslt", "cutedsl".
+                Default is "auto" which automatically selects the best backend.
+        """
         from ..distributed import AllReduce
 
         super().__init__()