@@ -99,7 +99,7 @@ class TuningConfig:
     constraint_specs: Tuple[ConstraintSpec, ...] = ()
     tune_max_num_tokens: int = None
     inputs_pre_hook: Callable = None
-    use_cuda_graph: bool = False
+    use_cuda_graph: bool = True


 @dataclass(unsafe_hash=True)
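Note on this hunk: with the default flipped, any TuningConfig that leaves the field unset now profiles through CUDA-graph capture and replay. A minimal caller-side sketch (only the field name comes from the hunk; the surrounding usage is illustrative):

    config = TuningConfig()                            # now defaults to use_cuda_graph=True
    eager_config = TuningConfig(use_cuda_graph=False)  # explicit opt-out for kernels
                                                       # that cannot be graph-captured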
@@ -526,7 +526,7 @@ class AutoTuner:
     _CUDA_GRAPH_DELAY_MICRO_SECS = 100
     _instance = None

-    def __init__(self, warmup=3, repeat=10, stream_delay_micro_secs=1000):
+    def __init__(self, warmup=2, repeat=10, stream_delay_micro_secs=1000):
         self.repeat = repeat
         self.warmup = warmup
         self.stream_delay_micro_secs = stream_delay_micro_secs
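Note on this hunk: warmup drops from 3 to 2 untimed iterations per (runner, tactic) candidate. A back-of-envelope sketch of the launch count, assuming one full profiling pass per candidate (the helper name is hypothetical):

    def profiling_launches(num_candidates: int, warmup: int = 2, repeat: int = 10) -> int:
        # Each candidate pays warmup + repeat launches: 12 now, versus 13 with warmup=3.
        return num_candidates * (warmup + repeat)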
@@ -698,23 +698,25 @@ def choose_one(
             })

         input_shapes = tuple(self._get_input_sizes(inputs))
+        is_cache_hit, best_runner_id, best_tactic, min_time = self.profiling_cache.search_cache(
+            custom_op, runners, input_shapes, tuning_config)
+
         # Early return if it's not tuning, use cache found one or fallback one
         if not self.is_tuning_mode:
-            is_cache_hit, best_runner_id, best_tactic, min_time = self.profiling_cache.search_cache(
-                custom_op, runners, input_shapes, tuning_config)
             best_runner = runners[best_runner_id]
             # TODO: check the stored runner and tactic can implement this shape here
-            # Should not directly try (runner, tactic) here, or it will hurt a lot of inference perf.
-
-            # Record the cache miss config.
-            # Expect no cache miss in inference. Thus, any cache miss should be recorded.
+            # Log the cache miss. Expect no cache miss in inference.
             if not is_cache_hit:
                 logger.warning_once(
                     f"[AutoTuner] Using the fallback tactic due to a cache miss on input shapes={input_shapes}",
                     key=(custom_op, "warning_autotuning_cache_miss_fallback"))

             return (best_runner, best_tactic)

+        # In tuning mode with a cache hit, return the cached runner and tactic to avoid redundant profiling.
+        if self.is_tuning_mode and is_cache_hit:
+            return (runners[best_runner_id], best_tactic)
+
         assert len(runners) > 0, "At least one runner is required"
         assert all([isinstance(r, TunableRunner) for r in runners]), \
             "All given runners must be subclasses of TunableRunner"
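Note on this hunk: the cache lookup is hoisted above the mode check, so both branches share a single search_cache call, and tuning mode now short-circuits on a hit. A condensed sketch of the resulting control flow (profiling, warning, and miss-fallback details elided):

    hit, rid, tactic, _ = self.profiling_cache.search_cache(
        custom_op, runners, input_shapes, tuning_config)
    if not self.is_tuning_mode:    # inference: never profile; warn and fall back on a miss
        return runners[rid], tactic
    if hit:                        # tuning mode with a hit: skip redundant profiling
        return runners[rid], tactic
    # tuning mode with a miss: fall through to full profiling of all (runner, tactic) pairs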
@@ -881,43 +883,62 @@ def _profile_single_kernel(
         are used to ensure accurate timing.
         """
         stream = torch.cuda.current_stream()
-        graph = torch.cuda.CUDAGraph()
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-
-        with torch.cuda.stream(stream):
-            # warm up, no timing
-            for _ in range(self.warmup):
-                runner(inputs, tactic=tactic, **kwargs)
-
-            if use_cuda_graph:
-                with torch.cuda.graph(graph):
-                    for _ in range(self.repeat):
-                        runner(inputs, tactic=tactic, **kwargs)
+        # If the few-repeat average time exceeds short_profile_threshold_ms, use that estimate directly.
+        profile_fewer_repeat = 2
+        short_profile_threshold_ms = 1
+
+        avg_time = float('inf')
+
+        def pure_profile(stream: torch.cuda.Stream, repeat: int):
+            start = torch.cuda.Event(enable_timing=True)
+            end = torch.cuda.Event(enable_timing=True)
+            graph = torch.cuda.CUDAGraph()
+
+            with torch.cuda.stream(stream):
+                if use_cuda_graph:
+                    with torch.cuda.graph(graph):
+                        for _ in range(repeat):
+                            runner(inputs, tactic=tactic, **kwargs)
+
+                stream.synchronize()
+
+                # Delay the profiled kernel launch to eliminate the effects of host-side overhead in profiling.
+                # TODO: This is build-time sensitive, O(tactic_num * impl_num * num_profile * tunable_ops).
+                # Consider applying a pre-profiling pass to estimate the kernel execution time, then decide the necessity.
+                if use_cuda_graph:
+                    delay_kernel(self._CUDA_GRAPH_DELAY_MICRO_SECS, stream)
+                else:
+                    delay_kernel(self.stream_delay_micro_secs, stream)

-            stream.synchronize()
+                start.record()

-            # Delay the profiled kernel launch to eliminate affects of host time overhead in profiling.
-            # TODO: This is build time sensitive, O(tactic_num * impl_num * num_profile * tunable_ops)
-            # Consider apply a preprofiling to estimate the kernel execution time, then decide the necessity.
-            if use_cuda_graph:
-                delay_kernel(self._CUDA_GRAPH_DELAY_MICRO_SECS, stream)
-            else:
-                delay_kernel(self.stream_delay_micro_secs, stream)
+                if use_cuda_graph:
+                    graph.replay()
+                else:
+                    for _ in range(repeat):
+                        runner(inputs, tactic=tactic, **kwargs)

-            start.record()
+                end.record()
+                stream.synchronize()

-            if use_cuda_graph:
-                graph.replay()
-            else:
-                for _ in range(self.repeat):
-                    runner(inputs, tactic=tactic, **kwargs)
+            return start.elapsed_time(end) / repeat

-            end.record()
+        for _ in range(self.warmup):
+            runner(inputs, tactic=tactic, **kwargs)

-            stream.synchronize()
+        fewer_repeat_avg_time = pure_profile(stream, profile_fewer_repeat)

-        avg_time = start.elapsed_time(end) / self.repeat
+        disable_short_profile = os.environ.get(
+            "TLLM_AUTOTUNER_DISABLE_SHORT_PROFILE", "0") == "1"
+        if fewer_repeat_avg_time > short_profile_threshold_ms and not disable_short_profile:
+            print(
+                f"[AutoTuner] The few-repeat estimated time exceeds {short_profile_threshold_ms} ms; using it directly to avoid redundant profiling."
+            )
+            # Directly use the few-repeat estimate to avoid redundant profiling.
+            avg_time = fewer_repeat_avg_time
+        else:
+            # Profile the kernel with the full repeat count to get a precise time.
+            avg_time = pure_profile(stream, self.repeat)

         shapes = self._get_input_sizes(inputs)
         logger.debug(
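Note on this hunk: profiling is now two-phase; a cheap few-repeat estimate gates the full-repeat measurement. A self-contained sketch of the heuristic using plain CUDA events (the CUDA-graph and delay-kernel paths and the runner plumbing are elided; names are hypothetical):

    import torch

    def time_kernel_ms(fn, warmup=2, few_repeat=2, full_repeat=10, threshold_ms=1.0):
        for _ in range(warmup):                      # warm up, untimed
            fn()

        def timed(repeat):
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            torch.cuda.synchronize()
            start.record()
            for _ in range(repeat):
                fn()
            end.record()
            torch.cuda.synchronize()
            return start.elapsed_time(end) / repeat  # average ms per call

        estimate = timed(few_repeat)                 # cheap estimate first
        if estimate > threshold_ms:                  # slow kernel: the estimate suffices
            return estimate
        return timed(full_repeat)                    # fast kernel: measure precisely

Setting TLLM_AUTOTUNER_DISABLE_SHORT_PROFILE=1 forces the full-repeat path for every kernel, per the environment check added in this hunk.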