pytorch · RandySheriff · Aug 25, 2025 · Aug 25, 2025
diff --git a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
@@ -149,6 +149,16 @@ def get_configs_io_bound() -> List[Config]:
     return configs
 
 
+def dummy_prune_configs(configs, named_args, **kwargs):
+    """Autotune prune hook that logs the problem shape and keeps every config."""
+    M = named_args["M"]
+    N = named_args["N"]
+    K = named_args["K"]
+    # Lazy %-args: the message is only formatted if INFO logging is enabled.
+    logger.info("len(configs)=%d for M=%r N=%r K=%r", len(configs), M, N, K)
+    return configs
+
+
 MATMUL_CONFIGS: List[Config] = [
     # basic configs for compute-bound matmuls
     Config(
@@ -176,6 +186,11 @@ def get_configs_io_bound() -> List[Config]:
         num_stages=4,
         num_warps=4,
     ),
+    Config(
+        {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 256, "SPLIT_K": 1},
+        num_stages=4,
+        num_warps=4,
+    ),
     Config(
         {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1},
         num_stages=4,
@@ -252,6 +267,9 @@ def get_configs_io_bound() -> List[Config]:
 
 @triton.autotune(
     configs=MATMUL_CONFIGS,
+    prune_configs_by={
+        "early_config_prune": dummy_prune_configs,
+    },
     key=[
         "m_key",
         "n_key",
@@ -1227,6 +1245,8 @@ def matmul_fp8_row(
     a = a.view(-1, a.size(-1))
     # View inputs into proper torch fp8 dtype.
     if torch.version.cuda:
+        # Allow TF32 tensor cores for float32 matmuls; note this is a global PyTorch setting.
+        torch.set_float32_matmul_precision("high")
         assert a.dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
     else:
         assert a.dtype in (torch.float8_e4m3fnuz, torch.float8_e5m2fnuz)