Commit 8a43734

re-submit 12911 but relax the requirement for deepgemm (#13226)
1 parent: f0021c0

1 file changed

python/sglang/srt/batch_invariant_ops/batch_invariant_ops.py

Lines changed: 15 additions & 1 deletion
@@ -241,7 +241,15 @@ def _matmul_persistent_deepgemm(
     dtype = a.dtype
     out = torch.empty((M, N), device=a.device, dtype=dtype)
 
-    deep_gemm.bf16_gemm_nn(a, b, out)
+    try:
+        deep_gemm.bf16_gemm_nn(a, b, out)
+    except RuntimeError as e:
+        raise RuntimeError(
+            f"DeepGEMM failed for matrix shapes M={M}, N={N}, K={K}. "
+            f"This typically occurs when dimensions are too small for DeepGEMM's TMA descriptors. "
+            f"Consider increasing MIN_DEEPGEMM_DIM in matmul_persistent() or disabling DeepGEMM "
+            f"for small matrices. Original error: {e}"
+        ) from e
 
     # TODO can this be put in DeepGEMM's `c`?
     if bias is not None:
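
For reference, the "raise ... from e" form chains the original DeepGEMM exception, so the low-level TMA failure stays visible as __cause__ in the traceback while the new message adds the shape context. A minimal standalone sketch of the same pattern (the failing_gemm helper is hypothetical, standing in for deep_gemm.bf16_gemm_nn):

import torch

def failing_gemm(a, b, out):
    # Hypothetical stand-in for deep_gemm.bf16_gemm_nn; assume it raises
    # RuntimeError when TMA descriptor constraints are violated.
    raise RuntimeError("TMA descriptor creation failed")

def gemm_with_context(a, b):
    M, K = a.shape
    _, N = b.shape
    out = torch.empty((M, N), device=a.device, dtype=a.dtype)
    try:
        failing_gemm(a, b, out)
    except RuntimeError as e:
        # "from e" preserves the original exception as __cause__, so the
        # shape-annotated message is added without losing the root cause.
        raise RuntimeError(f"GEMM failed for M={M}, N={N}, K={K}") from e
    return out

Calling gemm_with_context(torch.randn(4, 8), torch.randn(8, 2)) raises the shape-annotated RuntimeError with the original error chained beneath it.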
@@ -253,13 +261,19 @@ def _matmul_persistent_deepgemm(
 def matmul_persistent(
     a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None = None
 ):
+    K, N = b.shape
+
+    # DeepGEMM has minimum dimension requirements for TMA descriptors
+    MIN_DEEPGEMM_DIM = 16
+
     if (
         _ENABLE_MM_DEEPGEMM
         and ENABLE_JIT_DEEPGEMM
         and (a.dtype == torch.bfloat16)
         and (b.dtype == torch.bfloat16)
         and a.is_contiguous()
         and b.transpose(0, 1).is_contiguous()
+        and N >= MIN_DEEPGEMM_DIM
     ):
         if _ENABLE_MM_COMPARISON_TEST:
             out_triton = _matmul_persistent_triton(a=a, b=b, bias=bias)
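
The net effect of the new guard: bfloat16 inputs whose N dimension falls below MIN_DEEPGEMM_DIM now skip DeepGEMM and take the existing non-DeepGEMM path instead of failing during TMA descriptor creation. A hedged usage sketch (assumes a CUDA device, the module's DeepGEMM flags enabled, and that matmul_persistent is importable from this file; shapes are illustrative):

import torch
from sglang.srt.batch_invariant_ops.batch_invariant_ops import matmul_persistent

a = torch.randn(64, 32, device="cuda", dtype=torch.bfloat16)

# N = 8 < MIN_DEEPGEMM_DIM (16): the dispatch condition is False, so this
# call should route to the fallback path rather than raising.
b_small = torch.randn(8, 32, device="cuda", dtype=torch.bfloat16).t()
out_small = matmul_persistent(a, b_small)

# N = 128 >= 16: eligible for DeepGEMM when the other conditions hold.
# b is built as a transposed view so b.transpose(0, 1) is contiguous,
# matching the dispatch check above.
b_large = torch.randn(128, 32, device="cuda", dtype=torch.bfloat16).t()
out_large = matmul_persistent(a, b_large)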
