chore: add FP8 quantization of query, key, value, softmax, and output in the SDPA (MHA) converter

keehyuna · keehyuna · commit 62cb0f241e57 · 2025-08-26T06:55:25.000Z
diff --git a/tools/llm/torchtrt_ext/sdpa_converter.py b/tools/llm/torchtrt_ext/sdpa_converter.py
@@ -69,6 +69,7 @@ def scaled_dot_product_attention(
     is_causal = True
     # implementation as described here: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
     use_fp32_acc = kwargs.get("use_fp32_acc", False)
+    use_fp8_quantize = kwargs.get("use_fp8_quantize", True)
     query_dtype = query.dtype
 
     if scale is None:
@@ -97,6 +98,30 @@ def scaled_dot_product_attention(
             key,
             scale,
         )
+    # NOTE: hard-coded amax placeholder, for testing only; it is reused by every quantize call in this converter.
+    amax = torch.tensor([0.6562])
+    if use_fp8_quantize:
+        key = impl.quantize.quantize(
+            ctx,
+            target,
+            SourceIR.ATEN,
+            name,
+            key,
+            amax,
+            8,
+            4,
+        )
+
+        query = impl.quantize.quantize(
+            ctx,
+            target,
+            SourceIR.ATEN,
+            name,
+            query,
+            amax,
+            8,
+            4,
+        )
 
     if use_fp32_acc and query_dtype == trt.float16:
         query = cast_trt_tensor(
@@ -173,6 +198,29 @@ def scaled_dot_product_attention(
     softmax = impl.normalization.softmax(
         ctx, target, source_ir, name + "_softmax", scaled_add_attn_bias, -1, False
     )
+    if use_fp8_quantize:
+        softmax = impl.quantize.quantize(
+            ctx,
+            target,
+            SourceIR.ATEN,
+            name,
+            softmax,
+            amax,
+            8,
+            4,
+        )
+
+        value = impl.quantize.quantize(
+            ctx,
+            target,
+            SourceIR.ATEN,
+            name,
+            value,
+            amax,
+            8,
+            4,
+        )
+
     if use_fp32_acc:
         softmax = cast_trt_tensor(
             ctx, softmax, trt.float32, name + "_softmax_cast_to_fp32", target, source_ir
@@ -188,9 +236,21 @@ def scaled_dot_product_attention(
         softmax,
         value,
     )
+
     if use_fp32_acc:
         out = cast_trt_tensor(
             ctx, out, query_dtype, name + "_out_cast_to_fp16", target, source_ir
         )
+    if use_fp8_quantize:
+        out = impl.quantize.quantize(
+            ctx,
+            target,
+            SourceIR.ATEN,
+            name,
+            out,
+            amax,
+            8,
+            4,
+        )
 
     return out