
Commit 8354497

bottler authored and xFormers Bot committed
remove merge_attentions backward (fairinternal/xformers#1402)
__original_commit__ = fairinternal/xformers@601197a
1 parent 9eb546b commit 8354497

File tree

4 files changed: +87 -179 lines

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [0.0.32] - 2025-??-??
+### Removed
+- Removed autograd backward pass for merge_attentions as it is easy to use incorrectly.
 
 ## [0.0.31] - 2025-06-25
 Pre-built binary wheels are available for PyTorch 2.7.1.

tests/test_mem_eff_attention.py

Lines changed: 2 additions & 115 deletions
@@ -8,7 +8,7 @@
 import math
 import random
 from contextlib import nullcontext
-from typing import Any, List, Optional, Sequence, Tuple, Type, TypeVar, Union
+from typing import Any, List, Optional, Sequence, Tuple, Type, TypeVar
 
 import pytest
 import torch
@@ -2814,68 +2814,6 @@ def test_merge_attentions_nobias(
     assert lse is None
 
 
-@disable_on_rocm
-@sm80_or_better_only
-@pytest.mark.parametrize(
-    "op",
-    [
-        pytest.param(fmha.flash.FwOp, id="flashfwd"),
-        pytest.param((fmha.flash.FwOp, fmha.cutlass.BwOp), id="flashcutlass"),
-        # pytest.param((fmha.triton_splitk.FwOp, fmha.cutlass.BwOp), id="splitk"), # XXX
-        pytest.param(fmha.MemoryEfficientAttentionFlashAttentionOp, id="flash"),
-        None,
-    ],
-)
-def test_merge_attentions_nobias_bwd(
-    op: Union[Type[AttentionFwOpBase], fmha.AttentionOp],
-):
-    B, M, Mq, H, K = 13, 5, 5, 4, 128
-    dtype = torch.bfloat16
-    nparts = 3
-    torch.manual_seed(1)
-    q = 3 * torch.rand(B, Mq, H, K, dtype=dtype, device="cuda")
-    kv = [
-        [3 * (torch.rand(B, M, H, K, dtype=dtype, device="cuda")) for _ in range(2)]
-        for _ in range(nparts)
-    ]
-    q = q.requires_grad_(True)
-    kv = [[j.requires_grad_(True) for j in i] for i in kv]
-    out_parts = [fmha.memory_efficient_attention_partial(q, k, v, op=op) for k, v in kv]
-    attn_split, lse_split = [list(x) for x in zip(*out_parts)]
-    out_merged = fmha.merge_attentions(attn_split, lse_split, write_lse=True)[0]
-    grad_out = torch.rand_like(q)
-    out_merged.backward(grad_out)
-    grad_q_out = q.grad
-    assert q.grad is not None
-    grad_kv_out = [[j.grad for j in i] for i in kv]
-    q = q.detach().requires_grad_(True)
-    kv = [[j.detach().requires_grad_(True) for j in i] for i in kv]
-
-    k2, v2 = [torch.cat([i[j] for i in kv], dim=1) for j in range(2)]
-
-    if op is None or isinstance(op, tuple):
-        full_op = op
-    else:
-        full_op = (op, None)
-    out_full = fmha.memory_efficient_attention(q, k2, v2, op=full_op)  # type: ignore
-    out_full.backward(grad_out)
-    assert_allclose(
-        out_merged, out_full.to(out_merged.dtype), rtol=1e-2, atol=2e-2, msg="out"
-    )
-    atol = fmha.AttentionBwOpBase.ERROR_ATOL[dtype] * 1.5
-    rtol = fmha.AttentionBwOpBase.ERROR_RTOL[dtype]
-    assert_allclose(grad_q_out, q.grad, rtol=rtol, atol=atol, msg="qgrad")
-    for i in range(nparts):
-        for j in range(2):
-            assert_allclose(
-                grad_kv_out[i][j],
-                kv[i][j].grad,
-                rtol=rtol,
-                atol=atol,
-                msg=f"kvgrad {i} {j}",
-            )
-
-
 @disable_on_rocm
 @sm80_or_better_only
 @pytest.mark.parametrize(
@@ -3221,15 +3159,7 @@ def test_merge_attentions_sharedinput(
 
 @sm80_or_better_only
 @pytest.mark.parametrize("bmghk", (False, True))
-@pytest.mark.parametrize(
-    "stack_inputs", (False, True), ids=lambda x: "stack_inputs" if x else ""
-)
-@pytest.mark.parametrize(
-    "grad_var", ("lse", "attn", None)
-)  # Gradient with respect to attention, LSE, or neither
-def test_merge_attentions_against_ref(
-    bmghk: bool, stack_inputs: bool, grad_var: Optional[str]
-):
+def test_merge_attentions_against_ref(bmghk: bool):
     split_k = 16
     B = 12
     M = 137
@@ -3245,55 +3175,12 @@ def test_merge_attentions_against_ref(
         attn_split = attn_split[:, :, :, 0]
         lse_split = lse_split[:, :, 0]
 
-    if grad_var is not None:
-        attn_split.requires_grad_(True)
-        lse_split.requires_grad_(True)
-
     attn_out_ref, lse_out_ref = _merge_attentions_ref(attn_split, lse_split)
-    if grad_var is not None:
-        if grad_var == "attn":
-            out_grad = torch.randn_like(attn_out_ref)
-            attn_out_ref.backward(out_grad)
-        else:
-            out_grad = torch.randn_like(lse_out_ref)
-            lse_out_ref.backward(out_grad)
-
-        attn_grad_ref, lse_grad_ref = attn_split.grad, lse_split.grad
-
-        attn_split = attn_split.detach().unbind(0)  # type: ignore
-        lse_split = lse_split.detach().unbind(0)  # type: ignore
-
-        for x in attn_split + lse_split:
-            x.requires_grad_(True)
-            x.retain_grad()
-
     attn_out, lse_out = fmha.merge_attentions(attn_split, lse_split)
 
     torch.testing.assert_close(lse_out, lse_out_ref, rtol=1e-4, atol=1e-4)
     torch.testing.assert_close(attn_out, attn_out_ref, rtol=1e-4, atol=1e-4)
 
-    if grad_var is not None:
-        if grad_var == "attn":
-            attn_out.backward(out_grad)
-        else:
-            assert lse_out is not None
-            lse_out.backward(out_grad)
-
-        attn_grads = [x.grad for x in attn_split]
-        lse_grads = [x.grad for x in lse_split]
-        attn_grad_concat = torch.stack(attn_grads, dim=0)
-        lse_grad_concat = torch.stack(lse_grads, dim=0)
-
-        if grad_var == "lse":
-            # LSE doesn't depend on attn_split, so when only gradient with respect to LSE is provided as input,
-            # the output gradient with respect to attn_split is zero.
-            # The reference implementation produced None instead of zero in this case
-            attn_grad_ref = torch.zeros_like(attn_grad_concat)
-        torch.testing.assert_close(lse_grad_concat, lse_grad_ref, rtol=1e-4, atol=1e-4)
-        torch.testing.assert_close(
-            attn_grad_concat, attn_grad_ref, rtol=1e-4, atol=1e-4
-        )
-
 
 def _merge_attentions_ref(attn_split, lse_split):
     """

xformers/ops/fmha/__init__.py

Lines changed: 21 additions & 56 deletions
@@ -794,31 +794,34 @@ def merge_attentions(
     attn_dtype = attn_split[0].dtype
     lse_dtype = lse_split[0].dtype
 
-    attn_out = torch.empty(
-        B,
-        M,
-        G,
-        H,
-        Kq,
-        device=device,
-        dtype=output_dtype or attn_dtype,
-    )
-    if write_lse:
-        lse_out = torch.empty(
+    if concat_path:
+        attn_out = torch.empty(
             B,
+            M,
             G,
             H,
-            M,
+            Kq,
             device=device,
-            dtype=lse_dtype,
+            dtype=output_dtype or attn_dtype,
         )
-    else:
-        lse_out = None
-
-    if concat_path:
+        if write_lse:
+            lse_out = torch.empty(
+                B,
+                G,
+                H,
+                M,
+                device=device,
+                dtype=lse_dtype,
+            )
+        else:
+            lse_out = None
         triton_splitk.merge_attentions(attn_out, lse_out, attn_split, lse_split)  # type: ignore
     else:
-        attn_out, lse_out = _MergeAttentions.apply(attn_out, lse_out, *attn_split, *lse_split)  # type: ignore
+        outs = triton_splitk.merge_attentions_varargs(
+            attn_split, lse_split, write_lse, output_dtype, B, M, G, H, Kq
+        )  # type: ignore
+        attn_out = outs[0]
+        lse_out = outs[1] if write_lse else None
 
     if is_bmhk:
         attn_out = attn_out[:, :, 0]
@@ -828,44 +831,6 @@
     return attn_out, lse_out
 
 
-class _MergeAttentions(torch.autograd.Function):
-    @staticmethod
-    # type: ignore
-    def forward(
-        ctx, attn_out: torch.Tensor, lse_out: torch.Tensor, *inputs: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        num_chunks = len(inputs) // 2
-        attn_split, lse_split = inputs[:num_chunks], inputs[num_chunks:]
-
-        triton_splitk.merge_attentions_varargs(attn_out, lse_out, attn_split, lse_split)
-
-        ctx.save_for_backward(
-            attn_out,
-            lse_out,
-            *inputs,
-        )
-        return attn_out, lse_out
-
-    @staticmethod
-    # type: ignore
-    def backward(
-        ctx, grad_attn: torch.Tensor, grad_lse: torch.Tensor
-    ) -> Tuple[Optional[torch.Tensor], ...]:
-        out, lse, *inputs = ctx.saved_tensors
-        num_chunks = len(inputs) // 2
-        attn_split, lse_split = inputs[:num_chunks], inputs[num_chunks:]
-        dattn, dlse = triton_splitk.merge_attentions_varargs_backward(
-            attn_split,
-            lse_split,
-            out,
-            lse,
-            grad_attn,
-            grad_lse,
-        )
-        ret = [None, None] + dattn + dlse
-        return tuple(ret)
-
-
 ALL_FW_OPS: List[Type[AttentionFwOpBase]] = [
     cutlass.FwOp if torch.version.cuda else ck.FwOp,
     flash.FwOp,
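
With _MergeAttentions gone, merge_attentions is forward-only: the non-concat path calls the merge_attentions_varargs custom op, which now allocates and returns its outputs instead of mutating caller-provided buffers. A hedged usage sketch of the public API (shapes and the CUDA/bf16 setup are illustrative, adapted from the removed test; gradients should be obtained from a full memory_efficient_attention over the concatenated K/V rather than by backpropagating through the merge):

import torch
from xformers.ops import fmha

B, Mq, Mkv, H, K = 2, 16, 32, 4, 128
q = torch.randn(B, Mq, H, K, device="cuda", dtype=torch.bfloat16)
kv_chunks = [
    (
        torch.randn(B, Mkv, H, K, device="cuda", dtype=torch.bfloat16),
        torch.randn(B, Mkv, H, K, device="cuda", dtype=torch.bfloat16),
    )
    for _ in range(3)
]

# Attend to each K/V chunk separately, keeping the per-chunk outputs and LSEs...
out_parts = [fmha.memory_efficient_attention_partial(q, k, v) for k, v in kv_chunks]
attn_split, lse_split = [list(x) for x in zip(*out_parts)]

# ...then merge the partial results (forward only; no autograd through the merge).
attn, lse = fmha.merge_attentions(attn_split, lse_split, write_lse=True)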

xformers/ops/fmha/triton_splitk.py

Lines changed: 62 additions & 8 deletions
@@ -1047,19 +1047,37 @@ def merge_attentions(
 
 @torch.library.custom_op(
     "xformers::fmha_merge_attentions_varargs",
-    mutates_args=("attn_out", "lse_out"),
+    mutates_args=(),
     device_types=["cuda"],
 )
 def merge_attentions_varargs(
-    attn_out: torch.Tensor,
-    lse_out: Optional[torch.Tensor],
     attn_split: Sequence[torch.Tensor],
     lse_split: Sequence[torch.Tensor],
-) -> None:
+    write_lse: bool,
+    output_dtype: Optional[torch.dtype],
+    B: int,
+    M: int,
+    G: int,
+    H: int,
+    Kq: int,
+) -> List[torch.Tensor]:
     from xformers.triton.vararg_kernel import unroll_varargs
 
     from ._triton.splitk_kernels import _splitK_reduce_varargs
 
+    attn_out = torch.empty(
+        (B, M, G, H, Kq),
+        device=attn_split[0].device,
+        dtype=output_dtype or attn_split[0].dtype,
+    )
+    if write_lse:
+        lse_out = torch.empty(
+            (B, G, H, M),
+            device=attn_split[0].device,
+            dtype=lse_split[0].dtype,
+        )
+    else:
+        lse_out = None
     kernel_args, grid = _prepare_reduce_kernel_params(
         attn_out, lse_out, attn_split, lse_split
     )
@@ -1073,16 +1091,52 @@ def merge_attentions_varargs(
         BLOCK_SIZE=attn_out.shape[-1],
         WRITE_LSE=lse_out is not None,
     )
+    if write_lse:
+        assert lse_out is not None
+        return [attn_out, lse_out]
+    return [attn_out]
 
 
 @torch.library.register_fake("xformers::fmha_merge_attentions_varargs")
 def merge_attentions_varargs_fake(
-    attn_out: torch.Tensor,
-    lse_out: Optional[torch.Tensor],
     attn_split: Sequence[torch.Tensor],
     lse_split: Sequence[torch.Tensor],
-) -> None:
-    return
+    write_lse: bool,
+    output_dtype: Optional[torch.dtype],
+    B: int,
+    M: int,
+    G: int,
+    H: int,
+    Kq: int,
+) -> List[torch.Tensor]:
+    attn_out = torch.empty(
+        (B, M, G, H, Kq),
+        device=attn_split[0].device,
+        dtype=output_dtype or attn_split[0].dtype,
+    )
+    if write_lse:
+        lse_out = torch.empty(
+            (B, G, H, M),
+            device=attn_split[0].device,
+            dtype=lse_split[0].dtype,
+        )
+        return [attn_out, lse_out]
+    return [attn_out]
+
+
+def _merge_attentions_backward(
+    ctx: torch.autograd.function.FunctionCtx,
+    grad: List[torch.Tensor],
+) -> Tuple[None, ...]:
+    raise NotImplementedError(
+        "Backward pass is not implemented for merge_attentions. "
+        "If it was, it would be easy to get wrong attention gradients, "
+        "because the gradients of the LSEs "
+        "don't get propagated by attention backward."
+    )
+
+
+merge_attentions_varargs.register_autograd(_merge_attentions_backward)
 
 
 @torch.library.custom_op(
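
The torch.autograd.Function wrapper removed from __init__.py is replaced by the torch.library registration above: a custom op with mutates_args=(), a fake kernel for tracing, and register_autograd with a backward that fails loudly. A minimal sketch of the same pattern on a toy op (requires a PyTorch recent enough to provide torch.library.custom_op; the demo::scale_rows op is invented for illustration and is not part of xformers):

import torch


@torch.library.custom_op("demo::scale_rows", mutates_args=())
def scale_rows(x: torch.Tensor, factor: float) -> torch.Tensor:
    # Real kernel: allocates and returns a fresh tensor instead of mutating a
    # caller-provided buffer (mirroring mutates_args=() above).
    return x * factor


@torch.library.register_fake("demo::scale_rows")
def scale_rows_fake(x: torch.Tensor, factor: float) -> torch.Tensor:
    # Fake/meta kernel: only describes output shape and dtype, for tracing/compile.
    return torch.empty_like(x)


def _scale_rows_backward(ctx, grad: torch.Tensor):
    # Same choice as the xformers change: fail loudly rather than risk
    # silently-wrong gradients.
    raise NotImplementedError("Backward is intentionally not implemented.")


scale_rows.register_autograd(_scale_rows_backward)

x = torch.randn(4, 8, requires_grad=True)
y = scale_rows(x, 2.0)   # forward works as usual
# y.sum().backward()     # would raise NotImplementedError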
