Commit b5a9c9c

Small change to scale dot attn
1 parent 8c8e5c4 commit b5a9c9c

5 files changed (+10, -23 lines)

tripy/examples/diffusion/clip_model.py

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ def __call__(self, hidden_states, causal_attention_mask):
             for x in (q, k, v)
         ]
         attn_output = scaled_dot_product_attention(
-            q, k, v, embedding_dim=self.head_dim, attn_mask=causal_attention_mask, dtype=self.dtype,
+            q, k, v, embedding_dim=self.head_dim, attn_mask=causal_attention_mask,
         )
         out = self.out_proj(tp.reshape(tp.transpose(attn_output, 1, 2), (bsz, tgt_len, embed_dim)))
         return out

tripy/examples/diffusion/helper.py

Lines changed: 6 additions & 19 deletions
@@ -1,6 +1,5 @@
 import math
-from functools import reduce
-from typing import List, Callable, Optional
+from typing import Optional
 
 import tripy as tp
 
@@ -9,29 +8,17 @@ def scaled_dot_product_attention(
     query: tp.Tensor,
     key: tp.Tensor,
     value: tp.Tensor,
-    embedding_dim: Optional[int] = None,
+    embedding_dim: int,
     attn_mask: Optional[tp.Tensor] = None,
-    is_causal: bool = False,
-    dtype: tp.dtype = tp.float32
 ) -> tp.Tensor:
-    """
-    Computes scaled dot-product attention.
-    `self` is the query tensor, `key` is the key tensor, and `value` is the value tensor.
-
-    - Described: https://paperswithcode.com/method/scaled
-    - Paper: https://arxiv.org/abs/1706.03762v7
-    """
-    if is_causal: # this path is not called in demoDiffusion
-        target_shape = query.shape[-2:-1] + key.shape[-2:-1]
-        # TODO: #228: WAR to prevent computing output rank in infer_rank for reshape
-        target_shape.trace_tensor.shape = (2,)
-        attn_mask = tp.cast(tp.tril(tp.ones(target_shape)), tp.bool)
+    dtype = query.dtype
     if attn_mask is not None and attn_mask.dtype == tp.bool:
         attn_mask = tp.where((attn_mask == 0), tp.ones_like(attn_mask, dtype=dtype) * -float("inf"), tp.zeros_like(attn_mask, dtype=dtype))
     if attn_mask is not None:
         attn_mask = tp.cast(attn_mask, dtype)
-    qk = query @ tp.transpose(key, -2, -1) / math.sqrt(embedding_dim)
-    return tp.cast(tp.softmax((qk + attn_mask) if attn_mask is not None else qk, -1), query.dtype) @ value
+    k_t = tp.transpose(key, -2, -1)
+    qk = (query @ k_t) * (1.0 / math.sqrt(embedding_dim))
+    return tp.softmax((qk + attn_mask) if attn_mask is not None else qk, -1) @ value
 
 
 def clamp(tensor: tp.Tensor, min: int, max: int):
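
For reference, a minimal usage sketch of the updated helper. The tensor shapes, the tp.ones construction, and the import path are illustrative assumptions, not part of this commit; the point is that callers now pass embedding_dim explicitly and no longer pass dtype=, since the helper derives the dtype from query.dtype.

import tripy as tp

from examples.diffusion.helper import scaled_dot_product_attention  # import path assumed

# Placeholder shapes: (batch, num_heads, seq_len, head_dim)
q = tp.ones((1, 8, 77, 64), dtype=tp.float32)
k = tp.ones((1, 8, 77, 64), dtype=tp.float32)
v = tp.ones((1, 8, 77, 64), dtype=tp.float32)

# embedding_dim is now required; the output dtype follows q's dtype.
out = scaled_dot_product_attention(q, k, v, embedding_dim=64)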

tripy/examples/diffusion/model.py

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ def decode(self, x):
         x = clamp(tp.permute(tp.reshape(x, (3, 512, 512)), (1, 2, 0)), 0, 1) * 255
         return x
 
-    def __call__(self, unconditional_context, context, latent, timestep, alphas, alphas_prev, guidance):
+    def __call__(self, unconditional_context, context, latent, timestep, alphas, alphas_prev, guidance):
         e_t = self.get_model_output(unconditional_context, context, latent, timestep, guidance)
         x_prev, _ = self.get_x_prev_and_pred_x0(latent, e_t, alphas, alphas_prev)
         return x_prev

tripy/examples/diffusion/unet_model.py

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ def __call__(self, x, context=None):
             tp.transpose(tp.reshape(y, (x.shape[0], -1, self.num_heads, self.head_size)), 1, 2) for y in (q, k, v)
         ]
         attention = tp.transpose(
-            scaled_dot_product_attention(q, k, v, embedding_dim=self.head_size, dtype=self.dtype), 1, 2
+            scaled_dot_product_attention(q, k, v, embedding_dim=self.head_size), 1, 2
         )
         h_ = tp.reshape(attention, (x.shape[0], -1, self.num_heads * self.head_size))
         out = self.to_out(h_)

tripy/examples/diffusion/vae_model.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def __call__(self, x):
         q, k, v = self.to_q(h_flat), self.to_k(h_flat), self.to_v(h_flat)
 
         # compute attention
-        h_ = scaled_dot_product_attention(q, k, v, embedding_dim=self.in_channels, dtype=self.dtype)
+        h_ = scaled_dot_product_attention(q, k, v, embedding_dim=self.in_channels)
         out = tp.reshape(
             tp.transpose(self.to_out[0](h_), 1, 2),
             (b, c, h, w),
