
Commit 6e45dad
cleanup
1 parent eee5a62

2 files changed: +5, -10 lines

unsloth/models/llama.py

Lines changed: 2 additions & 10 deletions
@@ -762,16 +762,8 @@ def LlamaModel_fast_forward(
     seq_length_with_past = seq_length

     # Fix out of bounds tokenization unless we were given packed metadata
-    allow_overlength = getattr(self, "_unsloth_allow_packed_overlength", False) or any(
-        key in kwargs
-        for key in (
-            "cu_seq_lens_q",
-            "cu_seq_lens",
-            "cu_seqlens",
-            "max_length_q",
-            "max_seqlen",
-            "packed_seq_lengths",
-        )
+    allow_overlength = getattr(self, "_unsloth_allow_packed_overlength", False) or (
+        "packed_seq_lengths" in kwargs
     )
     if hasattr(self, "max_seq_length") and not allow_overlength:
         if seq_length > self.max_seq_length:
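After this change, only the "packed_seq_lengths" kwarg (or the explicit _unsloth_allow_packed_overlength flag) suppresses the bounds check; the other packed-metadata keys (cu_seq_lens_q, cu_seqlens, max_seqlen, ...) no longer do. A minimal standalone sketch of the resulting guard; the check_overlength helper and the ValueError are illustrative assumptions, since the failure branch is not shown in the hunk:

# Standalone sketch of the simplified guard; check_overlength and the
# ValueError are assumptions, not unsloth's actual code.
def check_overlength(model, seq_length, **kwargs):
    allow_overlength = getattr(model, "_unsloth_allow_packed_overlength", False) or (
        "packed_seq_lengths" in kwargs
    )
    if hasattr(model, "max_seq_length") and not allow_overlength:
        if seq_length > model.max_seq_length:
            # Assumption: the real branch body (elided above) rejects
            # the overlength batch.
            raise ValueError(
                f"seq_length={seq_length} exceeds "
                f"max_seq_length={model.max_seq_length}"
            )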

unsloth/models/qwen3.py

Lines changed: 3 additions & 0 deletions
@@ -212,6 +212,9 @@ def Qwen3Attention_fast_forward(
     # Must be contiguous or else results are False!
     # https://github.com/pytorch/pytorch/issues/112577
     Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
+    # Needs (batch_size, n_heads, seq_len, head_dim)
+    # is_causal and attention_mask must not both be set!
+    # when q_len == k_len and attn_mask is None, we should use causal attention
     Q_len = Q.shape[-2]
     K_len = K.shape[-2]
     if seq_info is not None and attention_mask is None:
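The new comments restate PyTorch's scaled_dot_product_attention contract: inputs are (batch_size, n_heads, seq_len, head_dim), and attn_mask and is_causal=True cannot both be passed, so causal masking is only enabled when no explicit mask is given and q_len == k_len. A minimal sketch of that dispatch rule, illustrative only and not this repo's actual attention path:

import torch
import torch.nn.functional as F

def sdpa_dispatch(Q, K, V, attention_mask=None):
    # SDPA wants contiguous (batch_size, n_heads, seq_len, head_dim) inputs;
    # see https://github.com/pytorch/pytorch/issues/112577
    Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
    q_len, k_len = Q.shape[-2], K.shape[-2]
    if attention_mask is None and q_len == k_len:
        # No explicit mask: let SDPA apply the causal mask itself.
        return F.scaled_dot_product_attention(Q, K, V, is_causal=True)
    # Explicit mask given: is_causal must stay False.
    return F.scaled_dot_product_attention(Q, K, V, attn_mask=attention_mask)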
