
Commit 07bfd2f

YangKai0616 and vasqu authored
[XPU] Add flash_attn2 support for XPU (#41956)
* Add flash_attention_2 and kernels-community/flash-attn support for XPU
* Add flash-attn-2 support for XPU
* Delete deterministic algorithm for xpu
* Fix code style
* Modify repo_id to match the latest kernels-community/flash-attn2
* Fix code style
* Update
* Make quality
* Use kernels loading
* Update
* Delete invalid import
* Update comment

---------

Co-authored-by: Anton Vlasjuk <[email protected]>
1 parent 9162e19 commit 07bfd2f

23 files changed: +69 additions, -49 deletions
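For context, a minimal usage sketch of what this commit enables (illustrative only: the checkpoint name is a placeholder, and an XPU build of PyTorch plus the `kernels` package are assumed to be installed). Requesting `flash_attention_2` on an XPU device now resolves to the hub kernel `kernels-community/flash-attn2` instead of requiring the `flash_attn` package:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B"  # placeholder checkpoint, any causal LM works

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float16,
    attn_implementation="flash_attention_2",  # dispatched via the hub kernel on XPU
).to("xpu")

inputs = tokenizer("Hello, my name is", return_tensors="pt").to("xpu")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0], skip_special_tokens=True))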

src/transformers/modeling_flash_attention_utils.py

Lines changed: 8 additions & 2 deletions
@@ -24,6 +24,7 @@
     is_flash_attn_3_available,
     is_flash_attn_greater_or_equal_2_10,
     is_torch_npu_available,
+    is_torch_xpu_available,
     logging,
 )
 
@@ -45,7 +46,12 @@ def flash_attn_supports_top_left_mask():
 
 # TODO Deprecate when all models have the attention interface
 def is_flash_attn_available():
-    return is_flash_attn_3_available() or is_flash_attn_2_available() or is_torch_npu_available()
+    return (
+        is_flash_attn_3_available()
+        or is_flash_attn_2_available()
+        or is_torch_npu_available()
+        or is_torch_xpu_available()
+    )
 
 
 # `globals()` is not compatible with dynamo, hence we have do define them in global scope ourselves
@@ -97,7 +103,7 @@ def _lazy_imports(implementation: Optional[str]):
     if flash_attn_varlen_func is None or flash_attn_func is None:
         raise ValueError(
             f"Could not find the currently requested flash attention implementation at `{implementation}`."
-            f"Make sure that you request a valid kernel from the hub, e.g. `kernels-community/flash-attn`."
+            f"Make sure that you request a valid kernel from the hub, e.g. `kernels-community/flash-attn2`."
         )
 
     return flash_attn_func, flash_attn_varlen_func, pad_input, unpad_input
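A small sketch of how the broadened helper can be used to pick an attention backend (illustrative only; the checkpoint name is a placeholder):

from transformers import AutoModelForCausalLM
from transformers.modeling_flash_attention_utils import is_flash_attn_available

# True if any flash-attention backend is detected: FA3, FA2, NPU, or (after this commit) XPU
attn = "flash_attention_2" if is_flash_attn_available() else "sdpa"
model = AutoModelForCausalLM.from_pretrained("gpt2", attn_implementation=attn)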

src/transformers/modeling_utils.py

Lines changed: 9 additions & 1 deletion
@@ -116,6 +116,7 @@
     is_torch_greater_or_equal,
     is_torch_mlu_available,
     is_torch_npu_available,
+    is_torch_xpu_available,
     logging,
 )
 from .utils.generic import _CAN_RECORD_REGISTRY, GeneralInterface, OutputRecorder
@@ -1575,6 +1576,10 @@ def _flash_attn_2_can_dispatch(self, is_init_check: bool = False) -> bool:
             logger.info("Detect using FlashAttention2 on Ascend NPU.")
             return True
 
+        if is_torch_xpu_available():
+            logger.info("Detect using FlashAttention2 (via kernel `kernels-community/flash-attn2`) on XPU.")
+            return True
+
         if importlib.util.find_spec("flash_attn") is None:
             raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
         else:
@@ -1800,7 +1805,10 @@ def _check_and_adjust_attn_implementation(
             and not is_torch_npu_available()
         ):
             if attn_implementation.endswith("2"):
-                applicable_attn_implementation = "kernels-community/flash-attn"
+                applicable_attn_implementation = "kernels-community/flash-attn2"
+                if is_torch_xpu_available():
+                    # On XPU, kernels library is the native implementation. Rename variable to avoid "fallback" warning and irrelevant checks.
+                    attn_implementation = "kernels-community/flash-attn2"
             else:
                 applicable_attn_implementation = "kernels-community/vllm-flash-attn3"
 
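The dispatch change above can be summarized with a standalone sketch (simplified; the real `_flash_attn_2_can_dispatch` also validates dtype, device placement, and initialization state):

import importlib.util

from transformers.utils import is_torch_npu_available, is_torch_xpu_available


def fa2_can_dispatch_sketch() -> bool:
    # Ascend NPU ships its own FlashAttention2 support
    if is_torch_npu_available():
        return True
    # New in this commit: XPU dispatches through kernels-community/flash-attn2
    if is_torch_xpu_available():
        return True
    # Everywhere else, the flash_attn package itself must be installed
    return importlib.util.find_spec("flash_attn") is not None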
src/transformers/testing_utils.py

Lines changed: 1 addition & 1 deletion
@@ -593,7 +593,7 @@ def require_flash_attn(test_case):
     try:
         from kernels import get_kernel
 
-        get_kernel("kernels-community/flash-attn")
+        get_kernel("kernels-community/flash-attn2")
     except Exception as _:
         kernels_available = False
 
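The same probe the decorator performs can be run standalone to check whether the FA2 hub kernel is loadable on the current machine (sketch, mirroring the snippet above):

try:
    from kernels import get_kernel

    get_kernel("kernels-community/flash-attn2")
    kernels_available = True
except Exception:
    kernels_available = False

print("flash-attn2 kernel available:", kernels_available)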

tests/causal_lm_tester.py

Lines changed: 2 additions & 2 deletions
@@ -24,7 +24,7 @@
     _COMMON_MODEL_NAMES_MAP,
     is_flaky,
     require_flash_attn,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
 )
 
@@ -550,7 +550,7 @@ def test_model_rope_scaling_frequencies(self):
         torch.testing.assert_close(yarn_sin_long, original_sin_long)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     @is_flaky()
     @slow
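The same decorator swap is applied throughout the remaining test files below: tests now require any supported accelerator (CUDA, XPU, NPU, ...) instead of a CUDA GPU specifically, and target `torch_device` rather than a hard-coded "cuda" string. A schematic example of the resulting pattern (the test body is a placeholder):

import pytest

from transformers.testing_utils import (
    require_flash_attn,
    require_torch_accelerator,  # previously require_torch_gpu
    slow,
    torch_device,
)


@require_flash_attn
@require_torch_accelerator
@pytest.mark.flash_attn_test
@slow
def test_fa2_runs_on_any_accelerator():
    # placeholder body; the real tests build a model and compare FA2 vs. eager outputs
    assert torch_device is not None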

tests/generation/test_utils.py

Lines changed: 4 additions & 4 deletions
@@ -1848,7 +1848,7 @@ def test_eager_matches_sdpa_generate(self):
 
     @pytest.mark.flash_attn_test
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_eager_matches_fa2_generate(self):
         """Tests that generate has equivalent outputs with FA2 and eager attention implementations."""
@@ -1863,7 +1863,7 @@ def test_eager_matches_fa3_generate(self):
         self._test_attention_implementation("flash_attention_3")
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_flash_attention_2_continue_generate_with_position_ids(self):
         """
@@ -2065,14 +2065,14 @@ def test_sdpa_padding_matches_padding_free_with_position_ids(self):
         self.attention_mask_padding_matches_padding_free_with_position_ids(attn_implementation="sdpa")
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     @slow
     def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
         self.attention_mask_padding_matches_padding_free_with_position_ids(attn_implementation="flash_attention_2")
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     @slow
     def test_flash_attention_2_padding_matches_padding_free_with_position_ids_and_fa_kwargs(self):

tests/models/bamba/test_modeling_bamba.py

Lines changed: 1 addition & 2 deletions
@@ -34,7 +34,6 @@
     require_flash_attn,
     require_torch,
     require_torch_accelerator,
-    require_torch_gpu,
     slow,
     torch_device,
 )
@@ -444,7 +443,7 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids_and_fa
         pass
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     @unittest.skip(

tests/models/diffllama/test_modeling_diffllama.py

Lines changed: 3 additions & 4 deletions
@@ -29,7 +29,6 @@
     require_read_token,
     require_torch,
     require_torch_accelerator,
-    require_torch_gpu,
     slow,
     torch_device,
 )
@@ -324,7 +323,7 @@ def _reinitialize_config(base_config, new_kwargs):
         )  # missing "factor"
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @pytest.mark.flash_attn_test
     @require_read_token
@@ -364,7 +363,7 @@ def test_flash_attn_2_generate_padding_right(self):
         self.assertListEqual(output_native, output_fa_2)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     @pytest.mark.flash_attn_test
     def test_use_flash_attention_2_true(self):
@@ -379,7 +378,7 @@ def test_use_flash_attention_2_true(self):
 
             new_model = DiffLlamaForCausalLM.from_pretrained(
                 tmp_dir, attn_implementation="flash_attention_2", dtype=torch.float16
-            ).to("cuda")
+            ).to(torch_device)
 
             self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")
 

tests/models/ernie4_5_moe/test_modeling_ernie4_5_moe.py

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@
     require_bitsandbytes,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_torch_large_accelerator,
     require_torch_multi_accelerator,
     slow,
@@ -56,7 +56,7 @@ class Ernie4_5_MoeModelTest(CausalLMModelTest, unittest.TestCase):
     model_tester_class = Ernie4_5_MoeModelTester
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     @is_flaky()
     @slow

tests/models/esm/test_modeling_esm.py

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@
     require_bitsandbytes,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -306,7 +306,7 @@ def test_resize_tokens_embeddings(self):
         pass
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     @is_flaky()
     @slow

tests/models/glm4/test_modeling_glm4.py

Lines changed: 5 additions & 2 deletions
@@ -25,7 +25,6 @@
     require_flash_attn,
     require_torch,
     require_torch_large_accelerator,
-    require_torch_large_gpu,
     slow,
     torch_device,
 )
@@ -177,7 +176,7 @@ def test_model_9b_sdpa(self):
         self.assertEqual(output_text, EXPECTED_TEXT)
 
     @require_flash_attn
-    @require_torch_large_gpu
+    @require_torch_large_accelerator
     @pytest.mark.flash_attn_test
     def test_model_9b_flash_attn(self):
         EXPECTED_TEXTS = Expectations(
@@ -187,6 +186,10 @@ def test_model_9b_flash_attn(self):
                     "Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
                     "Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
                 ],
+                ("xpu", None): [
+                    "Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
+                    "Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
+                ],
             }
         )
         EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
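For reference, a sketch of the per-device expectation pattern used in the glm4 test above: `Expectations` maps device-keyed entries to expected outputs and `get_expectation()` selects the entry matching the current device. The strings below are placeholders; the key layout follows the diff above.

from transformers.testing_utils import Expectations

EXPECTED_TEXTS = Expectations(
    {
        ("cuda", None): ["...expected generation on CUDA..."],
        ("xpu", None): ["...expected generation on XPU..."],
    }
)
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()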
