
Commit 1741e95

add attn_implementation in model config; fix deepseekv3.1-terminus load error (#130)

1 parent: 70e4cbc

8 files changed (+56 −12)


angelslim/engine.py (+4 −0)

```diff
@@ -75,6 +75,7 @@ def prepare_model(
         deploy_backend="vllm",
         using_multi_nodes=False,
         use_audio_in_video=False,
+        attn_implementation="default",
     ) -> Any:
         """Load pretrained model and tokenizer
         Args:
@@ -92,6 +93,8 @@ def prepare_model(
             cache_dir (str, optional): Directory to cache the model.
             deploy_backend (str): Backend for deployment, e.g., "torch", "vllm".
             using_multi_nodes (bool): Whether to use multi-nodes for calibration.
+            use_audio_in_video (bool): Whether to use the audio track of the input video.
+            attn_implementation (str): The attention implementation to use in the model.
         """
         assert model_name, "model_name must be specified."
         assert model_path, "model_path must be specified."
@@ -126,6 +129,7 @@ def prepare_model(
                 device_map=device_map,
                 trust_remote_code=trust_remote_code,
                 use_audio_in_video=use_audio_in_video,
+                attn_implementation=attn_implementation,
             )
            self.model_path = model_path
        else:
```
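For orientation, a minimal sketch of how the new parameter arrives from caller code. The `prepare_model` keywords are taken from the diff above; the `Engine` class name is assumed from the `angelslim/engine.py` path, and the model path is a placeholder:

```python
# Hedged usage sketch: Engine and the model path are assumptions;
# prepare_model and its keywords come from the diff above.
from angelslim.engine import Engine

engine = Engine()
model = engine.prepare_model(
    model_name="Qwen_Omni",                   # model name as used in the configs
    model_path="/path/to/model",              # placeholder local path
    attn_implementation="flash_attention_2",  # new kwarg; "default" leaves it unset
)
```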

angelslim/models/llm/modeling_deepseek.py (+22 −6)

```diff
@@ -27,7 +27,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
-from safetensors.torch import load_model, safe_open, save_file
+from safetensors.torch import load_file, load_model, safe_open, save_file
 from torch import nn
 from tqdm import tqdm, trange
 from transformers.generation import GenerationMixin
@@ -1037,12 +1037,28 @@ def from_pretrained(
         dist.barrier()
         with torch.device("cuda"):
             model = cls(config)
-        load_model(
-            model,
-            os.path.join(
+        try:
+            load_model(
+                model,
+                os.path.join(
+                    tp_model_path, f"model{rank}-mp{cls.world_size}.safetensors"
+                ),
+            )
+        except RuntimeError:
+            file_path = os.path.join(
                 tp_model_path, f"model{rank}-mp{cls.world_size}.safetensors"
-            ),
-        )
+            )
+            file_state_dict = load_file(file_path)
+            model_state_dict = model.state_dict()
+            for key in model_state_dict:
+                if (
+                    key in file_state_dict
+                    and file_state_dict[key].dtype != model_state_dict[key].dtype
+                ):
+                    file_state_dict[key] = file_state_dict[key].to(
+                        model_state_dict[key].dtype
+                    )
+            model.load_state_dict(file_state_dict, strict=False)
         return model
     return super().from_pretrained(
         model_path,
```
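The new `except RuntimeError` branch is what fixes the deepseekv3.1-terminus load error from the commit title: when a checkpoint's tensor dtypes disagree with the freshly built module, the strict `load_model` raises, and the fallback casts each mismatched tensor before a non-strict load. A self-contained sketch of the same pattern, with an illustrative function name and placeholder path:

```python
# Sketch of the dtype-reconciling fallback introduced above; the function
# name and path are illustrative, the logic mirrors the committed code.
import torch
from safetensors.torch import load_file, load_model


def load_with_dtype_fallback(model: torch.nn.Module, path: str) -> None:
    try:
        # Fast path: load the checkpoint directly into the module.
        load_model(model, path)
    except RuntimeError:
        # Fallback: read a plain state dict, cast any tensor whose dtype
        # disagrees with the model's parameters, then load non-strictly.
        file_state_dict = load_file(path)
        model_state_dict = model.state_dict()
        for key in model_state_dict:
            if (
                key in file_state_dict
                and file_state_dict[key].dtype != model_state_dict[key].dtype
            ):
                file_state_dict[key] = file_state_dict[key].to(
                    model_state_dict[key].dtype
                )
        model.load_state_dict(file_state_dict, strict=False)
```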

angelslim/models/omni/qwen3_omni.py (+14 −6)

```diff
@@ -47,14 +47,22 @@ def from_pretrained(
         device_map="auto",
         trust_remote_code=True,
         use_audio_in_video=False,
+        attn_implementation="default",
     ):
         self.use_audio_in_video = use_audio_in_video
-        self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
-            model_path,
-            torch_dtype=torch_dtype,
-            device_map=device_map,
-            attn_implementation="flash_attention_2",
-        )
+        if attn_implementation == "default":
+            self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
+                model_path,
+                torch_dtype=torch_dtype,
+                device_map=device_map,
+            )
+        else:
+            self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
+                model_path,
+                torch_dtype=torch_dtype,
+                device_map=device_map,
+                attn_implementation=attn_implementation,
+            )
 
         # Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(
```
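Note the behavioral change: `flash_attention_2` was previously hard-coded, whereas `"default"` now means the kwarg is not forwarded at all, letting transformers choose its own implementation. The two branches differ only in that one kwarg; a hypothetical helper (not the committed code) expresses the same sentinel logic more compactly:

```python
from typing import Any, Dict


def build_from_pretrained_kwargs(
    torch_dtype: Any, device_map: Any, attn_implementation: str
) -> Dict[str, Any]:
    # Hypothetical helper: forward attn_implementation only when the caller
    # overrides the "default" sentinel; otherwise let transformers decide.
    kwargs: Dict[str, Any] = {"torch_dtype": torch_dtype, "device_map": device_map}
    if attn_implementation != "default":
        kwargs["attn_implementation"] = attn_implementation
    return kwargs
```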

angelslim/utils/config_parser.py (+3 −0)

```diff
@@ -121,6 +121,8 @@ class ModelConfig:
         low_cpu_mem_usage: Use low memory loading for large models
         use_cache: Whether to use cache during model loading
         cache_dir: Directory for caching model files
+        use_audio_in_video: Whether to use the audio track of the input video
+        attn_implementation: The attention implementation to use in the model
     """
 
     name: str
@@ -132,6 +134,7 @@ class ModelConfig:
     use_cache: bool = field(default=False)
     cache_dir: Optional[str] = field(default=None)
     use_audio_in_video: bool = field(default=False)
+    attn_implementation: str = field(default="default")
 
 
 @dataclass
```
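Because the field carries a dataclass default, existing YAML configs that omit the key keep working unchanged. A runnable mini stand-in, reproducing only the fields visible in the hunk (the real `ModelConfig` has more):

```python
# Hypothetical stand-in reproducing just the defaults shown in the hunk.
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelConfigSketch:
    name: str
    use_cache: bool = field(default=False)
    cache_dir: Optional[str] = field(default=None)
    use_audio_in_video: bool = field(default=False)
    attn_implementation: str = field(default="default")


cfg = ModelConfigSketch(name="Qwen_Omni")
print(cfg.attn_implementation)  # -> "default" when a config omits the key
```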

configs/qwen3_omni/fp8_dynamic/qwen3_omni_fp8_dynamic.yaml (+1 −0)

```diff
@@ -12,6 +12,7 @@ model:
   torch_dtype: auto
   device_map: auto
   use_audio_in_video: false
+  attn_implementation: default
 
 # Compression configuration
 compression:
```

configs/qwen3_omni/fp8_static/qwen3_omni_fp8_static.yaml (+1 −0)

```diff
@@ -12,6 +12,7 @@ model:
   torch_dtype: auto
   device_map: auto
   use_audio_in_video: false
+  attn_implementation: default
 
 # Compression configuration
 compression:
```
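Both shipped configs keep the conservative `default`. Per the docs change below, opting into FlashAttention 2 is a one-key edit of the `model` block (only the keys visible in the diff are shown; other keys stay as in the shipped files):

```yaml
model:
  torch_dtype: auto
  device_map: auto
  use_audio_in_video: false
  attn_implementation: flash_attention_2  # requires flash-attn to be installed
```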

docs/source/models/qwen3_omni/qwen3_omni_quant.md (+9 −0)

````diff
@@ -15,6 +15,7 @@ Reference config files for FP8 quantization: `configs/qwen3_omni/fp8_static` and `c
 - `name`: the model name; always `Qwen_Omni`
 - `model_path`: a Hugging Face model card name or a local path.
 - `use_audio_in_video`: controls whether the audio track of the source video is used
+- `attn_implementation`: the attention implementation to use in the model; defaults to `default`, and setting it to `flash_attention_2` reduces GPU memory usage
 
 #### compression configuration
 - `name`: the compression strategy type; always the quantization mode `PTQ`
@@ -28,6 +29,14 @@ Reference config files for FP8 quantization: `configs/qwen3_omni/fp8_static` and `c
 
 ### Launching the quantization pipeline
 
+If `attn_implementation` is set to `flash_attention_2` in the `model` config, `FlashAttention 2` must additionally be installed:
+```shell
+pip install -U flash-attn --no-build-isolation
+
+# If `ldd --version` reports < 2.32, downgrade to flash-attn 2.7.4.post1 or lower
+pip install flash-attn==2.7.4.post1 --no-build-isolation
+```
+
 Launch FP8 quantization calibration with the following command:
 
 ```shell
````
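After installing, a quick import check confirms the wheel loads against the local toolchain before committing to a long calibration run (this assumes the `flash_attn` package exposes `__version__`, which current releases do):

```shell
python -c "import flash_attn; print(flash_attn.__version__)"
```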

tools/run.py (+2 −0)

```diff
@@ -91,6 +91,7 @@ def multi_nodes_run(config):
         use_cache=model_config.use_cache,
         cache_dir=model_config.cache_dir,
         use_audio_in_video=model_config.use_audio_in_video,
+        attn_implementation=model_config.attn_implementation,
         deploy_backend=global_config.deploy_backend,
         using_multi_nodes=True,
     )
@@ -151,6 +152,7 @@ def run(config):
         use_cache=model_config.use_cache,
         cache_dir=model_config.cache_dir,
         use_audio_in_video=model_config.use_audio_in_video,
+        attn_implementation=model_config.attn_implementation,
         deploy_backend=global_config.deploy_backend,
     )
```
