
Commit 17a7946

fix loading of fp8 models with bf16 weight_scale (#2141)
#2108 assumes that the dtype of `scale` or `scale_inv` is `float32`, while it may be `bfloat16` for some models, such as the fp8 Qwen3 dense models.

Signed-off-by: Youlei Yang <[email protected]>
Parent: baa60de
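
For context, here is a minimal, self-contained sketch of the problem the commit fixes; it is not taken from the repository, and the tensor name and scale value are made up. It shows how a `float32`-only dtype test silently skips `bfloat16` weight scales, so they never get rescaled for the e4m3fn-to-e4m3fnuz conversion.

import torch

# Maximum representable values of the two fp8 formats (448.0 and 240.0).
fp8_e4m3fn_max = torch.finfo(torch.float8_e4m3fn).max
fp8_e4m3fnuz_max = torch.finfo(torch.float8_e4m3fnuz).max

# Hypothetical checkpoint entry: some fp8 checkpoints (e.g. the fp8 Qwen3
# dense models) store their weight scales in bfloat16 rather than float32.
name = "model.layers.0.mlp.down_proj.weight_scale_inv"
scale = torch.tensor([0.0125], dtype=torch.bfloat16)

# Old check (#2108): only float32 scales were rescaled, so bf16 scales were skipped.
old_match = scale.dtype == torch.float32 and "scale" in name.split(".")[-1]

# New check (#2141): bfloat16 scales are rescaled as well.
new_match = (scale.dtype in [torch.float32, torch.bfloat16]
             and "scale" in name.split(".")[-1])

print(old_match, new_match)  # False True

if new_match:
    # Compensate for converting the fp8 weights from e4m3fn to e4m3fnuz.
    scale = scale * (fp8_e4m3fn_max / fp8_e4m3fnuz_max)

The compensation direction follows from the diff: the e4m3fnuz format has a smaller maximum (240 vs 448), so the fp8 weights are scaled down by fp8_e4m3fnuz_max / fp8_e4m3fn_max and the per-tensor scale is multiplied by the inverse ratio to keep the dequantized values unchanged.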

File tree

1 file changed: +4 −4 lines


vllm/model_executor/model_loader/weight_utils.py

Lines changed: 4 additions & 4 deletions
@@ -477,8 +477,8 @@ def safetensors_weights_iterator(
                 if param.dtype == torch.float8_e4m3fn:
                     param = (param.float() * fp8_e4m3fnuz_max /
                              fp8_e4m3fn_max).to(torch.float8_e4m3fnuz)
-                elif param.dtype == torch.float32 and "scale" in name.split(
-                        ".")[-1]:
+                elif param.dtype in [torch.float32, torch.bfloat16
+                                     ] and "scale" in name.split(".")[-1]:
                     param *= fp8_e4m3fn_max / fp8_e4m3fnuz_max
                 yield name, param

@@ -539,8 +539,8 @@ def fastsafetensors_weights_iterator(
                     if t.dtype == torch.float8_e4m3fn:
                         t = (t.float() * fp8_e4m3fnuz_max /
                              fp8_e4m3fn_max).to(torch.float8_e4m3fnuz)
-                    elif t.dtype == torch.float32 and "scale" in k.split(
-                            ".")[-1]:
+                    elif t.dtype in [torch.float32, torch.bfloat16
+                                     ] and "scale" in k.split(".")[-1]:
                         t *= fp8_e4m3fn_max / fp8_e4m3fnuz_max
                     yield k, t
             finally:
