Fix Auto_Round Quantization Loading on SM75 and Lower GPUs (vllm-project#24217)

RoadToNowhereX · yewentao256 · web-flow · commit c0bd6a684aee · 2025-09-10T06:22:31.000-07:00
Signed-off-by: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py
@@ -327,6 +327,8 @@ def apply_gptq_quant_layer(self,
 
         if isinstance(layer, FusedMoE):
             if use_marlin:
+                return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
+            else:
                 from vllm.model_executor.layers.quantization.moe_wna16 import (
                     MoeWNA16Config)
 
@@ -339,7 +341,6 @@ def apply_gptq_quant_layer(self,
                 }
                 return MoeWNA16Config.from_config(config).get_quant_method(
                     layer, prefix)
-            return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
 
         if isinstance(layer, (LinearBase, ParallelLMHead)):
             if use_marlin: