diff --git a/olive/passes/quark_quantizer/torch/quark_torch_quantization.py b/olive/passes/quark_quantizer/torch/quark_torch_quantization.py
index 63e5e0a4a..6bb105b41 100644
--- a/olive/passes/quark_quantizer/torch/quark_torch_quantization.py
+++ b/olive/passes/quark_quantizer/torch/quark_torch_quantization.py
@@ -63,12 +63,13 @@ def run_quark_torch_quantization(
 
     # 1. Load model
     logger.info("[INFO] Loading model from: %s", model.model_path)
+    is_gpu = torch.cuda.is_available()
     torch_model, _ = get_model(
         str(model.model_path),
         config.data_type,
         device,
-        multi_gpu=True,
-        multi_device=True,
+        multi_gpu=is_gpu,
+        multi_device=is_gpu,
         attn_implementation="eager",
         trust_remote_code=config.trust_remote_code,
     )
@@ -136,7 +137,7 @@ def run_quark_torch_quantization(
 
     # 4. Quantize model
     logger.info("[INFO] Starting model quantization")
-    quantizer = ModelQuantizer(quant_config, multi_device=True)
+    quantizer = ModelQuantizer(quant_config, multi_device=is_gpu)
     torch_model = quantizer.quantize_model(torch_model, calib_dataloader)
 
     # 5. Freeze model