From 79013a4dc51a0e35cb6721ca62f6640f5782a5bf Mon Sep 17 00:00:00 2001 From: Qubitium Date: Tue, 14 Oct 2025 07:33:29 +0000 Subject: [PATCH 01/34] fully deprecate autogptq --- docs/source/ar/llm_tutorial.md | 2 +- docs/source/ar/llm_tutorial_optimization.md | 2 +- docs/source/en/llm_optims.md | 2 +- docs/source/en/llm_tutorial_optimization.md | 2 +- docs/source/en/quantization/gptq.md | 34 ++--- docs/source/en/quantization/overview.md | 3 +- docs/source/ja/main_classes/quantization.md | 10 +- docs/source/ko/llm_optims.md | 2 +- docs/source/ko/llm_tutorial_optimization.md | 2 +- docs/source/ko/model_doc/llama2.md | 2 +- docs/source/ko/quantization/gptq.md | 8 +- docs/source/zh/llm_tutorial.md | 2 +- docs/source/zh/main_classes/quantization.md | 10 +- src/transformers/quantizers/quantizer_gptq.py | 27 +--- src/transformers/testing_utils.py | 7 +- src/transformers/utils/__init__.py | 1 - src/transformers/utils/import_utils.py | 5 - src/transformers/utils/quantization_config.py | 34 ++--- tests/quantization/gptq/test_gptq.py | 132 ++++++++---------- 19 files changed, 105 insertions(+), 182 deletions(-) diff --git a/docs/source/ar/llm_tutorial.md b/docs/source/ar/llm_tutorial.md index cf905db9c949..6d6cbfdf9020 100644 --- a/docs/source/ar/llm_tutorial.md +++ b/docs/source/ar/llm_tutorial.md @@ -238,7 +238,7 @@ LLMs هي [معماريات فك التشفير فقط](https://huggingface.co/l ### زمن الاستجابة والإنتاجية واستهلاك الذاكرة 1. دليل تحسين نماذج اللغات الكبيرة من حيث السرعة والذاكرة: دليل تحسين نماذج اللغات الكبيرة. -2. التكميم (Quantization): دليل حول تقنية التكميم التكميم مثل تقنيتي bitsandbytes و autogptq، والتي توضح كيفية تقليل متطلبات الذاكرة بشكل كبير. +2. التكميم (Quantization): دليل حول تقنية التكميم التكميم مثل تقنيتي bitsandbytes و GPT-QModel، والتي توضح كيفية تقليل متطلبات الذاكرة بشكل كبير. ### مكتبات مرتبطة 1. [`optimum`](https://github.com/huggingface/optimum), امتداد لمكتبة Transformers يعمل على تحسين الأداء لأجهزة معينة. diff --git a/docs/source/ar/llm_tutorial_optimization.md b/docs/source/ar/llm_tutorial_optimization.md index fca34aab0ddc..bd0bdfc7fae6 100644 --- a/docs/source/ar/llm_tutorial_optimization.md +++ b/docs/source/ar/llm_tutorial_optimization.md @@ -273,7 +273,7 @@ flush() يسمح تكميم 4 بت بتشغيل النموذج على وحدات معالجة الرسومات مثل RTX3090 و V100 و T4 والتي يمكن الوصول إليها بسهولة لمعظم الأشخاص. -لمزيد من المعلومات حول التكميم ولمعرفة كيف يمكن تكميم النماذج لطلب ذاكرة GPU VRAM أقل حتى من 4 بت، نوصي بالاطلاع على تنفيذ [`AutoGPTQ`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#autogptq-integration%60). +لمزيد من المعلومات حول التكميم ولمعرفة كيف يمكن تكميم النماذج لطلب ذاكرة GPU VRAM أقل حتى من 4 بت، نوصي بالاطلاع على تنفيذ [`GPT-QModel`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#gptqmodel). > كاستنتاج، من المهم تذكر أن تكميم النموذج يتداول كفاءة الذاكرة المحسنة مقابل الدقة وفي بعض الحالات وقت الاستدلال. diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 92961d2de5ef..b0376960f9d0 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -360,7 +360,7 @@ Quantization reduces the size of model weights by storing them in a lower precis If you aren't limited by your GPU, you don't necessarily need to quantize your model because it can increase latency slightly (except for AWQ and fused AWQ modules) due to the extra step required to quantize and dequantize the weights. 
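To make the memory point above concrete, here is a minimal sketch of loading an already-quantized GPTQ checkpoint for inference; the repository id is a placeholder in the same style as the `{your_username}/opt-125m-gptq` examples used elsewhere in this patch, not a real model.

```py
# Minimal sketch (placeholder repo id): load a pre-quantized GPTQ checkpoint.
# The int4 weights are dequantized on the fly, so the model loads with roughly a
# quarter of the fp16 memory footprint.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "{your_username}/opt-125m-gptq"  # placeholder, replace with a real GPTQ checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

inputs = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```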
> [!TIP] -> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, VPTQ, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes. +> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, VPTQ, AWQ, and GPT-QModel. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post for a comparison of different approaches. Use the Model Memory Calculator below to estimate and compare how much memory is required to load a model. For example, try estimating the memory required to load [Mistral-7B-v0.1](https://hf.co/mistralai/Mistral-7B-v0.1). diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md index 6eb5cc747b6e..f0e5db09a7d1 100644 --- a/docs/source/en/llm_tutorial_optimization.md +++ b/docs/source/en/llm_tutorial_optimization.md @@ -286,7 +286,7 @@ Overall, we saw that running OctoCoder in 8-bit precision reduced the required G 4-bit quantization allows the model to be run on GPUs such as RTX3090, V100, and T4 which are quite accessible for most people. -For more information on quantization and to see how one can quantize models to require even less GPU VRAM memory than 4-bit, we recommend looking into the [`AutoGPTQ`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#autogptq-integration%60) implementation. +For more information on quantization and to see how one can quantize models to require even less GPU VRAM memory than 4-bit, we recommend looking into the [`GPT-QModel`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#gptqmodel) implementation. > As a conclusion, it is important to remember that model quantization trades improved memory efficiency against accuracy and in some cases inference time. diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index a9878bbc362e..812d13396c7b 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -16,10 +16,9 @@ rendered properly in your Markdown viewer. # GPTQ -The [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save memory usage by 4x because the int4 weights are dequantized in a fused kernel rather than a GPU's global memory. Inference is also faster because a lower bitwidth takes less time to communicate. +The [GPT-QModel](https://github.com/ModelCloud/GPTQModel) project (Python package `gptqmodel`) implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. 
These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save memory usage by 4x because the int4 weights are dequantized in a fused kernel rather than a GPU's global memory. Inference is also faster because a lower bitwidth takes less time to communicate.

-> [!WARNING]
-> AutoGPTQ is likely to be deprecated in the future due to lack of continued support for new models and features. See the [GPTQModel](#gptqmodel) section for more details.
+AutoGPTQ is no longer supported in Transformers. Install GPT-QModel instead.

Install Accelerate, Transformers and Optimum first.

@@ -27,25 +26,12 @@ Install Accelerate, Transformers and Optimum first.
pip install --upgrade accelerate optimum transformers
```

-Then run the command below to install a GPTQ library.
-
-
-
+Then run the command below to install GPT-QModel.

```bash
pip install gptqmodel --no-build-isolation
```

-
-
-
-```bash
-pip install auto-gptq --no-build-isolation
-```
-
-
-
-
Create a [`GPTQConfig`] class and set the number of bits to quantize to, a dataset to calibrate the weights for quantization, and a tokenizer to prepare the dataset.

```py
@@ -58,7 +44,7 @@ gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)

You can pass your own dataset as a list of strings, but it is highly recommended to use the same dataset from the GPTQ paper.

```py
-dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on the GPTQ algorithm."]
gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
```

@@ -142,7 +128,7 @@ model = AutoModelForCausalLM.from_pretrained(
)
```

-The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ 0.4.2+, disable the ExLlama kernel in [`GPTQConfig`]. This overwrites the attributes related to the ExLlama kernels in the quantization config of the `config.json` file.
+The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU, disable the ExLlama kernel in [`GPTQConfig`]. This overwrites the attributes related to the ExLlama kernels in the quantization config of the `config.json` file.

```py
import torch
@@ -156,16 +142,16 @@ model = AutoModelForCausalLM.from_pretrained(
)
```

-## GPTQModel
+## GPT-QModel

-It is recommended to use GPTQModel, originally a maintained fork of AutoGPTQ, because it has since diverged from AutoGTPQ with some significant features. GPTQModel has faster quantization, lower memory usage, and more accurate default quantization.
+GPT-QModel is the actively maintained backend for GPTQ in Transformers. It was originally forked from AutoGPTQ, but has since diverged with significant improvements such as faster quantization, lower memory usage, and more accurate defaults.

-GPTQModel provides asymmetric quantization which can potentially lower quantization errors compared to symmetric quantization. It is not backward compatible with AutoGPTQ, and not all kernels (Marlin) support asymmetric quantization.
+GPT-QModel provides asymmetric quantization which can potentially lower quantization errors compared to symmetric quantization. It is not backward compatible with legacy AutoGPTQ checkpoints, and not all kernels (Marlin) support asymmetric quantization.
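As an illustration of the asymmetric option mentioned above, the following is a minimal sketch that requests `sym=False` through [`GPTQConfig`]; the base model id is an assumption chosen only because the surrounding examples quantize OPT-125m, and the kernel must be one that supports asymmetric packing (Marlin does not).

```py
# Minimal sketch (assumptions: facebook/opt-125m as the base model, a kernel that
# supports asymmetric packing). Asymmetric quantization can lower quantization error
# but produces checkpoints that legacy AutoGPTQ cannot load.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

gptq_config = GPTQConfig(bits=4, sym=False, dataset="c4", tokenizer=tokenizer)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=gptq_config,
)
model.save_pretrained("opt-125m-gptq-asym")
```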
-GPTQModel also has broader support for the latest LLM models, multimodal models (Qwen2-VL and Ovis1.6-VL), platforms (Linux, macOS, Windows 11), and hardware (AMD ROCm, Apple Silicon, Intel/AMD CPUs, and Intel Datacenter Max/Arc GPUs, etc.).
+GPT-QModel also has broader support for the latest LLM models, multimodal models (Qwen2-VL and Ovis1.6-VL), platforms (Linux, macOS, Windows 11), and hardware (AMD ROCm, Apple Silicon, Intel/AMD CPUs, and Intel Datacenter Max/Arc GPUs, etc.).

The Marlin kernels are also updated for A100 GPUs and other kernels are updated to include auto-padding for legacy models and models with non-uniform in/out-features.

## Resources

-Run the GPTQ quantization with PEFT [notebook](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb?usp=sharing) for a hands-on experience, and read [Making LLMs lighter with AutoGPTQ and transformers](https://huggingface.co/blog/gptq-integration) to learn more about the AutoGPTQ integration.
+Run the GPTQ quantization with PEFT [notebook](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb?usp=sharing) for a hands-on experience.
diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md
index 0a8dee1e33ae..1f1c03d7393b 100644
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@@ -32,8 +32,7 @@ Use the Space below to help you pick a quantization method depending on your har
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| [FP-Quant](./fp_quant) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 4 | 🔴 | 🟢 | 🟢 | https://github.com/IST-DASLab/FP-Quant |
| [GGUF / GGML (llama.cpp)](../gguf) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf) | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp |
-| [GPTQModel](./gptq) | 🔴 | 🟢 | 🟢 | 🟢 | 🟢 | 🟢 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel |
-| [AutoGPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
+| [GPT-QModel](./gptq) | 🔴 | 🟢 | 🟢 | 🟢 | 🟢 | 🟢 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel |
| [HIGGS](./higgs) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
diff --git a/docs/source/ja/main_classes/quantization.md b/docs/source/ja/main_classes/quantization.md
index 2ef8c6ca683a..d7f2776d5e52 100644
--- a/docs/source/ja/main_classes/quantization.md
+++ b/docs/source/ja/main_classes/quantization.md
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

# Quantize 🤗 Transformers models

-## `AutoGPTQ` Integration
+## GPT-QModel Integration

🤗 Transformers には、言語モデルで GPTQ 量子化を実行するための `optimum` API が統合されています。パフォーマンスを大幅に低下させることなく、推論速度を高速化することなく、モデルを 8、4、3、さらには 2 ビットでロードおよび量子化できます。これは、ほとんどの GPU ハードウェアでサポートされています。
量子化モデルの詳細については、以下を確認してください。 - [GPTQ](https://huggingface.co/papers/2210.17323) 論文 - GPTQ 量子化に関する `optimum` [ガイド](https://huggingface.co/docs/optimum/llm_quantization/usage_guides/quantization) -- バックエンドとして使用される [`AutoGPTQ`](https://github.com/PanQiWei/AutoGPTQ) ライブラリ +- バックエンドとして使用される `GPT-QModel` (https://github.com/ModelCloud/GPTQModel) ライブラリ ### Requirements 以下のコードを実行するには、以下の要件がインストールされている必要があります: -- 最新の `AutoGPTQ` ライブラリをインストールする。 -`pip install auto-gptq` をインストールする。 +- 最新の `GPT-QModel` ライブラリをインストールする。 +`pip install gptqmodel --no-build-isolation` を実行する。 - 最新の `optimum` をソースからインストールする。 `git+https://github.com/huggingface/optimum.git` をインストールする。 @@ -63,7 +63,7 @@ gptq_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer) 独自のデータセットを文字列のリストとして渡すことができることに注意してください。ただし、GPTQ 論文のデータセットを使用することを強くお勧めします。 ```python -dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."] +dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on the GPTQ algorithm."] quantization = GPTQConfig(bits=4, dataset = dataset, tokenizer=tokenizer) ``` diff --git a/docs/source/ko/llm_optims.md b/docs/source/ko/llm_optims.md index b264e5f710f6..b2031bf3776b 100644 --- a/docs/source/ko/llm_optims.md +++ b/docs/source/ko/llm_optims.md @@ -372,7 +372,7 @@ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable 양자화는 LLM 가중치를 더 낮은 정밀도로 저장하여 크기를 줄입니다. 이는 메모리 사용량을 줄이며 GPU 메모리에 제약이 있는 경우 추론을 위해 LLM을 로드하는 것을 더 용이하게 합니다. GPU가 충분하다면, 모델을 양자화할 필요는 없습니다. 추가적인 양자화 및 양자화 해제 단계로 인해 약간의 지연이 발생할 수 있기 때문입니다(AWQ 및 융합 AWQ 모듈 제외). > [!TIP] -> 다양한 양자화 라이브러리(자세한 내용은 [Quantization](./quantization) 가이드를 참조하십시오)가 있습니다. 여기에는 Quanto, AQLM, VPTQ, AWQ 및 AutoGPTQ가 포함됩니다. 사용 사례에 가장 잘 맞는 라이브러리를 사용해 보십시오. 또한 AutoGPTQ와 bitsandbytes를 비교하는 [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) 블로그 게시물을 읽어보는 것을 추천합니다. +> 다양한 양자화 라이브러리(자세한 내용은 [Quantization](./quantization) 가이드를 참조하십시오)가 있습니다. 여기에는 Quanto, AQLM, VPTQ, AWQ 및 GPT-QModel이 포함됩니다. 사용 사례에 가장 잘 맞는 라이브러리를 사용해 보십시오. 또한 gptqmodel과 bitsandbytes를 비교하는 [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) 블로그 게시물을 읽어보는 것을 추천합니다. 아래의 모델 메모리 계산기를 사용하여 모델을 로드하는 데 필요한 메모리를 추정하고 비교해 보십시오. 예를 들어 [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)를 로드하는 데 필요한 메모리를 추정해 보십시오. diff --git a/docs/source/ko/llm_tutorial_optimization.md b/docs/source/ko/llm_tutorial_optimization.md index d4ea10735ca3..0d6033a47477 100644 --- a/docs/source/ko/llm_tutorial_optimization.md +++ b/docs/source/ko/llm_tutorial_optimization.md @@ -269,7 +269,7 @@ flush() 4비트 양자화는 RTX3090, V100, T4와 같은 GPU에서 모델을 실행할 수 있게 해주며, 이는 대부분의 사람들이 접근할 수 있는 GPU입니다. -양자화에 대한 더 많은 정보를 확인하고 4비트보다 더 적은 GPU VRAM 메모리로 모델을 양자화하거나, 더 많은 양자화 관련 정보를 보려면 [`AutoGPTQ`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#autogptq-integration%60) 구현을 참조하는 것을 추천합니다. +양자화에 대한 더 많은 정보를 확인하고 4비트보다 더 적은 GPU VRAM 메모리로 모델을 양자화하거나, 더 많은 양자화 관련 정보를 보려면 [`GPT-QModel`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#gptqmodel) 구현을 참조하는 것을 추천합니다. > 결론적으로, 모델 양자화는 향상된 메모리 효율성과 모델 정확성 간의 균형을 맞추는 것이며, 경우에 따라 추론 시간에도 영향을 미칠 수 있습니다. 
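For readers coming from the memory-optimization guides touched in the hunks above, here is a minimal sketch of quantizing below 4-bit with the `gptqmodel` backend; the 3-bit setting and the OPT-125m base model are illustrative assumptions rather than part of this patch.

```py
# Minimal sketch (assumptions: 3-bit precision, facebook/opt-125m base model).
# GPTQ checkpoints in Transformers support 2/3/4/8-bit weights; going below 4-bit
# trades more accuracy for additional VRAM savings.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

gptq_config = GPTQConfig(bits=3, dataset="c4", tokenizer=tokenizer)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=gptq_config,
)
model.save_pretrained("opt-125m-gptq-3bit")
tokenizer.save_pretrained("opt-125m-gptq-3bit")
```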
diff --git a/docs/source/ko/model_doc/llama2.md b/docs/source/ko/model_doc/llama2.md index 6fd74861be6d..85658e4535a9 100644 --- a/docs/source/ko/model_doc/llama2.md +++ b/docs/source/ko/model_doc/llama2.md @@ -82,7 +82,7 @@ LLaMA2를 시작하는 데 도움이 될 Hugging Face의 공식 및 커뮤니티 - 개인 컴퓨터에서 QLoRA와 TRL을 사용하여 Llama 2 모델을 미세 조정하는 방법에 대한 [노트북](https://colab.research.google.com/drive/1SYpgFpcmtIUzdE7pxqknrM4ArCASfkFQ?usp=sharing)입니다. 🌎 ⚡️ 추론 -- AutoGPTQ 라이브러리의 GPTQ를 사용하여 Llama 2 모델을 양자화하는 방법에 대한 [노트북](https://colab.research.google.com/drive/1TC56ArKerXUpbgRy5vM3woRsbTEVNq7h?usp=sharing)입니다. 🌎 +- GPT-QModel 라이브러리의 GPTQ를 사용하여 Llama 2 모델을 양자화하는 방법에 대한 [노트북](https://colab.research.google.com/drive/1TC56ArKerXUpbgRy5vM3woRsbTEVNq7h?usp=sharing)입니다. 🌎 - 로컬 컴퓨터나 Google Colab에서 4-bit 양자화로 Llama 2 채팅 모델을 실행하는 방법에 대한 [노트북](https://colab.research.google.com/drive/1X1z9Q6domMKl2CnEM0QGHNwidLfR4dW2?usp=sharing)입니다. 🌎 🚀 배포 diff --git a/docs/source/ko/quantization/gptq.md b/docs/source/ko/quantization/gptq.md index c54f09c94a33..ac8c5f62adc4 100644 --- a/docs/source/ko/quantization/gptq.md +++ b/docs/source/ko/quantization/gptq.md @@ -22,12 +22,12 @@ PEFT를 활용한 GPTQ 양자화를 사용해보시려면 이 [노트북](https: -[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) 라이브러리는 GPTQ 알고리즘을 구현합니다. 이는 훈련 후 양자화 기법으로, 가중치 행렬의 각 행을 독립적으로 양자화하여 오차를 최소화하는 가중치 버전을 찾습니다. 이 가중치는 int4로 양자화되지만, 추론 중에는 실시간으로 fp16으로 복원됩니다. 이는 int4 가중치가 GPU의 전역 메모리 대신 결합된 커널에서 역양자화되기 때문에 메모리 사용량을 4배 절약할 수 있으며, 더 낮은 비트 너비를 사용함으로써 통신 시간이 줄어들어 추론 속도가 빨라질 것으로 기대할 수 있습니다. +[GPT-QModel](https://github.com/ModelCloud/GPTQModel) 라이브러리는 GPTQ 알고리즘을 구현합니다. 이는 훈련 후 양자화 기법으로, 가중치 행렬의 각 행을 독립적으로 양자화하여 오차를 최소화하는 가중치 버전을 찾습니다. 이 가중치는 int4로 양자화되지만, 추론 중에는 실시간으로 fp16으로 복원됩니다. 이는 int4 가중치가 GPU의 전역 메모리 대신 결합된 커널에서 역양자화되기 때문에 메모리 사용량을 4배 절약할 수 있으며, 더 낮은 비트 너비를 사용함으로써 통신 시간이 줄어들어 추론 속도가 빨라질 것으로 기대할 수 있습니다. 시작하기 전에 다음 라이브러리들이 설치되어 있는지 확인하세요: ```bash -pip install auto-gptq +pip install gptqmodel --no-build-isolation pip install --upgrade accelerate optimum transformers ``` @@ -44,7 +44,7 @@ gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer) 자신의 데이터셋을 문자열 리스트 형태로 전달할 수도 있지만, GPTQ 논문에서 사용한 동일한 데이터셋을 사용하는 것을 강력히 권장합니다. ```py -dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."] +dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on the GPTQ algorithm."] gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer) ``` @@ -110,7 +110,7 @@ model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", de -ExLlama 커널은 전체 모델이 GPU에 있을 때만 지원됩니다. AutoGPTQ(버전 0.4.2 이상)로 CPU에서 추론을 수행하는 경우 ExLlama 커널을 비활성화해야 합니다. 이를 위해 config.json 파일의 양자화 설정에서 ExLlama 커널과 관련된 속성을 덮어써야 합니다. +ExLlama 커널은 전체 모델이 GPU에 있을 때만 지원됩니다. CPU에서 추론을 수행하는 경우 [`GPTQConfig`]에서 ExLlama 커널을 비활성화해야 합니다. 이를 위해 config.json 파일의 양자화 설정에서 ExLlama 커널과 관련된 속성을 덮어써야 합니다. ```py import torch diff --git a/docs/source/zh/llm_tutorial.md b/docs/source/zh/llm_tutorial.md index 19e3a9ce7767..1e1b4207ef70 100644 --- a/docs/source/zh/llm_tutorial.md +++ b/docs/source/zh/llm_tutorial.md @@ -261,7 +261,7 @@ LLMs是[仅解码器](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt) ### 延迟、吞吐量和内存利用率 1. [指南](llm_tutorial_optimization),如何优化LLMs以提高速度和内存利用; -2. [指南](main_classes/quantization), 关于`quantization`,如bitsandbytes和autogptq的指南,教您如何大幅降低内存需求。 +2. 
[指南](main_classes/quantization), 关于`quantization`,如bitsandbytes和GPT-QModel的指南,教您如何大幅降低内存需求。

### 相关库

diff --git a/docs/source/zh/main_classes/quantization.md b/docs/source/zh/main_classes/quantization.md
index 262558654341..e0122e3a9bdd 100644
--- a/docs/source/zh/main_classes/quantization.md
+++ b/docs/source/zh/main_classes/quantization.md
@@ -113,22 +113,22 @@ model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", att

[[autodoc]] AwqConfig

-## `AutoGPTQ` 集成
+## GPT-QModel 集成

🤗 Transformers已经整合了`optimum` API,用于对语言模型执行GPTQ量化。您可以以8、4、3甚至2位加载和量化您的模型,而性能无明显下降,并且推理速度更快!这受到大多数GPU硬件的支持。

要了解更多关于量化模型的信息,请查看:
- [GPTQ](https://huggingface.co/papers/2210.17323)论文
- `optimum`关于GPTQ量化的[指南](https://huggingface.co/docs/optimum/llm_quantization/usage_guides/quantization)
-- 用作后端的[`AutoGPTQ`](https://github.com/PanQiWei/AutoGPTQ)库
+- 用作后端的`GPT-QModel` (https://github.com/ModelCloud/GPTQModel)库

### 要求

为了运行下面的代码,您需要安装:

-- 安装最新版本的 `AutoGPTQ` 库
-`pip install auto-gptq`
+- 安装最新版本的 `GPT-QModel` 库
+`pip install gptqmodel --no-build-isolation`

- 从源代码安装最新版本的`optimum`
`pip install git+https://github.com/huggingface/optimum.git`

@@ -162,7 +162,7 @@ gptq_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer)


```python
-dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on the GPTQ algorithm."]
quantization = GPTQConfig(bits=4, dataset = dataset, tokenizer=tokenizer)
```

diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py
index f12ad4ca7e94..a11f2ed2f7eb 100644
--- a/src/transformers/quantizers/quantizer_gptq.py
+++ b/src/transformers/quantizers/quantizer_gptq.py
@@ -22,7 +22,7 @@
 if TYPE_CHECKING:
     from ..modeling_utils import PreTrainedModel

-from ..utils import is_auto_gptq_available, is_gptqmodel_available, is_optimum_available, is_torch_available, logging
+from ..utils import is_gptqmodel_available, is_optimum_available, is_torch_available, logging
 from ..utils.quantization_config import GPTQConfig, QuantizationConfigMixin


@@ -35,11 +35,12 @@ class GptqHfQuantizer(HfQuantizer):
     """
     Quantizer of the GPTQ method - for GPTQ the quantizer support calibration of the model through
-    `auto_gptq` or `gptqmodel` package. Quantization is done under the hood for users if they load a non-prequantized model.
+    the GPT-QModel package (Python import name `gptqmodel`). Quantization is done under the hood for users if they
+    load a non-prequantized model.
""" requires_calibration = False - required_packages = ["optimum", "auto_gptq", "gptqmodel"] + required_packages = ["optimum", "gptqmodel"] optimum_quantizer = None def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): @@ -54,24 +55,13 @@ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): def validate_environment(self, *args, **kwargs): if not is_optimum_available(): raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)") - if is_auto_gptq_available() and is_gptqmodel_available(): - logger.warning("Detected gptqmodel and auto-gptq, will use gptqmodel") - gptq_supports_cpu = ( - is_auto_gptq_available() - and version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2") - ) or is_gptqmodel_available() + gptq_supports_cpu = is_gptqmodel_available() if not gptq_supports_cpu and not torch.cuda.is_available(): raise RuntimeError("GPU is required to quantize or run quantize model.") - elif not (is_auto_gptq_available() or is_gptqmodel_available()): + elif not is_gptqmodel_available(): raise ImportError( - "Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) or auto-gptq (`pip install auto-gptq`) library. " - ) - elif is_auto_gptq_available() and version.parse(importlib.metadata.version("auto_gptq")) < version.parse( - "0.4.2" - ): - raise ImportError( - "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel>=1.4.3`." + "Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) library." ) elif is_gptqmodel_available() and ( version.parse(importlib.metadata.version("gptqmodel")) < version.parse("1.4.3") @@ -90,9 +80,6 @@ def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype": def update_device_map(self, device_map): if device_map is None: device_map = {"": torch.device("cpu")} - # Only with auto-gptq do not support CPU, we should move the model to cuda if available. 
- if not is_gptqmodel_available() and device_map in ("cpu", {"": torch.device("cpu")}): - device_map = {"": 0} return device_map def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs): diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 89e5a9700739..6bb914124561 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -72,7 +72,6 @@ is_apollo_torch_available, is_aqlm_available, is_auto_awq_available, - is_auto_gptq_available, is_auto_round_available, is_av_available, is_bitsandbytes_available, @@ -1288,11 +1287,9 @@ def require_tensorboard(test_case): def require_gptq(test_case): """ - Decorator for auto_gptq dependency + Decorator for gptqmodel dependency """ - return unittest.skipUnless( - is_gptqmodel_available() or is_auto_gptq_available(), "test requires gptqmodel or auto-gptq" - )(test_case) + return unittest.skipUnless(is_gptqmodel_available(), "test requires gptqmodel")(test_case) def require_hqq(test_case): diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 82a9e3a85bd1..2510bdb71840 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -122,7 +122,6 @@ is_apollo_torch_available, is_aqlm_available, is_auto_awq_available, - is_auto_gptq_available, is_auto_round_available, is_av_available, is_bitsandbytes_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index a956efc97fdb..a8cee6a69863 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -983,11 +983,6 @@ def is_compressed_tensors_available() -> bool: return _is_package_available("compressed_tensors") -@lru_cache -def is_auto_gptq_available() -> bool: - return _is_package_available("auto_gptq") - - @lru_cache def is_gptqmodel_available() -> bool: return _is_package_available("gptqmodel") diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index f1bb9da8c202..a12731d752b8 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -37,9 +37,6 @@ is_torchao_available, logging, ) -from .import_utils import is_auto_gptq_available - - if is_torch_available(): import torch @@ -632,7 +629,7 @@ class ExllamaVersion(int, Enum): class GPTQConfig(QuantizationConfigMixin): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been - loaded using `optimum` api for gptq quantization relying on auto_gptq backend. + loaded using `optimum` api for GPTQ quantization relying on the gptqmodel backend. Args: bits (`int`): @@ -660,15 +657,15 @@ class GPTQConfig(QuantizationConfigMixin): the entire block at once, we perform layer-wise quantization. As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers. checkpoint_format (`str`, *optional*, defaults to `"gptq"`): - GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only. + GPTQ weight format. `gptq` (v1) is supported by gptqmodel. `gptq_v2` is gptqmodel only. meta (`dict[str, any]`, *optional*): Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta. i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"] backend (`str`, *optional*): - Controls which gptq kernel to be used. 
Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only - valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py + Controls which kernel to use. Valid values for gptqmodel are `auto`, `auto_trainable` and more. Ref gptqmodel backends: + https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py use_cuda_fp16 (`bool`, *optional*, defaults to `False`): - Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16. Auto-gptq only. + Whether or not to use optimized CUDA kernels for fp16 models. Need to have model in fp16. model_seqlen (`int`, *optional*): The maximum sequence length that the model can take. block_name_to_quantize (`str`, *optional*): @@ -789,17 +786,10 @@ def post_init(self): ['wikitext2','c4','c4-new'], but we found {self.dataset}""" ) - # make sure backend is back/forward compatible with both gptqmodel (full) and auto-gptq (partial) - if is_gptqmodel_available(): - # convert auto-gptq control into gptqmodel backend - if self.backend is None: - self.backend = "auto_trainable" if self.use_exllama is not None and not self.use_exllama else "auto" - else: - # convert gptqmodel backend `auto_trainable` into auto-gptq control - if self.backend == "auto_trainable": - self.use_exllama = False + # make sure backend default stays consistent with gptqmodel expectations + if is_gptqmodel_available() and self.backend is None: + self.backend = "auto_trainable" if self.use_exllama is not None and not self.use_exllama else "auto" - # auto-gptq specific kernel control logic if self.use_exllama is None: # New default behaviour self.use_exllama = True @@ -821,14 +811,6 @@ def post_init(self): "You have activated exllama backend. Note that you can get better inference " "speed using exllamav2 kernel by setting `exllama_config`." ) - elif self.exllama_config["version"] == ExllamaVersion.TWO: - if is_auto_gptq_available(): - optimum_version = version.parse(importlib.metadata.version("optimum")) - autogptq_version = version.parse(importlib.metadata.version("auto_gptq")) - if optimum_version <= version.parse("1.13.2") or autogptq_version <= version.parse("0.4.2"): - raise ValueError( - f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}" - ) if self.modules_in_block_to_quantize is not None: optimum_version = version.parse(importlib.metadata.version("optimum")) if optimum_version < version.parse("1.15.0"): diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 50f0f696d57e..41160c376d88 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -27,7 +27,7 @@ require_torch_multi_gpu, slow, ) -from transformers.utils import is_auto_gptq_available, is_gptqmodel_available, is_ipex_available +from transformers.utils import is_gptqmodel_available, is_ipex_available if is_torch_available(): @@ -83,7 +83,7 @@ class GPTQTest(unittest.TestCase): input_text = "Hello my name is" EXPECTED_OUTPUTS = set() - # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions + # flaky test: gptqmodel kernels are not always bitwise deterministic even between transformer/torch versions EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. 
I") EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of") @@ -105,10 +105,10 @@ class GPTQTest(unittest.TestCase): use_exllama = False dataset = [ - "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." + "gptqmodel is an easy-to-use model quantization library with user-friendly APIs, based on the GPTQ algorithm." ] - device_map = "cpu" if is_gptqmodel_available() else None + device_map = "cpu" # called only once for all test in this class @classmethod @@ -177,36 +177,27 @@ def test_quantized_layers_class(self): Simple test to check if the model conversion has been done correctly by checking on the class type of the linear layers of the converted models """ - if is_gptqmodel_available(): - from gptqmodel.utils.importer import hf_select_quant_linear + if not is_gptqmodel_available(): + self.skipTest("gptqmodel not available") - if hasattr(self.config, "quantization_config"): - checkpoint_format = self.config.quantization_config.get("checkpoint_format") - meta = self.config.quantization_config.get("meta") - else: - checkpoint_format = "gptq" - meta = None - QuantLinear = hf_select_quant_linear( - bits=self.bits, - group_size=self.group_size, - desc_act=self.desc_act, - sym=self.sym, - device_map=self.device_map, - checkpoint_format=checkpoint_format, - meta=meta, - backend=self.quantization_config.backend, - ) - elif is_auto_gptq_available(): - from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear - - QuantLinear = hf_select_quant_linear( - use_triton=False, - desc_act=self.desc_act, - group_size=self.group_size, - bits=self.bits, - disable_exllama=not self.use_exllama, - disable_exllamav2=True, - ) + from gptqmodel.utils.importer import hf_select_quant_linear + + if hasattr(self.config, "quantization_config"): + checkpoint_format = self.config.quantization_config.get("checkpoint_format") + meta = self.config.quantization_config.get("meta") + else: + checkpoint_format = "gptq" + meta = None + QuantLinear = hf_select_quant_linear( + bits=self.bits, + group_size=self.group_size, + desc_act=self.desc_act, + sym=self.sym, + device_map=self.device_map, + checkpoint_format=checkpoint_format, + meta=meta, + backend=self.quantization_config.backend, + ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) def check_inference_correctness(self, model): @@ -244,28 +235,17 @@ def test_serialization(self): """ with tempfile.TemporaryDirectory() as tmpdirname: self.quantized_model.save_pretrained(tmpdirname) - if is_auto_gptq_available() and not is_gptqmodel_available(): - quant_type = "cuda-old" if not self.use_exllama else "exllama" - if not self.use_exllama: - quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( - tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4) - ) - if self.device_map != "cpu": - quantized_model_from_saved = quantized_model_from_saved.to(0) - else: - quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( - tmpdirname, device_map=self.device_map - ) + if not is_gptqmodel_available(): + self.skipTest("gptqmodel not available") + if self.device_map == "cpu": + quant_type = "ipex" if is_ipex_available() else "torch" else: - if self.device_map == "cpu": - quant_type = "ipex" if is_ipex_available() else "torch" - else: - # We expect tritonv2 to be used here, because exllama backend doesn't 
support packing https://github.com/ModelCloud/GPTQModel/issues/1354 - # TODO: Remove this once GPTQModel exllama kernels supports packing - quant_type = "tritonv2" - quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( - tmpdirname, device_map=self.device_map - ) + # We expect tritonv2 to be used here, because exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354 + # TODO: Remove this once GPTQModel exllama kernels supports packing + quant_type = "tritonv2" + quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( + tmpdirname, device_map=self.device_map + ) self.check_quantized_layers_type(quantized_model_from_saved, quant_type) self.check_inference_correctness(quantized_model_from_saved) @@ -292,15 +272,17 @@ def test_change_loading_attributes(self): """ with tempfile.TemporaryDirectory() as tmpdirname: self.quantized_model.save_pretrained(tmpdirname) - if is_auto_gptq_available() and not is_gptqmodel_available() and not self.use_exllama: - self.check_quantized_layers_type(self.quantized_model, "cuda-old") - # we need to put it directly to the gpu. Otherwise, we won't be able to initialize the exllama kernel - quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( - tmpdirname, quantization_config=GPTQConfig(use_exllama=True, bits=4), device_map=self.device_map - ) - self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits) - self.check_quantized_layers_type(quantized_model_from_saved, "exllama") - self.check_inference_correctness(quantized_model_from_saved) + if not is_gptqmodel_available(): + self.skipTest("gptqmodel not available") + quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( + tmpdirname, + quantization_config=GPTQConfig(use_exllama=self.use_exllama, bits=self.bits), + device_map=self.device_map, + ) + self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits) + quant_type = "tritonv2" if self.device_map != "cpu" else ("ipex" if is_ipex_available() else "torch") + self.check_quantized_layers_type(quantized_model_from_saved, quant_type) + self.check_inference_correctness(quantized_model_from_saved) @require_accelerate @@ -329,7 +311,7 @@ class GPTQTestActOrderExllama(unittest.TestCase): """ EXPECTED_OUTPUTS = set() - # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions + # flaky test: gptqmodel kernels are not always bitwise deterministic even between transformer/torch versions EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.") # 4bit + act_order + 128g model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ" @@ -405,7 +387,7 @@ class GPTQTestExllamaV2(unittest.TestCase): """ EXPECTED_OUTPUTS = set() - # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions + # flaky test: gptqmodel kernels are not always bitwise deterministic even between transformer/torch versions EXPECTED_OUTPUTS.add("Hello, how are you ? 
I'm doing good, thanks for asking.") # 4bit + act_order + 128g model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ" @@ -426,18 +408,14 @@ def setUpClass(cls): cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True) def test_quantized_layers_type(self): - if is_auto_gptq_available() and not is_gptqmodel_available(): - self.assertEqual( - self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, - "exllamav2", - ) - else: - # We expect tritonv2 to be used here, because exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354 - # TODO: Remove this once GPTQModel exllama kernels supports packing - self.assertEqual( - self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, - "tritonv2", - ) + if not is_gptqmodel_available(): + self.skipTest("gptqmodel not available") + # We expect tritonv2 to be used here, because gptqmodel exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354 + # TODO: Remove this once GPTQModel exllama kernels supports packing + self.assertEqual( + self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, + "tritonv2", + ) def check_inference_correctness(self, model): """ From 0400ee5eff583cb3ba81dc65c338c538ff598b10 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Tue, 14 Oct 2025 07:44:29 +0000 Subject: [PATCH 02/34] remove use_cuda and use_exllama toggles are fully deprecated in gptqmodel --- docs/source/en/quantization/gptq.md | 35 ----------- docs/source/ko/quantization/gptq.md | 27 --------- src/transformers/utils/quantization_config.py | 59 ++----------------- tests/quantization/gptq/test_gptq.py | 5 +- 4 files changed, 5 insertions(+), 121 deletions(-) diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index 812d13396c7b..51ecfd825b12 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -107,41 +107,6 @@ from transformers import AutoModelForCausalLM, GPTQConfig model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=GPTQConfig(bits=4, backend="marlin")) ``` -## ExLlama - -> [!WARNING] -> Only 4-bit models are supported, and we recommend deactivating the ExLlama kernels if you're finetuning a quantized model with PEFT. - -[ExLlama](https://github.com/turboderp/exllama) is a Python/C++/CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. - -To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter in [`GPTQConfig`]. - -```py -import torch -from transformers import AutoModelForCausalLM, GPTQConfig - -gptq_config = GPTQConfig(bits=4, exllama_config={"version":2}) -model = AutoModelForCausalLM.from_pretrained( - "{your_username}/opt-125m-gptq", - device_map="auto", - quantization_config=gptq_config -) -``` - -The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU, disable the ExLlama kernel in [`GPTQConfig`]. This overwrites the attributes related to the ExLlama kernels in the quantization config of the `config.json` file. 
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, GPTQConfig
-
-gptq_config = GPTQConfig(bits=4, use_exllama=False)
-model = AutoModelForCausalLM.from_pretrained(
-    "{your_username}/opt-125m-gptq",
-    device_map="cpu",
-    quantization_config=gptq_config
-)
-```
-
 ## GPT-QModel

 GPT-QModel is the actively maintained backend for GPTQ in Transformers. It was originally forked from AutoGPTQ, but has since diverged with significant improvements such as faster quantization, lower memory usage, and more accurate defaults.
diff --git a/docs/source/ko/quantization/gptq.md b/docs/source/ko/quantization/gptq.md
index ac8c5f62adc4..8c18b6cf2215 100644
--- a/docs/source/ko/quantization/gptq.md
+++ b/docs/source/ko/quantization/gptq.md
@@ -91,30 +91,3 @@ from transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto")
 ```
-
-## ExLlama [[exllama]]
-
-[ExLlama](https://github.com/turboderp/exllama)은 [Llama](model_doc/llama) 모델의 Python/C++/CUDA 구현체로, 4비트 GPTQ 가중치를 사용하여 더 빠른 추론을 위해 설계되었습니다(이 [벤치마크](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)를 참고하세요). ['GPTQConfig'] 객체를 생성할 때 ExLlama 커널이 기본적으로 활성화됩니다. 추론 속도를 더욱 높이기 위해, `exllama_config` 매개변수를 구성하여 [ExLlamaV2](https://github.com/turboderp/exllamav2) 커널을 사용할 수 있습니다:
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, GPTQConfig
-
-gptq_config = GPTQConfig(bits=4, exllama_config={"version":2})
-model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=gptq_config)
-```
-
-
-
-4비트 모델만 지원되며, 양자화된 모델을 PEFT로 미세 조정하는 경우 ExLlama 커널을 비활성화할 것을 권장합니다.
-
-
-
-ExLlama 커널은 전체 모델이 GPU에 있을 때만 지원됩니다. CPU에서 추론을 수행하는 경우 [`GPTQConfig`]에서 ExLlama 커널을 비활성화해야 합니다. 이를 위해 config.json 파일의 양자화 설정에서 ExLlama 커널과 관련된 속성을 덮어써야 합니다.
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, GPTQConfig
-gptq_config = GPTQConfig(bits=4, use_exllama=False)
-model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="cpu", quantization_config=gptq_config)
-```
\ No newline at end of file
diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index a12731d752b8..de03d99b89e4 100644
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -664,8 +664,6 @@ class GPTQConfig(QuantizationConfigMixin):
         backend (`str`, *optional*):
             Controls which kernel to use. Valid values for gptqmodel are `auto`, `auto_trainable` and more. Ref gptqmodel backends:
             https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
-        use_cuda_fp16 (`bool`, *optional*, defaults to `False`):
-            Whether or not to use optimized CUDA kernels for fp16 models. Need to have model in fp16.
         model_seqlen (`int`, *optional*):
             The maximum sequence length that the model can take.
         block_name_to_quantize (`str`, *optional*):
@@ -676,14 +674,9 @@ class GPTQConfig(QuantizationConfigMixin):
             The batch size used when processing the dataset
         pad_token_id (`int`, *optional*):
             The pad token id. Needed to prepare the dataset when `batch_size` > 1.
-        use_exllama (`bool`, *optional*):
-            Whether to use exllama backend. Defaults to `True` if unset. Only works with `bits` = 4.
         max_input_length (`int`, *optional*):
             The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input
             length. It is specific to the exllama backend with act-order.
- exllama_config (`dict[str, Any]`, *optional*): - The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults - to `{"version": 1}` if unset. cache_block_outputs (`bool`, *optional*, defaults to `True`): Whether to cache block outputs to reuse as inputs for the succeeding block. modules_in_block_to_quantize (`list[list[str]]`, *optional*): @@ -708,15 +701,12 @@ def __init__( checkpoint_format: str = "gptq", meta: Optional[dict[str, Any]] = None, backend: Optional[str] = None, - use_cuda_fp16: bool = False, model_seqlen: Optional[int] = None, block_name_to_quantize: Optional[str] = None, module_name_preceding_first_block: Optional[list[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - use_exllama: Optional[bool] = None, max_input_length: Optional[int] = None, - exllama_config: Optional[dict[str, Any]] = None, cache_block_outputs: bool = True, modules_in_block_to_quantize: Optional[list[list[str]]] = None, **kwargs, @@ -733,28 +723,19 @@ def __init__( self.checkpoint_format = checkpoint_format.lower() self.meta = meta self.backend = backend.lower() if isinstance(backend, str) else backend - self.use_cuda_fp16 = use_cuda_fp16 self.model_seqlen = model_seqlen self.block_name_to_quantize = block_name_to_quantize self.module_name_preceding_first_block = module_name_preceding_first_block self.batch_size = batch_size self.pad_token_id = pad_token_id - self.use_exllama = use_exllama self.max_input_length = max_input_length - self.exllama_config = exllama_config self.cache_block_outputs = cache_block_outputs self.modules_in_block_to_quantize = modules_in_block_to_quantize self.post_init() def get_loading_attributes(self): attributes_dict = copy.deepcopy(self.__dict__) - loading_attributes = [ - "use_exllama", - "exllama_config", - "use_cuda_fp16", - "max_input_length", - "backend", - ] + loading_attributes = ["max_input_length", "backend"] loading_attributes_dict = {i: j for i, j in attributes_dict.items() if i in loading_attributes} return loading_attributes_dict @@ -788,29 +769,7 @@ def post_init(self): # make sure backend default stays consistent with gptqmodel expectations if is_gptqmodel_available() and self.backend is None: - self.backend = "auto_trainable" if self.use_exllama is not None and not self.use_exllama else "auto" - - if self.use_exllama is None: - # New default behaviour - self.use_exllama = True - - if self.exllama_config is None: - self.exllama_config = {"version": ExllamaVersion.ONE} - else: - if "version" not in self.exllama_config: - raise ValueError("`exllama_config` needs to have a `version` key.") - elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]: - exllama_version = self.exllama_config["version"] - raise ValueError( - f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {exllama_version}" - ) - - if self.bits == 4 and self.use_exllama: - if self.exllama_config["version"] == ExllamaVersion.ONE: - logger.info( - "You have activated exllama backend. Note that you can get better inference " - "speed using exllamav2 kernel by setting `exllama_config`." 
- ) + self.backend = "auto" if self.modules_in_block_to_quantize is not None: optimum_version = version.parse(importlib.metadata.version("optimum")) if optimum_version < version.parse("1.15.0"): @@ -819,18 +778,13 @@ def post_init(self): ) def to_dict(self) -> dict[str, Any]: - config_dict = super().to_dict() - config_dict.pop("disable_exllama", None) - return config_dict + return super().to_dict() def to_dict_optimum(self): """ Get compatible dict for optimum gptq config """ - quant_dict = self.to_dict() - # make it compatible with optimum config - quant_dict["disable_exllama"] = not self.use_exllama - return quant_dict + return self.to_dict() @classmethod def from_dict_optimum(cls, config_dict): @@ -838,11 +792,6 @@ def from_dict_optimum(cls, config_dict): Get compatible class with optimum gptq config dict """ - if "disable_exllama" in config_dict: - config_dict["use_exllama"] = not config_dict["disable_exllama"] - # switch to None to not trigger the warning - config_dict.pop("disable_exllama") - config = cls(**config_dict) return config diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index 41160c376d88..e20225157a0b 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -102,8 +102,6 @@ class GPTQTest(unittest.TestCase): sym = True group_size = 128 desc_act = False - use_exllama = False - dataset = [ "gptqmodel is an easy-to-use model quantization library with user-friendly APIs, based on the GPTQ algorithm." ] @@ -131,7 +129,6 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, sym=cls.sym, - use_exllama=cls.use_exllama, ) cls.quantized_model = AutoModelForCausalLM.from_pretrained( @@ -276,7 +273,7 @@ def test_change_loading_attributes(self): self.skipTest("gptqmodel not available") quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( tmpdirname, - quantization_config=GPTQConfig(use_exllama=self.use_exllama, bits=self.bits), + quantization_config=GPTQConfig(bits=self.bits), device_map=self.device_map, ) self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits) From cada621adc100f33ff16a860d28a77ed7c795581 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Tue, 14 Oct 2025 07:52:01 +0000 Subject: [PATCH 03/34] format --- src/transformers/quantizers/quantizer_gptq.py | 4 +--- src/transformers/utils/quantization_config.py | 2 ++ tests/quantization/gptq/test_gptq.py | 4 +--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py index a11f2ed2f7eb..305e6925e753 100644 --- a/src/transformers/quantizers/quantizer_gptq.py +++ b/src/transformers/quantizers/quantizer_gptq.py @@ -60,9 +60,7 @@ def validate_environment(self, *args, **kwargs): if not gptq_supports_cpu and not torch.cuda.is_available(): raise RuntimeError("GPU is required to quantize or run quantize model.") elif not is_gptqmodel_available(): - raise ImportError( - "Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) library." 
- ) + raise ImportError("Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) library.") elif is_gptqmodel_available() and ( version.parse(importlib.metadata.version("gptqmodel")) < version.parse("1.4.3") or version.parse(importlib.metadata.version("optimum")) < version.parse("1.23.99") diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index de03d99b89e4..ad9dd1aeecff 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -37,6 +37,8 @@ is_torchao_available, logging, ) + + if is_torch_available(): import torch diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index e20225157a0b..b2cb972b5974 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -240,9 +240,7 @@ def test_serialization(self): # We expect tritonv2 to be used here, because exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354 # TODO: Remove this once GPTQModel exllama kernels supports packing quant_type = "tritonv2" - quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( - tmpdirname, device_map=self.device_map - ) + quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map) self.check_quantized_layers_type(quantized_model_from_saved, quant_type) self.check_inference_correctness(quantized_model_from_saved) From b82e2913578ca23f99f68a8bb342dee398d019b8 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Tue, 14 Oct 2025 08:31:12 +0000 Subject: [PATCH 04/34] add `act_group_aware` property --- src/transformers/utils/quantization_config.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index ad9dd1aeecff..31eaa39cfa3e 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -652,6 +652,9 @@ class GPTQConfig(QuantizationConfigMixin): desc_act (`bool`, *optional*, defaults to `False`): Whether to quantize columns in order of decreasing activation size. Setting it to False can significantly speed up inference but the perplexity may become slightly worse. Also known as act-order. + act_group_aware (`bool`, *optional*, defaults to `True`): + Use GAR (group aware activation order) during quantization. Has measurable positive impact on quantization + quality. Only applicable when `desc_act = False`. Will forced to be `False` when `desc_act = True`. sym (`bool`, *optional*, defaults to `True`): Whether to use symmetric quantization. 
true_sequential (`bool`, *optional*, defaults to `True`): @@ -698,6 +701,7 @@ def __init__( group_size: int = 128, damp_percent: float = 0.1, desc_act: bool = False, + act_group_aware: bool = True, sym: bool = True, true_sequential: bool = True, checkpoint_format: str = "gptq", @@ -720,6 +724,7 @@ def __init__( self.group_size = group_size self.damp_percent = damp_percent self.desc_act = desc_act + self.act_group_aware = act_group_aware self.sym = sym self.true_sequential = true_sequential self.checkpoint_format = checkpoint_format.lower() @@ -769,8 +774,13 @@ def post_init(self): ['wikitext2','c4','c4-new'], but we found {self.dataset}""" ) + # act_group_order is only applicable when `desc_act = False` + if self.desc_act and self.act_group_aware: + self.act_group_aware = False + logger.warning("`act_group_aware` has been auto-disabled as it is not compatible with `desc_act = True`.") + # make sure backend default stays consistent with gptqmodel expectations - if is_gptqmodel_available() and self.backend is None: + if self.backend is None: self.backend = "auto" if self.modules_in_block_to_quantize is not None: optimum_version = version.parse(importlib.metadata.version("optimum")) From c1d907f724c7a3d5c4f8e1137fd4704bf1a306d6 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Wed, 15 Oct 2025 10:07:29 +0800 Subject: [PATCH 05/34] fix QUANT_TYPE assert Signed-off-by: ZX-ModelCloud --- tests/quantization/gptq/test_gptq.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index b2cb972b5974..a51850f31aa4 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -213,7 +213,7 @@ def check_inference_correctness(self, model): self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) def check_quantized_layers_type(self, model, value): - self.assertTrue(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE == value) + self.assertEqual(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE, value) def test_generate_quality(self): """ @@ -317,7 +317,7 @@ def setUpClass(cls): """ Setup quantized model """ - cls.quantization_config = GPTQConfig(bits=4, max_input_length=4028) + cls.quantization_config = GPTQConfig(bits=4, max_input_length=4028, backend="exllama_v1") cls.quantized_model = AutoModelForCausalLM.from_pretrained( cls.model_name, dtype=torch.float16, @@ -393,7 +393,7 @@ def setUpClass(cls): """ Setup quantized model """ - cls.quantization_config = GPTQConfig(bits=4, exllama_config={"version": 2}) + cls.quantization_config = GPTQConfig(bits=4, backend="exllama_v2") cls.quantized_model = AutoModelForCausalLM.from_pretrained( cls.model_name, dtype=torch.float16, @@ -409,7 +409,7 @@ def test_quantized_layers_type(self): # TODO: Remove this once GPTQModel exllama kernels supports packing self.assertEqual( self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, - "tritonv2", + "exllamav2", ) def check_inference_correctness(self, model): From 8a7da2a27e79aca44043ea98e75f7623d534e01c Mon Sep 17 00:00:00 2001 From: Qubitium Date: Wed, 15 Oct 2025 12:35:28 +0000 Subject: [PATCH 06/34] format --- src/transformers/utils/quantization_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 31eaa39cfa3e..a7aff5f5f509 100644 --- a/src/transformers/utils/quantization_config.py +++ 
b/src/transformers/utils/quantization_config.py @@ -30,7 +30,6 @@ from ..utils import ( is_auto_awq_available, is_compressed_tensors_available, - is_gptqmodel_available, is_hqq_available, is_quark_available, is_torch_available, From 18a6d80fc1ad7a4b6289073497f768edd3052891 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 10:49:32 +0800 Subject: [PATCH 07/34] mod awq import --- src/transformers/integrations/awq.py | 22 +++++++++++----------- tests/quantization/autoawq/test_awq.py | 6 ++++-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index c09da6c92e6c..d09723ccddf3 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -75,7 +75,7 @@ def replace_quantization_scales(model, model_type): - from awq.modules.act import ScaledActivation + from gptqmodel.quantization.awq.modules.act import ScaledActivation if model_type not in AWQ_SCALES_MAPPINGS: return model @@ -131,26 +131,26 @@ def replace_with_awq_linear( if backend == AwqBackendPackingMethod.AUTOAWQ: if quantization_config.version == AWQLinearVersion.GEMM: - from awq.modules.linear.gemm import WQLinear_GEMM + from gptqmodel.quantization.awq.modules.linear.gemm import WQLinear_GEMM target_cls = WQLinear_GEMM elif quantization_config.version == AWQLinearVersion.GEMV: - from awq.modules.linear.gemv import WQLinear_GEMV + from gptqmodel.quantization.awq.modules.linear.gemv import WQLinear_GEMV target_cls = WQLinear_GEMV elif quantization_config.version == AWQLinearVersion.EXLLAMA: if quantization_config.exllama_config["version"] == ExllamaVersion.ONE: - from awq.modules.linear.exllama import WQLinear_Exllama + from gptqmodel.quantization.awq.modules.linear.exllama import WQLinear_Exllama target_cls = WQLinear_Exllama elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO: - from awq.modules.linear.exllamav2 import WQLinear_ExllamaV2 + from gptqmodel.quantization.awq.modules.linear.exllamav2 import WQLinear_ExllamaV2 target_cls = WQLinear_ExllamaV2 else: raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}") elif quantization_config.version == AWQLinearVersion.IPEX: - from awq.modules.linear.gemm_ipex import WQLinear_IPEX + from gptqmodel.quantization.awq.modules.linear.gemm_ipex import WQLinear_IPEX target_cls = WQLinear_IPEX else: @@ -383,7 +383,7 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na The `QuantAttentionFused` class as it only supports that class for now. 
""" - from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV + from gptqmodel.quantization.awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV module_has_been_fused = False @@ -401,7 +401,7 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na linear_target_cls = WQLinear_GEMM cat_dim = 1 elif is_ipex_available() and version.parse(importlib.metadata.version("autoawq")) > version.parse("0.2.6"): - from awq.modules.linear import WQLinear_IPEX + from gptqmodel.quantization.awq.modules.linear import WQLinear_IPEX if isinstance(q_proj, WQLinear_IPEX): linear_target_cls = WQLinear_IPEX @@ -468,11 +468,11 @@ def post_init_awq_exllama_modules(model, exllama_config): """ if exllama_config["version"] == ExllamaVersion.ONE: - from awq.modules.linear.exllama import exllama_post_init + from gptqmodel.quantization.awq.modules.linear.exllama import exllama_post_init model = exllama_post_init(model) elif exllama_config["version"] == ExllamaVersion.TWO: - from awq.modules.linear.exllamav2 import exllamav2_post_init + from gptqmodel.quantization.awq.modules.linear.exllamav2 import exllamav2_post_init model = exllamav2_post_init( model, @@ -491,7 +491,7 @@ def post_init_awq_ipex_modules(model): - Weights packing, reordering and repacking """ - from awq.modules.linear.gemm_ipex import ipex_post_init + from gptqmodel.quantization.awq.modules.linear.gemm_ipex import ipex_post_init model = ipex_post_init(model) diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index 78c694a848fc..3d4032d8b8c8 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -150,7 +150,7 @@ def test_quantized_model_conversion(self): """ Simple test that checks if the quantized model has been converted properly """ - from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV + from gptqmodel.quantization.awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV from transformers.integrations.awq import replace_with_awq_linear @@ -522,7 +522,9 @@ class AwqScaleTest(unittest.TestCase): model_name = "TechxGenus/starcoder2-3b-AWQ" def test_load_quantized_model(self): - from awq.modules.act import ScaledActivation + from gptqmodel.quantization.awq.modules.act import ScaledActivation + + """ Simple test that checks if the scales have been replaced in the quantized model From fece25c2d27d543896196d970b38c20c4c553ad3 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 10:52:48 +0800 Subject: [PATCH 08/34] remove autoawq fuse support --- src/transformers/integrations/__init__.py | 2 - src/transformers/integrations/awq.py | 223 ------------------- src/transformers/quantizers/quantizer_awq.py | 6 - 3 files changed, 231 deletions(-) diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 15dd7518150c..7737e3c5b606 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -19,7 +19,6 @@ _import_structure = { "aqlm": ["replace_with_aqlm_linear"], "awq": [ - "fuse_awq_modules", "post_init_awq_exllama_modules", "post_init_awq_ipex_modules", "replace_quantization_scales", @@ -164,7 +163,6 @@ if TYPE_CHECKING: from .aqlm import replace_with_aqlm_linear from .awq import ( - fuse_awq_modules, post_init_awq_exllama_modules, post_init_awq_ipex_modules, replace_quantization_scales, diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index d09723ccddf3..c3417e09a933 100644 --- 
a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -237,229 +237,6 @@ def get_modules_to_fuse(model, quantization_config): return current_fused_mapping -def fuse_awq_modules(model, quantization_config): - """ - Optionally fuse some modules in the model to speedup inference. - - Args: - model (`~PreTrainedModel`): - The model to fuse - note this model should have been converted into AWQ format beforehand. - quantization_config (`Union[AwqConfig, dict]`): - The quantization configuration to use. - """ - # We need to convert it from dict in order to get an AwqConfig object - # otherwise the fields `backend` etc. will not be available - # https://github.com/huggingface/transformers/pull/27411#discussion_r1414044495 - if isinstance(quantization_config, dict): - quantization_config = AwqConfig.from_dict(quantization_config) - backend = quantization_config.backend - - modules_to_fuse = get_modules_to_fuse(model, quantization_config) - modules_to_not_convert = getattr(quantization_config, "modules_to_not_convert", None) - - if backend == AwqBackendPackingMethod.AUTOAWQ: - from awq.modules.fused.attn import QuantAttentionFused - from awq.modules.fused.mlp import QuantFusedMLP - from awq.modules.fused.norm import FasterTransformerRMSNorm - else: - raise ValueError("Fusing is only supported for the AutoAWQ backend") - - fused_attention_modules = [] - - for name, module in model.named_modules(): - if modules_to_not_convert is not None: - if any(module_name_to_not_convert in name for module_name_to_not_convert in modules_to_not_convert): - continue - - # Replace layer norms - _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm) - - # Replace MLP layers if awq version is not ipex. - if quantization_config.version != "ipex": - _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP) - else: - logger.info("The IPEX version AWQ does not support fuse mlp for now.") - - # Replace attention layers - attention_has_been_fused = _fuse_awq_attention_layers( - model, module, modules_to_fuse, name, QuantAttentionFused - ) - - if attention_has_been_fused: - fused_attention_modules.append(name.split(".")[0]) - - # For AWQ fused + Llama we need to set `config._attn_implementation` = "custom" to avoid unexpected behavior and pass - # `None` attention mask to the fused attention modules as now the attention mask is dropped by our models and dealt - # by the `AttentionMaskConverter` module. - if len(fused_attention_modules) > 0: - for module_name, module in model.named_modules(): - if any( - module_name in fused_attention_modules for fused_attention_parent_module in fused_attention_modules - ): - if hasattr(module, "config") and hasattr(module.config, "_attn_implementation"): - module.config._attn_implementation = "custom" - return model - - -def _fuse_awq_layernorm(fuse_module_names, module, target_cls): - """ - Fuse the LayerNorm layers into a target class using autoawq - - Args: - fuse_module_names (`list[str]`): - The list of module names to fuse - module (`nn.Module`): - The pytorch parent module that has layernorm modules to fuse - target_cls (`~autoawq.FasterTransformerRMSNorm`): - The `FasterTransformerRMSNorm` class as it only supports that class - for now. 
- """ - for module_name in fuse_module_names: - if hasattr(module, module_name): - old_module = getattr(module, module_name) - module._modules[module_name] = target_cls( - old_module.weight, - old_module.variance_epsilon, - ).to(old_module.weight.device) - del old_module - - -def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_cls): - """ - Fuse the MLP layers into a target class using autoawq - - Args: - model (`~PreTrainedModel`): - The input pretrained model - current_module_name (`str`): - The current submodule name - fuse_module_names (`list[str]`): - The list of module names to fuse. For the MLP layers it has to be an array - of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers) - module (`nn.Module`): - The pytorch parent module that has layernorm modules to fuse - target_cls (`~autoawq.QuantFusedMLP`): - The `QuantFusedMLP` class as it only supports that class - for now. - """ - if len(fuse_module_names) == 0: - return - - if hasattr(module, fuse_module_names[0]): - gate_proj = getattr(module, fuse_module_names[0]) - up_proj = getattr(module, fuse_module_names[1]) - down_proj = getattr(module, fuse_module_names[2]) - - previous_device = gate_proj.qweight.device - - # Deal also with the case model has `text_config` attribute - config = model.config.get_text_config(decoder=True) - hidden_act = config.hidden_act - activation_fn = ACT2FN[hidden_act] - new_module = target_cls(gate_proj, down_proj, up_proj, activation_fn) - - parent_name, child_name = current_module_name.rsplit(".", 1) - parent = model.get_submodule(parent_name) - setattr(parent, child_name, new_module.to(previous_device)) - - del gate_proj, up_proj, down_proj - - -def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_name, target_cls): - """ - Fuse the Attention layers into a target class using autoawq - - Args: - model (`~PreTrainedModel`): - The input pretrained model - module (`nn.Module`): - The pytorch parent module that has layernorm modules to fuse - modules_to_fuse (`list[str]`): - The module fusing mapping. The dictionary has to contain a field `attention` with attention module names - in the correct order: q, k, v, o layer - current_module_name (`str`): - The current submodule name - target_cls (`~autoawq.QuantAttentionFused`): - The `QuantAttentionFused` class as it only supports that class - for now. 
- """ - from gptqmodel.quantization.awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV - - module_has_been_fused = False - - if len(modules_to_fuse["attention"]) == 0: - return module_has_been_fused - - if hasattr(module, modules_to_fuse["attention"][0]): - # First, we pack the QKV layers together - q_proj = getattr(module, modules_to_fuse["attention"][0]) - - if isinstance(q_proj, WQLinear_GEMV): - linear_target_cls = WQLinear_GEMV - cat_dim = 0 - elif isinstance(q_proj, WQLinear_GEMM): - linear_target_cls = WQLinear_GEMM - cat_dim = 1 - elif is_ipex_available() and version.parse(importlib.metadata.version("autoawq")) > version.parse("0.2.6"): - from gptqmodel.quantization.awq.modules.linear import WQLinear_IPEX - - if isinstance(q_proj, WQLinear_IPEX): - linear_target_cls = WQLinear_IPEX - cat_dim = 1 - else: - raise ValueError("Unsupported q_proj type: {type(q_proj)}") - - previous_device = q_proj.qweight.device - - k_proj = getattr(module, modules_to_fuse["attention"][1]) - v_proj = getattr(module, modules_to_fuse["attention"][2]) - o_proj = getattr(module, modules_to_fuse["attention"][3]) - - bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None - - qkv_layer = linear_target_cls( - q_proj.w_bit, - q_proj.group_size, - q_proj.in_features, - q_proj.out_features + k_proj.out_features + v_proj.out_features, - q_proj.bias is not None, - next(iter(module.state_dict().values())).device, - ) - - qkv_layer.qweight = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=cat_dim) - qkv_layer.qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=cat_dim) - qkv_layer.scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=cat_dim) - - if isinstance(qkv_layer, WQLinear_GEMV): - qkv_layer.split_k_iters = q_proj.split_k_iters - - qkv_layer.bias = bias - - fused_attention_layer = target_cls( - modules_to_fuse["hidden_size"], - modules_to_fuse["num_attention_heads"], - modules_to_fuse["num_key_value_heads"], - qkv_layer, - o_proj, - previous_device, - modules_to_fuse["max_seq_len"], - use_alibi=modules_to_fuse["use_alibi"], - # The default value in autoawq is set to 10000.0 - rope_theta=modules_to_fuse.get("rope_theta", 10000.0), - ) - - fused_attention_layer.is_hf_transformers = True - - parent_name, child_name = current_module_name.rsplit(".", 1) - parent = model.get_submodule(parent_name) - setattr(parent, child_name, fused_attention_layer.to(previous_device)) - - del q_proj, k_proj, v_proj, o_proj - module_has_been_fused = True - - return module_has_been_fused - - def post_init_awq_exllama_modules(model, exllama_config): """ Runs post init for Exllama layers which performs: diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index d35b04c3bb52..a0a0dc55f6ef 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -129,12 +129,6 @@ def _process_model_before_weight_loading( ) def _process_model_after_weight_loading(self, model, **kwargs): - if self.quantization_config.do_fuse: - from ..integrations import fuse_awq_modules - - model = fuse_awq_modules(model, self.quantization_config) - model._awq_is_fused = True # TODO: consider storing this flag in model.config instead - if self.quantization_config.version == AWQLinearVersion.EXLLAMA: from ..integrations import post_init_awq_exllama_modules From 000e2231167497a632d7661a78f13e27b5d67a5d Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 
11:19:31 +0800 Subject: [PATCH 09/34] remove remove autoawq.config fuse --- src/transformers/utils/quantization_config.py | 47 +++---------------- 1 file changed, 6 insertions(+), 41 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index a38b8560a9ed..c706c90ae926 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -810,11 +810,11 @@ class AwqConfig(QuantizationConfigMixin): The quantization backend. Some models might be quantized using `llm-awq` backend. This is useful for users that quantize their own models using `llm-awq` library. do_fuse (`bool`, *optional*, defaults to `False`): - Whether to fuse attention and mlp layers together for faster inference + Deprecated, Whether to fuse attention and mlp layers together for faster inference fuse_max_seq_len (`int`, *optional*): - The Maximum sequence length to generate when using fusing. + Deprecated, The Maximum sequence length to generate when using fusing. modules_to_fuse (`dict`, *optional*, default to `None`): - Overwrite the natively supported fusing scheme with the one specified by the users. + Deprecated, Overwrite the natively supported fusing scheme with the one specified by the users. modules_to_not_convert (`list`, *optional*, default to `None`): The list of modules to not quantize, useful for quantizing models that explicitly require to have some modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers). @@ -850,11 +850,9 @@ def __init__( self.modules_to_not_convert = modules_to_not_convert self.exllama_config = exllama_config - self.modules_to_fuse = modules_to_fuse - if do_fuse is None: - self.do_fuse = modules_to_fuse is not None and len(modules_to_fuse) > 0 - else: - self.do_fuse = do_fuse + if do_fuse or modules_to_fuse: + raise ValueError("awq fuse feature is deprecated") + self.fuse_max_seq_len = fuse_max_seq_len self.post_init() @@ -889,24 +887,6 @@ def post_init(self): if major < 8: raise ValueError("LLM-AWQ backend is only supported on CUDA GPUs with compute capability >= 8.0") - if self.do_fuse and self.fuse_max_seq_len is None: - raise ValueError( - "You cannot enable fused modules without specifying a `fuse_max_seq_len`, make sure to pass a valid `fuse_max_seq_len` for your usecase" - ) - - if self.do_fuse: - awq_version_supports_fusing = False - MIN_AWQ_VERSION = "0.1.7" - if is_auto_awq_available(): - awq_version_supports_fusing = version.parse(importlib.metadata.version("autoawq")) >= version.parse( - MIN_AWQ_VERSION - ) - - if not awq_version_supports_fusing: - raise ValueError( - f"You current version of `autoawq` does not support module fusing, please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}." - ) - if self.modules_to_not_convert is not None: awq_version_supports_non_conversion = False MIN_AWQ_VERSION = "0.1.8" @@ -920,21 +900,6 @@ def post_init(self): f"You current version of `autoawq` does not support module quantization skipping, please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}." 
) - if self.do_fuse and self.modules_to_fuse is not None: - required_keys = [ - "hidden_size", - "num_attention_heads", - "num_key_value_heads", - "mlp", - "attention", - "layernorm", - "use_alibi", - ] - if not all(key in self.modules_to_fuse for key in required_keys): - raise ValueError( - f"Required fields are missing in the fusing mapping, required fields are {required_keys}" - ) - if self.version == AWQLinearVersion.EXLLAMA: awq_version_supports_exllama = False MIN_AWQ_VERSION = "0.2.0" From c9f9c02724a1148839e4c3d878687e9d16060be3 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 11:20:30 +0800 Subject: [PATCH 10/34] cleanup --- src/transformers/quantizers/quantizer_awq.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index a0a0dc55f6ef..2dfc5d9c0de7 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -140,11 +140,6 @@ def _process_model_after_weight_loading(self, model, **kwargs): model = post_init_awq_ipex_modules(model) def is_serializable(self, safe_serialization=None): - # AWQ through auto-awq has been always serializable, except if the model is fused. - if self.quantization_config.do_fuse: - logger.warning("You cannot save an AWQ model that uses fused modules!") - return False - if self.quantization_config.version == AWQLinearVersion.EXLLAMA: logger.warning("You cannot save an AWQ model that uses Exllama backend!") return False From d839d2bb4918ff7ee13d554e3febb1ebb555da51 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 11:22:52 +0800 Subject: [PATCH 11/34] remove awq fuse test --- tests/quantization/autoawq/test_awq.py | 227 ------------------------- 1 file changed, 227 deletions(-) diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index 3d4032d8b8c8..aef41a2fa3ff 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -287,233 +287,6 @@ def test_quantized_model_no_k_proj_quantized(self): output = quantized_model.generate(dummy_input, max_new_tokens=10) self.assertTrue((EXPECTED_OUTPUT == output).all()) - -@slow -@require_torch_accelerator -@require_auto_awq -@require_accelerate -class AwqFusedTest(unittest.TestCase): - model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ" - model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510" - - custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ" - custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7" - - mixtral_model_name = "casperhansen/mixtral-instruct-awq" - mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b" - - multi_modal_model_name = "ybelkada/llava-1.5-7b-hf-awq" - multi_modal_model_code_revision = "ad108a50f5b9e681bdd7378409f57b7fa59a7442" - - prompt = ( - "You're standing on the surface of the Earth. " - "You walk one mile south, one mile west and one mile north. " - "You end up exactly where you started. Where are you?" - ) - - EXPECTED_GENERATION = prompt + "\n\nYou're at the center of a square." 
- EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20" - EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe" - - def tearDown(self): - gc.collect() - backend_empty_cache(torch_device) - gc.collect() - - def _check_fused_modules(self, model): - has_fused_modules = False - fused_modules_name = ["QuantAttentionFused", "QuantFusedMLP", "FasterTransformerRMSNorm"] - - for _, module in model.named_modules(): - if module.__class__.__name__ in fused_modules_name: - has_fused_modules = True - break - - self.assertTrue(has_fused_modules, "Modules fusing not performed correctly!") - - def test_raise_save_pretrained(self): - """ - Test that `save_pretrained` is effectively blocked for fused models - """ - quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True) - - model = AutoModelForCausalLM.from_pretrained( - self.model_name, - quantization_config=quantization_config, - revision=self.model_revision, - ).to(torch_device) - - self._check_fused_modules(model) - - with self.assertRaises(ValueError), tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - def test_fused_modules_to_not_convert(self): - """ - Test if fused + modules to_not_convert work as expected - """ - model_id = "hf-internal-testing/Mixtral-tiny-AWQ" - - quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True) - model = AutoModelForCausalLM.from_pretrained( - model_id, - quantization_config=quantization_config, - ).to(torch_device) - - # Check if model has been correctly fused - self._check_fused_modules(model) - # Checks if the modules_to_not_convert (here gate layer) is a Linear - self.assertTrue(isinstance(model.model.layers[0].block_sparse_moe.gate, torch.nn.Linear)) - - @unittest.skipIf( - get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8, - "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0", - ) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - def test_generation_fused(self): - """ - Test generation quality for fused models - single batch case - """ - quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True) - - model = AutoModelForCausalLM.from_pretrained( - self.model_name, - quantization_config=quantization_config, - revision=self.model_revision, - ).to(torch_device) - - self._check_fused_modules(model) - - tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision) - - inputs = tokenizer(self.prompt, return_tensors="pt").to(torch_device) - - outputs = model.generate(**inputs, max_new_tokens=12) - - self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION) - - @pytest.mark.flash_attn_test - @require_flash_attn - @require_torch_gpu - @unittest.skipIf( - get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8, - "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0", - ) - def test_generation_fused_batched(self): - """ - Test generation quality for fused models - multi batch case - """ - quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True) - - model = AutoModelForCausalLM.from_pretrained( - self.model_name, - quantization_config=quantization_config, - revision=self.model_revision, - ).to(torch_device) - - self._check_fused_modules(model) - - tokenizer = 
AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision) - - tokenizer.pad_token_id = tokenizer.eos_token_id - inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device) - - outputs = model.generate(**inputs, max_new_tokens=12) - - self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION) - - def test_generation_llava_fused(self): - from transformers import pipeline - - quantization_config = AwqConfig(do_fuse=True, fuse_max_seq_len=2048) - - pipe = pipeline( - "image-to-text", - model=self.multi_modal_model_name, - device=0, - model_kwargs={ - "quantization_config": quantization_config, - }, - revision=self.multi_modal_model_code_revision, - ) - url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png" - - prompt = "USER: \nCan you please describe this image?\nASSISTANT:" - - outputs = pipe(url, prompt=prompt, generate_kwargs={"max_new_tokens": 100}) - EXPECTED_OUTPUT = "USER: \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on a green surface, possibly a carpet or a grassy area. The cat is holding a red ball in its paws, seemingly playing with it. The cat appears to be focused on the ball, possibly preparing to play or just enjoying the toy." - - self.assertEqual(outputs[0]["generated_text"], EXPECTED_OUTPUT) - - @pytest.mark.flash_attn_test - @require_flash_attn - @require_torch_multi_gpu - @unittest.skipIf( - get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8, - "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0", - ) - def test_generation_custom_model(self): - """ - Test generation quality for fused models using custom fused map. 
- """ - quantization_config = AwqConfig( - bits=4, - fuse_max_seq_len=512, - modules_to_fuse={ - "attention": ["q_proj", "k_proj", "v_proj", "o_proj"], - "mlp": ["gate_proj", "up_proj", "down_proj"], - "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"], - "use_alibi": False, - "hidden_size": 4096, - "num_attention_heads": 32, - "num_key_value_heads": 8, - }, - ) - - model = AutoModelForCausalLM.from_pretrained( - self.custom_mapping_model_id, - quantization_config=quantization_config, - device_map="balanced", - revision=self.custom_model_revision, - ) - - self._check_fused_modules(model) - - tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision) - - prompt = "Hello" - inputs = tokenizer(prompt, return_tensors="pt").to(torch_device) - - outputs = model.generate(**inputs, max_new_tokens=12) - self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL) - - @pytest.mark.flash_attn_test - @require_flash_attn - @require_torch_multi_gpu - @unittest.skip(reason="Not enough GPU memory on CI runners") - def test_generation_mixtral_fused(self): - """ - Text generation test for Mixtral + AWQ + fused - """ - quantization_config = AwqConfig(bits=4, fuse_max_seq_len=1024, do_fuse=True) - model = AutoModelForCausalLM.from_pretrained( - self.mixtral_model_name, - quantization_config=quantization_config, - device_map="auto", - revision=self.mixtral_model_revision, - ) - - tokenizer = AutoTokenizer.from_pretrained(self.mixtral_model_name) - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device) - - outputs = model.generate(**inputs, max_new_tokens=12) - self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_MIXTRAL) - - @slow @require_torch_accelerator @require_auto_awq From 32dd6ac13254e832a20253a74c30a2c60d4c98f1 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 13:54:14 +0800 Subject: [PATCH 12/34] fix import --- src/transformers/integrations/awq.py | 20 ++++++++-------- src/transformers/utils/quantization_config.py | 24 ------------------- 2 files changed, 10 insertions(+), 34 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index c3417e09a933..d1d9c9572aea 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -131,28 +131,28 @@ def replace_with_awq_linear( if backend == AwqBackendPackingMethod.AUTOAWQ: if quantization_config.version == AWQLinearVersion.GEMM: - from gptqmodel.quantization.awq.modules.linear.gemm import WQLinear_GEMM + from gptqmodel.nn_modules.qlinear.awq_gemm import AwqGEMMQuantLinear - target_cls = WQLinear_GEMM + target_cls = AwqGEMMQuantLinear elif quantization_config.version == AWQLinearVersion.GEMV: - from gptqmodel.quantization.awq.modules.linear.gemv import WQLinear_GEMV + from gptqmodel.nn_modules.qlinear.awq_gemv import AwqGEMVQuantLinear - target_cls = WQLinear_GEMV + target_cls = AwqGEMVQuantLinear elif quantization_config.version == AWQLinearVersion.EXLLAMA: if quantization_config.exllama_config["version"] == ExllamaVersion.ONE: - from gptqmodel.quantization.awq.modules.linear.exllama import WQLinear_Exllama + from gptqmodel.nn_modules.qlinear.awq_exllama import AwqExllamaQuantLinear - target_cls = WQLinear_Exllama + target_cls = AwqExllamaQuantLinear elif quantization_config.exllama_config["version"] == 
ExllamaVersion.TWO: - from gptqmodel.quantization.awq.modules.linear.exllamav2 import WQLinear_ExllamaV2 + from gptqmodel.nn_modules.qlinear.awq_exllamav2 import AwqExllamaV2QuantLinear - target_cls = WQLinear_ExllamaV2 + target_cls = AwqExllamaV2QuantLinear else: raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}") elif quantization_config.version == AWQLinearVersion.IPEX: - from gptqmodel.quantization.awq.modules.linear.gemm_ipex import WQLinear_IPEX + from gptqmodel.nn_modules.qlinear.torch_fused_awq import TorchFusedAwqQuantLinear - target_cls = WQLinear_IPEX + target_cls = TorchFusedAwqQuantLinear else: raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}") else: diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index c706c90ae926..4108779a9790 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -832,10 +832,6 @@ def __init__( zero_point: bool = True, version: AWQLinearVersion = AWQLinearVersion.GEMM, backend: AwqBackendPackingMethod = AwqBackendPackingMethod.AUTOAWQ, - do_fuse: bool | None = None, - fuse_max_seq_len: int | None = None, - modules_to_fuse: dict | None = None, - modules_to_not_convert: list | None = None, exllama_config: dict[str, int] | None = None, **kwargs, ): @@ -846,15 +842,8 @@ def __init__( self.zero_point = zero_point self.version = version self.backend = backend - self.fuse_max_seq_len = fuse_max_seq_len - self.modules_to_not_convert = modules_to_not_convert self.exllama_config = exllama_config - if do_fuse or modules_to_fuse: - raise ValueError("awq fuse feature is deprecated") - - self.fuse_max_seq_len = fuse_max_seq_len - self.post_init() def post_init(self): @@ -887,19 +876,6 @@ def post_init(self): if major < 8: raise ValueError("LLM-AWQ backend is only supported on CUDA GPUs with compute capability >= 8.0") - if self.modules_to_not_convert is not None: - awq_version_supports_non_conversion = False - MIN_AWQ_VERSION = "0.1.8" - if is_auto_awq_available(): - awq_version_supports_non_conversion = version.parse( - importlib.metadata.version("autoawq") - ) >= version.parse(MIN_AWQ_VERSION) - - if not awq_version_supports_non_conversion: - raise ValueError( - f"You current version of `autoawq` does not support module quantization skipping, please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}." 
- ) - if self.version == AWQLinearVersion.EXLLAMA: awq_version_supports_exllama = False MIN_AWQ_VERSION = "0.2.0" From ed0c0a3933e48064a5dc4c0b6a014de426d26535 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 14:51:46 +0800 Subject: [PATCH 13/34] use gptqmodel --- src/transformers/utils/quantization_config.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 4108779a9790..9f4c7864efc8 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -27,6 +27,8 @@ from packaging import version +from transformers.utils.import_utils import is_gptqmodel_available + from ..utils import ( is_auto_awq_available, is_compressed_tensors_available, @@ -877,17 +879,17 @@ def post_init(self): raise ValueError("LLM-AWQ backend is only supported on CUDA GPUs with compute capability >= 8.0") if self.version == AWQLinearVersion.EXLLAMA: - awq_version_supports_exllama = False - MIN_AWQ_VERSION = "0.2.0" - if is_auto_awq_available(): - awq_version_supports_exllama = version.parse(importlib.metadata.version("autoawq")) >= version.parse( - MIN_AWQ_VERSION + gptqmodel_version_supports_awq = False + MIN_GPTQMODEL_SUPPORT_AWQ_VERSION = "5.0.0" + if is_gptqmodel_available(): + gptqmodel_version_supports_awq = version.parse(importlib.metadata.version("gptqmodel")) >= version.parse( + MIN_GPTQMODEL_SUPPORT_AWQ_VERSION ) - if not awq_version_supports_exllama: + if not gptqmodel_version_supports_awq: raise ValueError( - f"You current version of `autoawq` does not support exllama backend, " - f"please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}." + f"You current version of `gptqmodel` does not support awq, " + f"please upgrade `gptqmodel` package to at least {MIN_GPTQMODEL_SUPPORT_AWQ_VERSION}." 
) if self.exllama_config is None: From 0cb315da6c82c08dbb5c90e4e9342657c8bd477e Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 14:52:28 +0800 Subject: [PATCH 14/34] cleanup --- src/transformers/utils/quantization_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 9f4c7864efc8..e531b5d05618 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -905,7 +905,7 @@ def post_init(self): def get_loading_attributes(self): attributes_dict = copy.deepcopy(self.__dict__) - loading_attributes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len", "exllama_config"] + loading_attributes = ["version", "exllama_config"] loading_attributes_dict = {i: j for i, j in attributes_dict.items() if i in loading_attributes} return loading_attributes_dict From a930c47d459100a277229a6a2fc793e713b4d5d3 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 15:17:42 +0800 Subject: [PATCH 15/34] remove get_modules_to_fuse --- src/transformers/integrations/awq.py | 41 ---------------------------- 1 file changed, 41 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index d1d9c9572aea..0a5e82db7a4c 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -196,47 +196,6 @@ def replace_with_awq_linear( return model, has_been_replaced -def get_modules_to_fuse(model, quantization_config): - """ - Returns the fusing mapping given the quantization config and the model - - Args: - model (`~PreTrainedModel`): - The model to fuse - note this model should have been converted into AWQ format beforehand. - quantization_config (`~transformers.quantization_config.AWQConfig`): - The quantization configuration to use. - """ - if not isinstance(model, PreTrainedModel): - raise TypeError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}") - - # Always default to `quantization_config.modules_to_fuse` - if quantization_config.modules_to_fuse is not None: - current_fused_mapping = quantization_config.modules_to_fuse - current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len - elif model.config.model_type in AWQ_FUSED_MAPPINGS: - current_fused_mapping = AWQ_FUSED_MAPPINGS[model.config.model_type] - - # Properly deal with the case where we have a multi-modal model as well (e.g. Llava) - config = model.config.get_text_config(decoder=True) - - # Handle hidden_size, num_attention_heads, num_key_value_heads on our own. - hidden_size = config.hidden_size - num_attention_heads = config.num_attention_heads - num_key_value_heads = getattr(config, "num_key_value_heads", num_attention_heads) - - # Fill `current_fused_mapping` with the expected values - current_fused_mapping["hidden_size"] = hidden_size - current_fused_mapping["num_attention_heads"] = num_attention_heads - current_fused_mapping["num_key_value_heads"] = num_key_value_heads - current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len - else: - raise ValueError( - "Fusing mapping not found either on the quantization config or the supported `AWQ_FUSED_MAPPINGS`. Please pass a `fused_mapping` argument" - " in the `quantization_config` or raise an issue on transformers https://github.com/huggingface/transformers to add its support." 
- ) - return current_fused_mapping - - def post_init_awq_exllama_modules(model, exllama_config): """ Runs post init for Exllama layers which performs: From 13191b9fda83a70fc20b1f94d28e195692631ed9 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 16:27:30 +0800 Subject: [PATCH 16/34] mod require_auto_awq -> require_gptqmodel --- src/transformers/testing_utils.py | 9 +-------- tests/quantization/autoawq/test_awq.py | 8 ++++---- tests/quantization/gptq/test_gptq.py | 8 ++++---- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index c8876bf04597..37dcbf7d01a2 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -1284,7 +1284,7 @@ def require_tensorboard(test_case): return unittest.skipUnless(is_tensorboard_available(), "test requires tensorboard") -def require_gptq(test_case): +def require_gptqmodel(test_case): """ Decorator for gptqmodel dependency """ @@ -1298,13 +1298,6 @@ def require_hqq(test_case): return unittest.skipUnless(is_hqq_available(), "test requires hqq")(test_case) -def require_auto_awq(test_case): - """ - Decorator for auto_awq dependency - """ - return unittest.skipUnless(is_auto_awq_available(), "test requires autoawq")(test_case) - - def require_auto_round(test_case): """ Decorator for auto_round dependency diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index aef41a2fa3ff..834220c2379c 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -23,7 +23,7 @@ backend_empty_cache, get_device_properties, require_accelerate, - require_auto_awq, + require_gptqmodel, require_flash_attn, require_intel_extension_for_pytorch, require_torch_accelerator, @@ -102,7 +102,7 @@ def test_from_dict(self): @slow @require_torch_accelerator -@require_auto_awq +@require_gptqmodel @require_accelerate class AwqTest(unittest.TestCase): model_name = "TheBloke/Mistral-7B-v0.1-AWQ" @@ -289,7 +289,7 @@ def test_quantized_model_no_k_proj_quantized(self): @slow @require_torch_accelerator -@require_auto_awq +@require_gptqmodel @require_accelerate class AwqScaleTest(unittest.TestCase): model_name = "TechxGenus/starcoder2-3b-AWQ" @@ -309,7 +309,7 @@ def test_load_quantized_model(self): @slow -@require_auto_awq +@require_gptqmodel @require_accelerate @require_intel_extension_for_pytorch class AwqIPEXTest(unittest.TestCase): diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py index a51850f31aa4..5aa8aa9f790e 100644 --- a/tests/quantization/gptq/test_gptq.py +++ b/tests/quantization/gptq/test_gptq.py @@ -21,7 +21,7 @@ from transformers.testing_utils import ( is_torch_available, require_accelerate, - require_gptq, + require_gptqmodel, require_optimum, require_torch_gpu, require_torch_multi_gpu, @@ -76,7 +76,7 @@ def test_optimum_config(self): @slow @require_optimum -@require_gptq +@require_gptqmodel class GPTQTest(unittest.TestCase): model_name = "bigscience/bloom-560m" @@ -295,7 +295,7 @@ class GPTQTestDeviceMapExllama(GPTQTestCUDA): @slow @require_optimum -@require_gptq +@require_gptqmodel @require_torch_gpu @require_accelerate class GPTQTestActOrderExllama(unittest.TestCase): @@ -371,7 +371,7 @@ def test_max_input_length(self): @slow @require_optimum -@require_gptq +@require_gptqmodel @require_torch_gpu @require_accelerate class GPTQTestExllamaV2(unittest.TestCase): From e91e2727b74d85ee364113b4ef41b407490288b7 Mon Sep 17 00:00:00 2001 From: 
LRL2-ModelCloud
Date: Thu, 20 Nov 2025 17:22:46 +0800
Subject: [PATCH 17/34] convert version to checkpoint_format

---
 src/transformers/utils/quantization_config.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index e531b5d05618..8ace8d601f34 100644
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -793,7 +793,7 @@


 @dataclass
-class AwqConfig(QuantizationConfigMixin):
+class AwqConfig(GPTQConfig):
     """
     This is a wrapper class about all possible attributes and features that you can play with a model that has been
     loaded using `auto-awq` library awq quantization relying on auto_awq backend.
@@ -867,6 +867,9 @@ def post_init(self):
             raise ValueError(
                 f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA, AWQLinearVersion.IPEX] - not recognized version {self.version}"
             )
+
+        # convert version to checkpoint_format
+        self.checkpoint_format = self.version.value

         if self.backend == AwqBackendPackingMethod.LLMAWQ:
             # Only cuda device can run this function

From dd3037389d3c7a2d501d9b7e94abcd3fc7c9497b Mon Sep 17 00:00:00 2001
From: LRL2-ModelCloud
Date: Thu, 20 Nov 2025 17:24:11 +0800
Subject: [PATCH 18/34] check is_gptqmodel_available

---
 src/transformers/integrations/awq.py         | 6 +++---
 src/transformers/quantizers/quantizer_awq.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py
index 0a5e82db7a4c..1f6df15fa76e 100644
--- a/src/transformers/integrations/awq.py
+++ b/src/transformers/integrations/awq.py
@@ -19,7 +19,7 @@

 from ..activations import ACT2FN
 from ..modeling_utils import PreTrainedModel
-from ..utils import is_auto_awq_available, is_ipex_available, is_torch_available, logging
+from ..utils import is_gptqmodel_available, is_torch_available, logging
 from ..utils.quantization_config import (
     AwqBackendPackingMethod,
     AwqConfig,
@@ -124,9 +124,9 @@ def replace_with_awq_linear(

     backend = quantization_config.backend

-    if not is_auto_awq_available():
+    if not is_gptqmodel_available():
         raise ValueError(
-            "AWQ (either `autoawq` or `llmawq`) is not available. Please install it with `pip install autoawq` or check out the installation guide in https://github.com/mit-han-lab/llm-awq"
+            "AWQ (either `gptqmodel` or `llmawq`) is not available.
Please install it with `pip install gptqmodel` or check out the installation guide in https://github.com/mit-han-lab/llm-awq" ) if backend == AwqBackendPackingMethod.AUTOAWQ: diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index 2dfc5d9c0de7..ad99c635ad9b 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -22,7 +22,7 @@ if TYPE_CHECKING: from ..modeling_utils import PreTrainedModel -from ..utils import is_accelerate_available, is_auto_awq_available, is_torch_available, logging +from ..utils import is_accelerate_available, is_gptqmodel_available, is_torch_available, logging from ..utils.quantization_config import AWQLinearVersion @@ -46,8 +46,8 @@ def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) def validate_environment(self, device_map, **kwargs): - if not is_auto_awq_available(): - raise ImportError("Loading an AWQ quantized model requires auto-awq library (`pip install autoawq`)") + if not is_gptqmodel_available(): + raise ImportError("Loading an AWQ quantized model requires gptqmodel library (`pip install gptqmodel`)") if not is_accelerate_available(): raise ImportError("Loading an AWQ quantized model requires accelerate (`pip install accelerate`)") From f7688202a545f22be25dd26373a385a49ccf5065 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 17:26:13 +0800 Subject: [PATCH 19/34] revert modules_to_not_convert --- src/transformers/utils/quantization_config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 8ace8d601f34..0cbf056791e0 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -835,6 +835,7 @@ def __init__( version: AWQLinearVersion = AWQLinearVersion.GEMM, backend: AwqBackendPackingMethod = AwqBackendPackingMethod.AUTOAWQ, exllama_config: dict[str, int] | None = None, + modules_to_not_convert: list | None = None, **kwargs, ): self.quant_method = QuantizationMethod.AWQ @@ -845,6 +846,7 @@ def __init__( self.version = version self.backend = backend self.exllama_config = exllama_config + self.modules_to_not_convert = modules_to_not_convert self.post_init() From 94f91340621ca5ad3d5fbd89ea5eb46ee129877c Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Thu, 20 Nov 2025 17:38:20 +0800 Subject: [PATCH 20/34] pass bits, sym, desc_act --- src/transformers/integrations/awq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 1f6df15fa76e..f6b65f8368b4 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -172,7 +172,9 @@ def replace_with_awq_linear( out_features = module.out_features model._modules[name] = target_cls( - w_bit=quantization_config.bits, + bits=quantization_config.bits, + sym=quantization_config.sym, + desc_act=quantization_config.desc_act, group_size=quantization_config.group_size, in_features=in_features, out_features=out_features, From c14413a0e929e8a203d690d5c24a357680f7767b Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 10:09:39 +0800 Subject: [PATCH 21/34] fix awqconfig init --- src/transformers/integrations/awq.py | 20 +++++++++---------- src/transformers/utils/quantization_config.py | 8 +++----- 2 files changed, 13 insertions(+), 15 deletions(-) diff 
--git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index f6b65f8368b4..8e9c7f3135b4 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -131,28 +131,28 @@ def replace_with_awq_linear( if backend == AwqBackendPackingMethod.AUTOAWQ: if quantization_config.version == AWQLinearVersion.GEMM: - from gptqmodel.nn_modules.qlinear.awq_gemm import AwqGEMMQuantLinear + from gptqmodel.quantization.awq.modules.linear.gemm import WQLinear_GEMM - target_cls = AwqGEMMQuantLinear + target_cls = WQLinear_GEMM elif quantization_config.version == AWQLinearVersion.GEMV: - from gptqmodel.nn_modules.qlinear.awq_gemv import AwqGEMVQuantLinear + from gptqmodel.quantization.awq.modules.linear.gemv import WQLinear_GEMV - target_cls = AwqGEMVQuantLinear + target_cls = WQLinear_GEMV elif quantization_config.version == AWQLinearVersion.EXLLAMA: if quantization_config.exllama_config["version"] == ExllamaVersion.ONE: - from gptqmodel.nn_modules.qlinear.awq_exllama import AwqExllamaQuantLinear + from gptqmodel.quantization.awq.modules.linear.exllama import WQLinear_Exllama - target_cls = AwqExllamaQuantLinear + target_cls = WQLinear_Exllama elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO: - from gptqmodel.nn_modules.qlinear.awq_exllamav2 import AwqExllamaV2QuantLinear + from gptqmodel.quantization.awq.modules.linear.exllamav2 import WQLinear_ExllamaV2 - target_cls = AwqExllamaV2QuantLinear + target_cls = WQLinear_ExllamaV2 else: raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}") elif quantization_config.version == AWQLinearVersion.IPEX: - from gptqmodel.nn_modules.qlinear.torch_fused_awq import TorchFusedAwqQuantLinear + from gptqmodel.quantization.awq.modules.linear.gemm_ipex import WQLinear_IPEX - target_cls = TorchFusedAwqQuantLinear + target_cls = WQLinear_IPEX else: raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}") else: diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 0cbf056791e0..574865988f85 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -838,17 +838,15 @@ def __init__( modules_to_not_convert: list | None = None, **kwargs, ): - self.quant_method = QuantizationMethod.AWQ - self.bits = bits - self.group_size = group_size self.zero_point = zero_point self.version = version - self.backend = backend self.exllama_config = exllama_config self.modules_to_not_convert = modules_to_not_convert - self.post_init() + super().__init__(bits=bits, group_size=group_size, backend=backend, **kwargs) + self.quant_method = QuantizationMethod.AWQ + def post_init(self): r""" From 27ec7b4887306e09afa5602b1f0797369289d147 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 10:11:32 +0800 Subject: [PATCH 22/34] fix wrong args --- src/transformers/integrations/awq.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 8e9c7f3135b4..7bc9f3644aef 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -172,9 +172,7 @@ def replace_with_awq_linear( out_features = module.out_features model._modules[name] = target_cls( - bits=quantization_config.bits, - sym=quantization_config.sym, - desc_act=quantization_config.desc_act, + w_bit=quantization_config.bits, 
group_size=quantization_config.group_size, in_features=in_features, out_features=out_features, From 820c694a83f9777a4aeb5e86002d27c456eb5baf Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 10:44:23 +0800 Subject: [PATCH 23/34] fix ipex --- src/transformers/integrations/awq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 7bc9f3644aef..3fa6a9b47082 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -150,9 +150,9 @@ def replace_with_awq_linear( else: raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}") elif quantization_config.version == AWQLinearVersion.IPEX: - from gptqmodel.quantization.awq.modules.linear.gemm_ipex import WQLinear_IPEX + from gptqmodel.nn_modules.qlinear.torch_fused_awq import TorchFusedAwqQuantLinear - target_cls = WQLinear_IPEX + target_cls = TorchFusedAwqQuantLinear else: raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}") else: From f80ed50b794fc604aae452058aa3954f80eb0399 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 10:52:53 +0800 Subject: [PATCH 24/34] mod ipex version check --- src/transformers/quantizers/quantizer_awq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index ad99c635ad9b..95116c02e70b 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -61,9 +61,9 @@ def validate_environment(self, device_map, **kwargs): self.quantization_config.version = AWQLinearVersion.IPEX if self.quantization_config.version == AWQLinearVersion.IPEX: - if version.parse(importlib.metadata.version("autoawq")) < version.parse("0.2.6"): + if version.parse(importlib.metadata.version("gptqmodel")) < version.parse("5.0.0"): raise RuntimeError( - "To use IPEX backend, you need autoawq>0.2.6. Please install the latest version or from source." + "To use IPEX backend, you need gptqmodel>5.0.0. Please install the latest version or from source." 
) if device_map is None: logger.warning_once( From d40400548fbb6e3fc1dc2d5911853c06c835484f Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 10:53:15 +0800 Subject: [PATCH 25/34] cleanup --- tests/quantization/autoawq/test_awq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index 834220c2379c..56ff842629db 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -311,7 +311,6 @@ def test_load_quantized_model(self): @slow @require_gptqmodel @require_accelerate -@require_intel_extension_for_pytorch class AwqIPEXTest(unittest.TestCase): def test_quantized_model_ipex(self): """ From c86ac340d5ec5f2c771bbfc307660c26caf9cc87 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 14:38:54 +0800 Subject: [PATCH 26/34] fix awq_linear --- src/transformers/integrations/awq.py | 21 ++++++++++++--------- tests/quantization/autoawq/test_awq.py | 8 ++++---- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 3fa6a9b47082..73067ec3ec12 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -131,22 +131,22 @@ def replace_with_awq_linear( if backend == AwqBackendPackingMethod.AUTOAWQ: if quantization_config.version == AWQLinearVersion.GEMM: - from gptqmodel.quantization.awq.modules.linear.gemm import WQLinear_GEMM + from gptqmodel.nn_modules.qlinear.awq_gemm import AwqGEMMQuantLinear - target_cls = WQLinear_GEMM + target_cls = AwqGEMMQuantLinear elif quantization_config.version == AWQLinearVersion.GEMV: - from gptqmodel.quantization.awq.modules.linear.gemv import WQLinear_GEMV + from gptqmodel.nn_modules.qlinear.awq_gemv import AwqGEMVQuantLinear - target_cls = WQLinear_GEMV + target_cls = AwqGEMVQuantLinear elif quantization_config.version == AWQLinearVersion.EXLLAMA: if quantization_config.exllama_config["version"] == ExllamaVersion.ONE: - from gptqmodel.quantization.awq.modules.linear.exllama import WQLinear_Exllama + from gptqmodel.nn_modules.qlinear.awq_exllama import AwqExllamaQuantLinear - target_cls = WQLinear_Exllama + target_cls = AwqExllamaQuantLinear elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO: - from gptqmodel.quantization.awq.modules.linear.exllamav2 import WQLinear_ExllamaV2 + from gptqmodel.nn_modules.qlinear.awq_exllamav2 import AwqExllamaV2QuantLinear - target_cls = WQLinear_ExllamaV2 + target_cls = AwqExllamaV2QuantLinear else: raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}") elif quantization_config.version == AWQLinearVersion.IPEX: @@ -172,12 +172,15 @@ def replace_with_awq_linear( out_features = module.out_features model._modules[name] = target_cls( - w_bit=quantization_config.bits, + bits=quantization_config.bits, + sym=quantization_config.sym, + desc_act=quantization_config.desc_act, group_size=quantization_config.group_size, in_features=in_features, out_features=out_features, bias=module.bias is not None, dev=module.weight.device, + register_buffers=True, ) has_been_replaced = True diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index 56ff842629db..fd31e7fcf6a5 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -150,8 +150,8 @@ def test_quantized_model_conversion(self): """ Simple test that checks if the quantized model 
has been converted properly """ - from gptqmodel.quantization.awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV - + from gptqmodel.nn_modules.qlinear.awq_gemm import AwqGEMMQuantLinear + from gptqmodel.nn_modules.qlinear.awq_gemv import AwqGEMVQuantLinear from transformers.integrations.awq import replace_with_awq_linear model_id = "facebook/opt-350m" @@ -169,7 +169,7 @@ def test_quantized_model_conversion(self): model, _ = replace_with_awq_linear(model, quantization_config=quantization_config) nb_awq_linear = 0 for module in model.modules(): - if isinstance(module, (WQLinear_GEMM, WQLinear_GEMV)): + if isinstance(module, (AwqGEMMQuantLinear, AwqGEMVQuantLinear)): nb_awq_linear += 1 self.assertEqual(nb_linears, nb_awq_linear) @@ -183,7 +183,7 @@ def test_quantized_model_conversion(self): ) nb_awq_linear = 0 for module in model.modules(): - if isinstance(module, (WQLinear_GEMM, WQLinear_GEMV)): + if isinstance(module, (AwqGEMMQuantLinear, AwqGEMVQuantLinear)): nb_awq_linear += 1 self.assertEqual(nb_linears - 1, nb_awq_linear) From 8bae9867fbca42f9a6902cbee4e12192e5626d79 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 16:27:06 +0800 Subject: [PATCH 27/34] remove self.exllama_config = exllama_config --- src/transformers/utils/quantization_config.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 574865988f85..970068bede4b 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -841,7 +841,6 @@ def __init__( self.zero_point = zero_point self.version = version - self.exllama_config = exllama_config self.modules_to_not_convert = modules_to_not_convert super().__init__(bits=bits, group_size=group_size, backend=backend, **kwargs) @@ -894,17 +893,7 @@ def post_init(self): f"You current version of `gptqmodel` does not support awq, " f"please upgrade `gptqmodel` package to at least {MIN_GPTQMODEL_SUPPORT_AWQ_VERSION}." 
) - - if self.exllama_config is None: - self.exllama_config = {"version": ExllamaVersion.TWO, "max_input_len": 2048, "max_batch_size": 8} - else: - if "version" not in self.exllama_config: - raise ValueError("`exllama_config` needs to have a `version` key.") - elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]: - exllama_version = self.exllama_config["version"] - raise ValueError( - f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {exllama_version}" - ) + def get_loading_attributes(self): attributes_dict = copy.deepcopy(self.__dict__) From 90019c6fc4f7a617ed9db482a42ecd1cd07f9108 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 16:45:26 +0800 Subject: [PATCH 28/34] cleanuo --- src/transformers/integrations/awq.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 73067ec3ec12..29bae0130d29 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -139,16 +139,9 @@ def replace_with_awq_linear( target_cls = AwqGEMVQuantLinear elif quantization_config.version == AWQLinearVersion.EXLLAMA: - if quantization_config.exllama_config["version"] == ExllamaVersion.ONE: - from gptqmodel.nn_modules.qlinear.awq_exllama import AwqExllamaQuantLinear + from gptqmodel.nn_modules.qlinear.awq_exllamav2 import AwqExllamaV2QuantLinear - target_cls = AwqExllamaQuantLinear - elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO: - from gptqmodel.nn_modules.qlinear.awq_exllamav2 import AwqExllamaV2QuantLinear - - target_cls = AwqExllamaV2QuantLinear - else: - raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}") + target_cls = AwqExllamaV2QuantLinear elif quantization_config.version == AWQLinearVersion.IPEX: from gptqmodel.nn_modules.qlinear.torch_fused_awq import TorchFusedAwqQuantLinear From 6a4865cb0ddb239fb28672ab5cb65ea9992c9572 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 16:56:53 +0800 Subject: [PATCH 29/34] Revert "cleanuo" This reverts commit 90019c6fc4f7a617ed9db482a42ecd1cd07f9108. 
--- src/transformers/integrations/awq.py | 11 +++++++++-- src/transformers/utils/quantization_config.py | 13 ++++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 29bae0130d29..73067ec3ec12 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -139,9 +139,16 @@ def replace_with_awq_linear( target_cls = AwqGEMVQuantLinear elif quantization_config.version == AWQLinearVersion.EXLLAMA: - from gptqmodel.nn_modules.qlinear.awq_exllamav2 import AwqExllamaV2QuantLinear + if quantization_config.exllama_config["version"] == ExllamaVersion.ONE: + from gptqmodel.nn_modules.qlinear.awq_exllama import AwqExllamaQuantLinear - target_cls = AwqExllamaV2QuantLinear + target_cls = AwqExllamaQuantLinear + elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO: + from gptqmodel.nn_modules.qlinear.awq_exllamav2 import AwqExllamaV2QuantLinear + + target_cls = AwqExllamaV2QuantLinear + else: + raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}") elif quantization_config.version == AWQLinearVersion.IPEX: from gptqmodel.nn_modules.qlinear.torch_fused_awq import TorchFusedAwqQuantLinear diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 970068bede4b..574865988f85 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -841,6 +841,7 @@ def __init__( self.zero_point = zero_point self.version = version + self.exllama_config = exllama_config self.modules_to_not_convert = modules_to_not_convert super().__init__(bits=bits, group_size=group_size, backend=backend, **kwargs) @@ -893,7 +894,17 @@ def post_init(self): f"You current version of `gptqmodel` does not support awq, " f"please upgrade `gptqmodel` package to at least {MIN_GPTQMODEL_SUPPORT_AWQ_VERSION}." 
) - + + if self.exllama_config is None: + self.exllama_config = {"version": ExllamaVersion.TWO, "max_input_len": 2048, "max_batch_size": 8} + else: + if "version" not in self.exllama_config: + raise ValueError("`exllama_config` needs to have a `version` key.") + elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]: + exllama_version = self.exllama_config["version"] + raise ValueError( + f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {exllama_version}" + ) def get_loading_attributes(self): attributes_dict = copy.deepcopy(self.__dict__) From 1238c3ba213ddc4ee0c64588afed25790916456b Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 18:06:26 +0800 Subject: [PATCH 30/34] update is_trainable --- src/transformers/quantizers/quantizer_awq.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index 95116c02e70b..f056cbc62233 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -148,6 +148,4 @@ def is_serializable(self, safe_serialization=None): @property def is_trainable(self): - # AWQ supports PEFT fine-tuning from version 0.2.0 - MIN_AWQ_VERSION_FOR_PEFT = "0.2.0" - return version.parse(importlib.metadata.version("autoawq")) >= version.parse(MIN_AWQ_VERSION_FOR_PEFT) + return version.parse(importlib.metadata.version("gptqmodel")) >= version.parse(5.0.0) From 26d1f0ff90ede2d54d192dbbdea8ae3674188aba Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Fri, 21 Nov 2025 18:07:05 +0800 Subject: [PATCH 31/34] cleanup --- src/transformers/quantizers/quantizer_awq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index f056cbc62233..a6bf4ae4e29f 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -148,4 +148,4 @@ def is_serializable(self, safe_serialization=None): @property def is_trainable(self): - return version.parse(importlib.metadata.version("gptqmodel")) >= version.parse(5.0.0) + return version.parse(importlib.metadata.version("gptqmodel")) >= version.parse("5.0.0") From b2ae0d56886599525ab774ce551216e1ea7bae0c Mon Sep 17 00:00:00 2001 From: Qubitium Date: Sat, 22 Nov 2025 08:28:59 +0000 Subject: [PATCH 32/34] remove fused --- src/transformers/integrations/awq.py | 85 ---------------------------- 1 file changed, 85 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index f5b4957fdf94..916d5b123b25 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -35,44 +35,6 @@ logger = logging.get_logger(__name__) -AWQ_FUSED_MAPPINGS = { - "mistral": { - "attention": ["q_proj", "k_proj", "v_proj", "o_proj"], - "mlp": ["gate_proj", "up_proj", "down_proj"], - "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"], - "use_alibi": False, - }, - "mixtral": { - "attention": ["q_proj", "k_proj", "v_proj", "o_proj"], - "mlp": ["w1", "w3", "w2"], - "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"], - "use_alibi": False, - }, - "llama": { - "attention": ["q_proj", "k_proj", "v_proj", "o_proj"], - "mlp": ["gate_proj", "up_proj", "down_proj"], - "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"], - "use_alibi": False, - }, - "llava": { - 
"attention": ["q_proj", "k_proj", "v_proj", "o_proj"], - "mlp": ["gate_proj", "up_proj", "down_proj"], - "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"], - "use_alibi": False, - }, - "qwen2": { - "attention": ["q_proj", "k_proj", "v_proj", "o_proj"], - "mlp": ["gate_proj", "up_proj", "down_proj"], - "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"], - "use_alibi": False, - }, - "qwen3": { - "attention": ["q_proj", "k_proj", "v_proj", "o_proj", "q_norm", "k_norm"], - "mlp": ["gate_proj", "up_proj", "down_proj"], - "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"], - "use_alibi": False, - }, -} AWQ_SCALES_MAPPINGS = { "starcoder2": {"act": "act", "layer_before_act": "c_fc"}, @@ -86,53 +48,6 @@ } -if is_auto_awq_available(): - from awq.modules.fused.attn import RoPE - - class AWQRoPE(RoPE): - """ - AWQRoPE module for hacking rope implementation in AWQ fused attention modules to support more models. - - Args: - rope_type (`str`): - The rope type to use. - head_dim (`int`): - The head dimension. - max_seq_len (`int`): - The maximum sequence length. - config (`PreTrainedConfig`): - The model config object. - device (`torch.device`): - The device to put the module on. - """ - - def __init__(self, rope_type, head_dim, max_seq_len, config, device): - rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type] - self.inv_freq, self.attention_scaling = rope_init_fn(config, device) - # Use fake rope_theta to initialize the parent class - super().__init__(head_dim=head_dim, max_seq_len=max_seq_len, device=device, rope_theta=-1) - - def precompute_freqs_cis(self, dim: int, end: int, theta=-1): - t = torch.arange(end, device=self.inv_freq.device) - freqs = torch.outer(t, self.inv_freq).float() - freqs_cis = torch.polar(torch.ones_like(freqs), freqs) - del self.inv_freq # free the memory - return freqs_cis - - def forward( - self, - xq: torch.Tensor, - xk: torch.Tensor, - start_pos: int, - seqlen: int, - partial: bool = False, - ): - xq_out, xk_out = super().forward(xq, xk, start_pos, seqlen, partial) - xq_out = (xq_out * self.attention_scaling).type_as(xq) - xk_out = (xk_out * self.attention_scaling).type_as(xk) - return xq_out, xk_out - - def replace_quantization_scales(model, model_type): from gptqmodel.quantization.awq.modules.act import ScaledActivation From 92eba4e81a817dc4670d574e800c8db0c97cfcfe Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Wed, 26 Nov 2025 04:38:57 +0000 Subject: [PATCH 33/34] call hf_select_quant_linear_v2() Signed-off-by: ZX-ModelCloud --- src/transformers/integrations/awq.py | 75 +++++++++---------- src/transformers/utils/__init__.py | 2 +- src/transformers/utils/import_utils.py | 2 +- src/transformers/utils/quantization_config.py | 14 ++-- 4 files changed, 45 insertions(+), 48 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 916d5b123b25..9d37a0f64c0c 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -20,7 +20,7 @@ from ..activations import ACT2FN from ..modeling_rope_utils import ROPE_INIT_FUNCTIONS from ..modeling_utils import PreTrainedModel -from ..utils import is_gptqmodel_available, is_torch_available, logging +from ..utils import is_gptqmodel_available, is_llm_awq_available, is_torch_available, logging from ..utils.quantization_config import ( AwqBackendPackingMethod, AwqConfig, @@ -98,37 +98,24 @@ def replace_with_awq_linear( backend = quantization_config.backend - if not is_gptqmodel_available(): + if not 
is_gptqmodel_available() and not is_llm_awq_available(): raise ValueError( "AWQ (either `llmawq`) is not available. Please install it with `pip install gptqmodel` or check out the installation guide in https://github.com/mit-han-lab/llm-awq" ) - if backend == AwqBackendPackingMethod.AUTOAWQ: - if quantization_config.version == AWQLinearVersion.GEMM: - from gptqmodel.nn_modules.qlinear.awq_gemm import AwqGEMMQuantLinear - - target_cls = AwqGEMMQuantLinear - elif quantization_config.version == AWQLinearVersion.GEMV: - from gptqmodel.nn_modules.qlinear.awq_gemv import AwqGEMVQuantLinear - - target_cls = AwqGEMVQuantLinear - elif quantization_config.version == AWQLinearVersion.EXLLAMA: - if quantization_config.exllama_config["version"] == ExllamaVersion.ONE: - from gptqmodel.nn_modules.qlinear.awq_exllama import AwqExllamaQuantLinear - - target_cls = AwqExllamaQuantLinear - elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO: - from gptqmodel.nn_modules.qlinear.awq_exllamav2 import AwqExllamaV2QuantLinear - - target_cls = AwqExllamaV2QuantLinear - else: - raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}") - elif quantization_config.version == AWQLinearVersion.IPEX: - from gptqmodel.nn_modules.qlinear.torch_fused_awq import TorchFusedAwqQuantLinear - - target_cls = TorchFusedAwqQuantLinear - else: - raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}") + if backend == AwqBackendPackingMethod.GPTQMODEL: + from gptqmodel.utils.importer import hf_select_quant_linear_v2 + from gptqmodel.quantization import METHOD + target_cls = hf_select_quant_linear_v2( + bits=quantization_config.bits, + group_size=quantization_config.group_size, + desc_act=False, + sym=False, + format=quantization_config.format, + quant_method=METHOD.AWQ, + zero_point=quantization_config.zero_point, + pack=False, + ) else: from awq.quantize.qmodule import WQLinear @@ -145,17 +132,27 @@ def replace_with_awq_linear( in_features = module.in_features out_features = module.out_features - model._modules[name] = target_cls( - bits=quantization_config.bits, - sym=quantization_config.sym, - desc_act=quantization_config.desc_act, - group_size=quantization_config.group_size, - in_features=in_features, - out_features=out_features, - bias=module.bias is not None, - dev=module.weight.device, - register_buffers=True, - ) + if backend == AwqBackendPackingMethod.GPTQMODEL: + model._modules[name] = target_cls( + bits=quantization_config.bits, + sym=quantization_config.sym, + desc_act=quantization_config.desc_act, + group_size=quantization_config.group_size, + in_features=in_features, + out_features=out_features, + bias=module.bias is not None, + dev=module.weight.device, + register_buffers=True, + ) + else: + model._modules[name] = target_cls( + w_bit=quantization_config.bits, + group_size=quantization_config.group_size, + in_features=in_features, + out_features=out_features, + bias=module.bias is not None, + dev=module.weight.device, + ) has_been_replaced = True # Force requires grad to False to avoid unexpected errors diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index e225716d9c33..926527d1a2d5 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -122,7 +122,7 @@ is_apex_available, is_apollo_torch_available, is_aqlm_available, - is_auto_awq_available, + is_llm_awq_available, is_auto_round_available, is_av_available, is_bitsandbytes_available, diff --git 
a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 7ba71132f7a1..805f0d8b9f04 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -950,7 +950,7 @@ def is_optimum_available() -> bool: @lru_cache -def is_auto_awq_available() -> bool: +def is_llm_awq_available() -> bool: return _is_package_available("awq") diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 574865988f85..05e65bef0984 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -30,7 +30,6 @@ from transformers.utils.import_utils import is_gptqmodel_available from ..utils import ( - is_auto_awq_available, is_compressed_tensors_available, is_hqq_available, is_quark_available, @@ -90,7 +89,7 @@ def from_str(version: str): class AwqBackendPackingMethod(str, Enum): - AUTOAWQ = "autoawq" + GPTQMODEL = "gptqmodel" LLMAWQ = "llm-awq" @@ -717,6 +716,7 @@ def __init__( self.sym = sym self.true_sequential = true_sequential self.checkpoint_format = checkpoint_format.lower() + self.format = self.checkpoint_format self.meta = meta self.backend = backend.lower() if isinstance(backend, str) else backend self.model_seqlen = model_seqlen @@ -808,7 +808,7 @@ class AwqConfig(GPTQConfig): version (`AWQLinearVersion`, *optional*, defaults to `AWQLinearVersion.GEMM`): The version of the quantization algorithm to use. GEMM is better for big batch_size (e.g. >= 8) otherwise, GEMV is better (e.g. < 8 ). GEMM models are compatible with Exllama kernels. - backend (`AwqBackendPackingMethod`, *optional*, defaults to `AwqBackendPackingMethod.AUTOAWQ`): + backend (`AwqBackendPackingMethod`, *optional*, defaults to `AwqBackendPackingMethod.GPTQMODEL`): The quantization backend. Some models might be quantized using `llm-awq` backend. This is useful for users that quantize their own models using `llm-awq` library. 
do_fuse (`bool`, *optional*, defaults to `False`): @@ -833,7 +833,7 @@ def __init__( group_size: int = 128, zero_point: bool = True, version: AWQLinearVersion = AWQLinearVersion.GEMM, - backend: AwqBackendPackingMethod = AwqBackendPackingMethod.AUTOAWQ, + backend: AwqBackendPackingMethod = AwqBackendPackingMethod.GPTQMODEL, exllama_config: dict[str, int] | None = None, modules_to_not_convert: list | None = None, **kwargs, @@ -844,7 +844,7 @@ def __init__( self.exllama_config = exllama_config self.modules_to_not_convert = modules_to_not_convert - super().__init__(bits=bits, group_size=group_size, backend=backend, **kwargs) + super().__init__(bits=bits, group_size=group_size, backend=backend, checkpoint_format=self.version, **kwargs) self.quant_method = QuantizationMethod.AWQ @@ -852,9 +852,9 @@ def post_init(self): r""" Safety checker that arguments are correct """ - if self.backend not in [AwqBackendPackingMethod.AUTOAWQ, AwqBackendPackingMethod.LLMAWQ]: + if self.backend not in [AwqBackendPackingMethod.GPTQMODEL, AwqBackendPackingMethod.LLMAWQ]: raise ValueError( - f"Only supported quantization backends in {AwqBackendPackingMethod.AUTOAWQ} and {AwqBackendPackingMethod.LLMAWQ} - not recognized backend {self.backend}" + f"Only supported quantization backends in {AwqBackendPackingMethod.GPTQMODEL} and {AwqBackendPackingMethod.LLMAWQ} - not recognized backend {self.backend}" ) self.version = AWQLinearVersion.from_str(self.version) From 5e567ec02f78b1bc3f5785579fded59ffb4204c2 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Wed, 26 Nov 2025 07:45:37 +0000 Subject: [PATCH 34/34] Remove the "version" field from AwqConfig Signed-off-by: ZX-ModelCloud --- src/transformers/integrations/awq.py | 34 +----- src/transformers/quantizers/quantizer_awq.py | 64 ++-------- src/transformers/utils/quantization_config.py | 109 +++++------------- 3 files changed, 44 insertions(+), 163 deletions(-) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 9d37a0f64c0c..59dac943c02c 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -22,10 +22,8 @@ from ..modeling_utils import PreTrainedModel from ..utils import is_gptqmodel_available, is_llm_awq_available, is_torch_available, logging from ..utils.quantization_config import ( - AwqBackendPackingMethod, AwqConfig, - AWQLinearVersion, - ExllamaVersion, + AwqBackend, ) @@ -103,7 +101,7 @@ def replace_with_awq_linear( "AWQ (either `llmawq`) is not available. 
Please install it with `pip install gptqmodel` or check out the installation guide in https://github.com/mit-han-lab/llm-awq" ) - if backend == AwqBackendPackingMethod.GPTQMODEL: + if backend != AwqBackend.LLMAWQ: from gptqmodel.utils.importer import hf_select_quant_linear_v2 from gptqmodel.quantization import METHOD target_cls = hf_select_quant_linear_v2( @@ -112,6 +110,7 @@ def replace_with_awq_linear( desc_act=False, sym=False, format=quantization_config.format, + backend=quantization_config.backend, quant_method=METHOD.AWQ, zero_point=quantization_config.zero_point, pack=False, @@ -132,7 +131,7 @@ def replace_with_awq_linear( in_features = module.in_features out_features = module.out_features - if backend == AwqBackendPackingMethod.GPTQMODEL: + if backend != AwqBackend.LLMAWQ: model._modules[name] = target_cls( bits=quantization_config.bits, sym=quantization_config.sym, @@ -170,31 +169,6 @@ def replace_with_awq_linear( return model, has_been_replaced -def post_init_awq_exllama_modules(model, exllama_config): - """ - Runs post init for Exllama layers which performs: - - Weights unpacking, reordering and repacking - - Devices scratch space allocation - """ - - if exllama_config["version"] == ExllamaVersion.ONE: - from gptqmodel.quantization.awq.modules.linear.exllama import exllama_post_init - - model = exllama_post_init(model) - elif exllama_config["version"] == ExllamaVersion.TWO: - from gptqmodel.quantization.awq.modules.linear.exllamav2 import exllamav2_post_init - - model = exllamav2_post_init( - model, - max_input_len=exllama_config["max_input_len"], - max_batch_size=exllama_config["max_batch_size"], - ) - else: - raise ValueError(f"Unrecognized Exllama version: {exllama_config['version']}") - - return model - - def post_init_awq_ipex_modules(model): """ Runs post init for IPEX layers which performs: diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index a6bf4ae4e29f..f7779365cd53 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -23,8 +23,7 @@ from ..modeling_utils import PreTrainedModel from ..utils import is_accelerate_available, is_gptqmodel_available, is_torch_available, logging -from ..utils.quantization_config import AWQLinearVersion - +from ..utils.quantization_config import AwqBackend if is_torch_available(): import torch @@ -40,7 +39,7 @@ class AwqQuantizer(HfQuantizer): # AWQ requires data calibration - we support only inference requires_calibration = True - required_packages = ["awq", "accelerate"] + required_packages = ["gptqmodel", "awq", "accelerate"] def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) @@ -52,48 +51,6 @@ def validate_environment(self, device_map, **kwargs): if not is_accelerate_available(): raise ImportError("Loading an AWQ quantized model requires accelerate (`pip install accelerate`)") - if ( - self.quantization_config.version == AWQLinearVersion.GEMM - and not torch.cuda.is_available() - and not torch.xpu.is_available() - ): - logger.warning_once("No CUDA or XPU found, consider switching to the IPEX version for CPU-only execution.") - self.quantization_config.version = AWQLinearVersion.IPEX - - if self.quantization_config.version == AWQLinearVersion.IPEX: - if version.parse(importlib.metadata.version("gptqmodel")) < version.parse("5.0.0"): - raise RuntimeError( - "To use IPEX backend, you need gptqmodel>5.0.0. Please install the latest version or from source." 
- ) - if device_map is None: - logger.warning_once( - "You have loaded an AWQ model without setting device_map, please set 'cpu' or 'xpu' or 'auto'" - ) - elif isinstance(device_map, dict) and "disk" in device_map.values(): - raise ValueError( - "You are attempting to load an IPEX version AWQ model with a device_map that contains disk device." - " This is not supported. Please make sure only cpu and xpu in the device_map." - ) - else: - if not torch.cuda.is_available() and not torch.xpu.is_available(): - raise RuntimeError( - "GPU is required to run AWQ quantized model. You can use IPEX version AWQ if you have an Intel CPU" - ) - - if device_map is None: - logger.warning_once( - "You have loaded an AWQ model on CPU and have a CUDA/XPU device available, make sure to set " - "your model on a GPU device in order to run your model." - ) - elif device_map is not None: - if isinstance(device_map, dict) and any( - forbidden in device_map.values() for forbidden in ("cpu", torch.device("cpu"), "disk") - ): - raise ValueError( - "You are attempting to load an AWQ model with a device_map that contains a CPU or disk device." - " This is not supported. Please remove the CPU or disk device from the device_map." - ) - def update_dtype(self, dtype): if dtype is None: dtype = torch.float16 @@ -129,18 +86,17 @@ def _process_model_before_weight_loading( ) def _process_model_after_weight_loading(self, model, **kwargs): - if self.quantization_config.version == AWQLinearVersion.EXLLAMA: - from ..integrations import post_init_awq_exllama_modules - - model = post_init_awq_exllama_modules(model, self.quantization_config.exllama_config) - - if self.quantization_config.version == AWQLinearVersion.IPEX: - from ..integrations import post_init_awq_ipex_modules + if self.quantization_config.backend in [AwqBackend.EXLLAMA_V1, AwqBackend.EXLLAMA_V2]: + from gptqmodel.utils.model import hf_gptqmodel_post_init + model = hf_gptqmodel_post_init(model, use_act_order=self.quantization_config.desc_act) - model = post_init_awq_ipex_modules(model) + # if self.quantization_config.version == AWQLinearVersion.IPEX: + # from ..integrations import post_init_awq_ipex_modules + # + # model = post_init_awq_ipex_modules(model) def is_serializable(self, safe_serialization=None): - if self.quantization_config.version == AWQLinearVersion.EXLLAMA: + if self.quantization_config.backend in [AwqBackend.EXLLAMA_V1, AwqBackend.EXLLAMA_V2]: logger.warning("You cannot save an AWQ model that uses Exllama backend!") return False diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 05e65bef0984..92718b3ea8ad 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -66,30 +66,23 @@ class QuantizationMethod(str, Enum): AUTOROUND = "auto-round" MXFP4 = "mxfp4" - -class AWQLinearVersion(str, Enum): +class AwqFormat(str, Enum): GEMM = "gemm" GEMV = "gemv" - EXLLAMA = "exllama" - IPEX = "ipex" - - @staticmethod - def from_str(version: str): - version = version.lower() - if version == "gemm": - return AWQLinearVersion.GEMM - elif version == "gemv": - return AWQLinearVersion.GEMV - elif version == "exllama": - return AWQLinearVersion.EXLLAMA - elif version == "ipex": - return AWQLinearVersion.IPEX - else: - raise ValueError(f"Unknown AWQLinearVersion {version}") - - -class AwqBackendPackingMethod(str, Enum): - GPTQMODEL = "gptqmodel" + GEMV_FAST = "gemv_fast" + +class AwqBackend(str, Enum): + AUTO = "auto" + MACHETE = "machete" + MARLIN = 
"marlin" + EXLLAMA_V2 = "exllama_v2" + EXLLAMA_V1 = "exllama_v1" + GEMM = "gemm" + GEMM_TRITON = "gemm_triton" + GEMV = "gemv" + GEMV_FAST = "gemv_fast" + TORCH_AWQ = "torch_awq" + TORCH_FUSED_AWQ = "torch_fused_awq" LLMAWQ = "llm-awq" @@ -649,7 +642,7 @@ class GPTQConfig(QuantizationConfigMixin): Whether to perform sequential quantization even within a single Transformer block. Instead of quantizing the entire block at once, we perform layer-wise quantization. As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers. - checkpoint_format (`str`, *optional*, defaults to `"gptq"`): + format (`str`, *optional*, defaults to `"gptq"`): GPTQ weight format. `gptq` (v1) is supported by gptqmodel. `gptq_v2` is gptqmodel only. meta (`dict[str, any]`, *optional*): Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta. @@ -692,7 +685,7 @@ def __init__( act_group_aware: bool = True, sym: bool = True, true_sequential: bool = True, - checkpoint_format: str = "gptq", + format: str = "gptq", meta: Optional[dict[str, Any]] = None, backend: Optional[str] = None, model_seqlen: Optional[int] = None, @@ -715,8 +708,7 @@ def __init__( self.act_group_aware = act_group_aware self.sym = sym self.true_sequential = true_sequential - self.checkpoint_format = checkpoint_format.lower() - self.format = self.checkpoint_format + self.format = format.lower() self.meta = meta self.backend = backend.lower() if isinstance(backend, str) else backend self.model_seqlen = model_seqlen @@ -821,10 +813,6 @@ class AwqConfig(GPTQConfig): The list of modules to not quantize, useful for quantizing models that explicitly require to have some modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers). Note you cannot quantize directly with transformers, please refer to `AutoAWQ` documentation for quantizing HF models. - exllama_config (`dict[str, Any]`, *optional*): - You can specify the version of the exllama kernel through the `version` key, the maximum sequence - length through the `max_input_len` key, and the maximum batch size through the `max_batch_size` key. - Defaults to `{"version": 2, "max_input_len": 2048, "max_batch_size": 8}` if unset. 
""" def __init__( @@ -832,19 +820,17 @@ def __init__( bits: int = 4, group_size: int = 128, zero_point: bool = True, - version: AWQLinearVersion = AWQLinearVersion.GEMM, - backend: AwqBackendPackingMethod = AwqBackendPackingMethod.GPTQMODEL, - exllama_config: dict[str, int] | None = None, + backend: AwqBackend = AwqBackend.AUTO, modules_to_not_convert: list | None = None, **kwargs, ): - + format = AwqFormat.GEMM + if kwargs.get("version") is not None: + format = kwargs.pop("version") self.zero_point = zero_point - self.version = version - self.exllama_config = exllama_config self.modules_to_not_convert = modules_to_not_convert - super().__init__(bits=bits, group_size=group_size, backend=backend, checkpoint_format=self.version, **kwargs) + super().__init__(bits=bits, group_size=group_size, backend=backend, format=format, **kwargs) self.quant_method = QuantizationMethod.AWQ @@ -852,26 +838,16 @@ def post_init(self): r""" Safety checker that arguments are correct """ - if self.backend not in [AwqBackendPackingMethod.GPTQMODEL, AwqBackendPackingMethod.LLMAWQ]: - raise ValueError( - f"Only supported quantization backends in {AwqBackendPackingMethod.GPTQMODEL} and {AwqBackendPackingMethod.LLMAWQ} - not recognized backend {self.backend}" - ) - - self.version = AWQLinearVersion.from_str(self.version) - if self.version not in [ - AWQLinearVersion.GEMM, - AWQLinearVersion.GEMV, - AWQLinearVersion.EXLLAMA, - AWQLinearVersion.IPEX, + if self.format not in [ + AwqFormat.GEMM, + AwqFormat.GEMV, + AwqFormat.GEMV_FAST, ]: raise ValueError( - f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA, AWQLinearVersion.IPEX] - not recognized version {self.version}" + f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.GEMV_FAST] - not recognized version {self.format}" ) - # convert vertion to checkpoint_format - self.checkpoint_format = self.version.value - - if self.backend == AwqBackendPackingMethod.LLMAWQ: + if self.backend == AwqBackend.LLMAWQ: # Only cuda device can run this function if not (torch.cuda.is_available() or torch.xpu.is_available()): raise ValueError("LLM-AWQ backend is only supported on CUDA and XPU") @@ -881,34 +857,9 @@ def post_init(self): if major < 8: raise ValueError("LLM-AWQ backend is only supported on CUDA GPUs with compute capability >= 8.0") - if self.version == AWQLinearVersion.EXLLAMA: - gptqmodel_version_supports_awq = False - MIN_GPTQMODEL_SUPPORT_AWQ_VERSION = "5.0.0" - if is_gptqmodel_available(): - gptqmodel_version_supports_awq = version.parse(importlib.metadata.version("gptqmodel")) >= version.parse( - MIN_GPTQMODEL_SUPPORT_AWQ_VERSION - ) - - if not gptqmodel_version_supports_awq: - raise ValueError( - f"You current version of `gptqmodel` does not support awq, " - f"please upgrade `gptqmodel` package to at least {MIN_GPTQMODEL_SUPPORT_AWQ_VERSION}." 
- ) - - if self.exllama_config is None: - self.exllama_config = {"version": ExllamaVersion.TWO, "max_input_len": 2048, "max_batch_size": 8} - else: - if "version" not in self.exllama_config: - raise ValueError("`exllama_config` needs to have a `version` key.") - elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]: - exllama_version = self.exllama_config["version"] - raise ValueError( - f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {exllama_version}" - ) - def get_loading_attributes(self): attributes_dict = copy.deepcopy(self.__dict__) - loading_attributes = ["version", "exllama_config"] + loading_attributes = ["version"] loading_attributes_dict = {i: j for i, j in attributes_dict.items() if i in loading_attributes} return loading_attributes_dict
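After patch 34/34, AWQ checkpoint loading is configured through `AwqConfig` with an `AwqFormat` checkpoint format and an `AwqBackend` kernel selector, replacing the old `version`/`exllama_config` fields, and all non-`llm-awq` paths are routed through gptqmodel's `hf_select_quant_linear_v2()`. The sketch below shows how the reworked config might be used end to end; it assumes the class and enum names exactly as introduced in these patches, a gptqmodel >= 5.0.0 install, and uses a placeholder AWQ checkpoint id.

```python
# Minimal usage sketch, assuming the AwqConfig/AwqBackend API exactly as defined
# in patches 33-34 and gptqmodel >= 5.0.0; the checkpoint id below is only a
# placeholder for any GEMM-format AWQ model on the Hub.
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import AwqBackend, AwqConfig

# The legacy `version="gemm"` kwarg is still accepted and mapped onto the new
# `format` field; kernel selection is delegated to gptqmodel via `backend`.
quant_config = AwqConfig(
    bits=4,
    group_size=128,
    zero_point=True,
    backend=AwqBackend.AUTO,  # let gptqmodel choose the kernel (Marlin, Exllama v2, ...)
)

model_id = "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"  # placeholder AWQ checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,  # overrides the loading attributes stored in the checkpoint config
    device_map="auto",
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```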