Commit e8d64bf

Adapt transformers 4.56.0 (#2274)
Signed-off-by: Kaihui-intel <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 40446a0 commit e8d64bf

File tree

6 files changed: +53 -39 lines changed

neural_compressor/torch/algorithms/weight_only/save_load.py

Lines changed: 4 additions & 1 deletion
@@ -937,7 +937,10 @@ def _init_hf_model(self, model_class, config):
         else:  # pragma: no cover
             assert False, f'`torch_dtype` can be either `torch.dtype` or `"auto"`, but received {torch_dtype}'
 
-        dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
+        if parse(transformers.__version__) >= parse("4.56.0"):
+            dtype_orig = model_class._set_default_dtype(torch_dtype)
+        else:
+            dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
 
         init_contexts = (
             [no_init_weights(_enable=_fast_init)]
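
The guarded call above tracks a rename in transformers 4.56.0, where the private dtype setter _set_default_torch_dtype on the model class became _set_default_dtype. A minimal standalone sketch of the same guard; the helper name is illustrative, not something defined in this repository:

import torch
import transformers
from packaging.version import parse


def set_default_dtype_compat(model_class, torch_dtype: torch.dtype):
    # transformers >= 4.56.0 renamed the private classmethod; both variants set
    # torch's default dtype and return the previously active dtype so callers
    # can restore it afterwards.
    if parse(transformers.__version__) >= parse("4.56.0"):
        return model_class._set_default_dtype(torch_dtype)
    return model_class._set_default_torch_dtype(torch_dtype)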

neural_compressor/torch/utils/utility.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@
 if is_transformers_imported():
     import transformers
 
-    SUPPORTED_LAYERS = [nn.Linear, transformers.modeling_utils.Conv1D]
+    SUPPORTED_LAYERS = [nn.Linear, transformers.pytorch_utils.Conv1D]
 else:
     SUPPORTED_LAYERS = [nn.Conv1d, nn.Linear]
 
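
This hunk (and the matching one in neural_compressor/transformers/quantization/utils.py below) only moves an import: recent transformers releases expose the GPT-2-style Conv1D layer from transformers.pytorch_utils and no longer re-export it from transformers.modeling_utils. If older transformers versions still need to be supported, a hedged fallback import along these lines would work; this is a sketch, not code from the commit:

try:
    from transformers.pytorch_utils import Conv1D  # current location
except ImportError:  # very old transformers only exposed it from modeling_utils
    from transformers.modeling_utils import Conv1D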

neural_compressor/transformers/models/modeling_auto.py

Lines changed: 5 additions & 3 deletions
@@ -393,7 +393,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         # index of the files.
         is_sharded = False
         sharded_metadata = None
-        if transformers.__version__ >= "4.50":
+        if parse(transformers.__version__) >= parse("4.50"):
            from transformers.modeling_utils import _get_resolved_checkpoint_files
 
            gguf_file = kwargs.pop("gguf_file", None)
@@ -635,8 +635,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             torch_dtype = torch.float32
         else:
             assert False, f'`torch_dtype` can be either `torch.dtype` or `"auto"`, but received {torch_dtype}'
-
-        dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
+        if parse(transformers.__version__) >= parse("4.56.0"):
+            dtype_orig = model_class._set_default_dtype(torch_dtype)
+        else:
+            dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
         if quantization_config.compute_dtype is None:
             if use_xpu:
                 quantization_config.compute_dtype = (
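
Besides the dtype-setter guard, the first hunk replaces a plain string comparison of transformers.__version__ with packaging's parse(). String comparison is lexicographic, so multi-digit release segments sort incorrectly; a quick illustration of why the change matters:

from packaging.version import parse

assert not ("4.100.0" >= "4.50")              # lexicographic: '1' sorts before '5'
assert parse("4.100.0") >= parse("4.50")      # release segments: 100 >= 50
assert not (parse("4.56.0.dev0") >= parse("4.56.0"))  # dev pre-releases sort earlier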

neural_compressor/transformers/quantization/utils.py

Lines changed: 1 addition & 1 deletion
@@ -579,7 +579,7 @@ def set_nontext_module_config(model, to_quant_block_names, config):
        set_nontext_module_config(model, to_quant_block_names, config)
 
    for n, m in model.named_modules():
-        if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
+        if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.pytorch_utils.Conv1D):
            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
                config.modules_to_not_convert.append(n)
                print(
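
For context, the loop this hunk touches collects Linear and Conv1D modules whose weight dimensions are not multiples of 32 into modules_to_not_convert, so they are skipped during quantization. A standalone sketch of the same check; the helper name and the multiple parameter are illustrative, not part of the repository:

from torch import nn
from transformers.pytorch_utils import Conv1D


def find_modules_to_not_convert(model: nn.Module, multiple: int = 32):
    # Mirror the check above: any Linear or Conv1D whose weight shape is not
    # divisible by `multiple` in both dimensions is left unquantized.
    skipped = []
    for name, module in model.named_modules():
        if isinstance(module, (nn.Linear, Conv1D)):
            if module.weight.shape[0] % multiple != 0 or module.weight.shape[1] % multiple != 0:
                skipped.append(name)
    return skipped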

test/3x/torch/algorithms/pt2e_quant/test_pt2e_w8a8.py

Lines changed: 19 additions & 16 deletions
@@ -75,24 +75,18 @@ def test_quantizer_on_llm(self):
 
         model_name = "facebook/opt-125m"
         model = AutoModelForCausalLM.from_pretrained(model_name)
+        model_config = model.config
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
+        inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         # example_inputs = (input_ids,)
         # model = export_model_for_pt2e_quant(model, example_inputs=example_inputs)
+        attention_mask = inputs.attention_mask
+        input_ids = inputs.input_ids
+
+
+        from transformers.integrations.executorch import export_with_dynamic_cache
         from transformers import DynamicCache
-        example_inputs = {
-            "input_ids": input_ids,
-            "attention_mask": None,
-            "past_key_values": DynamicCache(),
-            "use_cache": True,
-        }
-        with torch.no_grad():
-            ep = torch.export.export_for_training(
-                model,
-                (),
-                example_inputs,
-                strict=False,
-            )
+        ep = export_with_dynamic_cache(model, input_ids, attention_mask)
         model = ep.module()
         model._exported = True
 
@@ -102,15 +96,24 @@
         prepare_model = w8a8_static_quantizer.prepare(model)
         # calibrate
         for i in range(2):
-            prepare_model(**example_inputs)
+            prepare_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                past_key_values=DynamicCache(config=model_config),
+                use_cache=True,
+            )
         # convert
         converted_model = w8a8_static_quantizer.convert(prepare_model)
         # inference
         from torch._inductor import config
 
         config.freezing = True
         opt_model = torch.compile(converted_model)
-        out = opt_model(**example_inputs)
+        out = opt_model(input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=DynamicCache(config=model_config),
+            use_cache=True,
+        )
         assert out.logits is not None
 
     @patch("neural_compressor.torch.algorithms.pt2e_quant.core.logger.error")
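
The test now delegates the export to transformers' executorch integration and builds a fresh KV cache from the model config for every calibration and inference call (DynamicCache accumulates key/value states, so reusing one instance across calls would mix contexts). A condensed sketch of the new flow; export_with_dynamic_cache and DynamicCache(config=...) follow the diff above, the rest is standard transformers API:

from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
from transformers.integrations.executorch import export_with_dynamic_cache

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Export with a dynamic KV cache instead of hand-building example_inputs and
# calling torch.export.export_for_training directly.
ep = export_with_dynamic_cache(model, inputs.input_ids, inputs.attention_mask)
exported = ep.module()

# Each forward pass gets a fresh cache built from the model config.
out = exported(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    past_key_values=DynamicCache(config=model.config),
    use_cache=True,
)
print(out.logits.shape)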

test/3x/torch/quantization/test_pt2e_quant.py

Lines changed: 23 additions & 17 deletions
@@ -206,23 +206,19 @@ def test_prepare_and_convert_on_llm(self, force_not_import_ipex):
         model_name = "facebook/opt-125m"
         model = AutoModelForCausalLM.from_pretrained(model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
+        model = AutoModelForCausalLM.from_pretrained(model_name)
+        model_config = model.config
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         # example_inputs = (input_ids,)
-        # model = export(model, example_inputs=example_inputs)
+        # model = export_model_for_pt2e_quant(model, example_inputs=example_inputs)
+        attention_mask = inputs.attention_mask
+        input_ids = inputs.input_ids
+
+
+        from transformers.integrations.executorch import export_with_dynamic_cache
         from transformers import DynamicCache
-        example_inputs = {
-            "input_ids": input_ids,
-            "attention_mask": None,
-            "past_key_values": DynamicCache(),
-            "use_cache": True,
-        }
-        with torch.no_grad():
-            ep = torch.export.export_for_training(
-                model,
-                (),
-                example_inputs,
-                strict=False,
-            )
+        ep = export_with_dynamic_cache(model, input_ids, attention_mask)
         model = ep.module()
         model._exported = True
         model.dynamic_shapes = None
@@ -232,15 +228,25 @@ def test_prepare_and_convert_on_llm(self, force_not_import_ipex):
         prepare_model = prepare(model, quant_config)
         # calibrate
         for i in range(2):
-            prepare_model(**example_inputs)
+            prepare_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                past_key_values=DynamicCache(config=model_config),
+                use_cache=True,
+            )
         # convert
         converted_model = convert(prepare_model)
         # inference
         from torch._inductor import config
 
         config.freezing = True
         opt_model = torch.compile(converted_model)
-        out = opt_model(**example_inputs)
+        out = opt_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=DynamicCache(config=model_config),
+            use_cache=True,
+        )
         assert out.logits is not None
 
     @staticmethod
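
Downstream of the export change, the prepare/calibrate/convert/compile flow in both tests is unchanged; only the calibration inputs are built differently. A rough sketch of that tail end, reusing exported, inputs, and model from the previous sketch and assuming quant_config is a default INC static-quant config (its construction is not shown in this diff):

import torch
from transformers import DynamicCache
from neural_compressor.torch.quantization import convert, get_default_static_config, prepare

quant_config = get_default_static_config()  # assumed stand-in for the test's quant_config
prepare_model = prepare(exported, quant_config)

# Calibration: a few forward passes, each with a fresh dynamic cache.
for _ in range(2):
    prepare_model(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        past_key_values=DynamicCache(config=model.config),
        use_cache=True,
    )

converted_model = convert(prepare_model)

# Compile the converted graph with inductor weight freezing, as the tests do.
from torch._inductor import config as inductor_config

inductor_config.freezing = True
opt_model = torch.compile(converted_model)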
