Commit e8d64bf

Adapt transformers 4.56.0 (#2274)
Signed-off-by: Kaihui-intel <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 40446a0 commit e8d64bf

File tree

6 files changed: +53 -39 lines changed

neural_compressor/torch/algorithms/weight_only/save_load.py

Lines changed: 4 additions & 1 deletion
@@ -937,7 +937,10 @@ def _init_hf_model(self, model_class, config):
         else:  # pragma: no cover
             assert False, f'`torch_dtype` can be either `torch.dtype` or `"auto"`, but received {torch_dtype}'
 
-        dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
+        if parse(transformers.__version__) >= parse("4.56.0"):
+            dtype_orig = model_class._set_default_dtype(torch_dtype)
+        else:
+            dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
 
         init_contexts = (
             [no_init_weights(_enable=_fast_init)]
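
The guarded call above tracks a rename in transformers 4.56.0, where the private dtype setter _set_default_torch_dtype on the model class became _set_default_dtype. A minimal standalone sketch of the same guard; the helper name is illustrative, not something defined in this repository:

import torch
import transformers
from packaging.version import parse


def set_default_dtype_compat(model_class, torch_dtype: torch.dtype):
    # transformers >= 4.56.0 renamed the private classmethod; both variants set
    # torch's default dtype and return the previously active dtype so callers
    # can restore it afterwards.
    if parse(transformers.__version__) >= parse("4.56.0"):
        return model_class._set_default_dtype(torch_dtype)
    return model_class._set_default_torch_dtype(torch_dtype)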

neural_compressor/torch/utils/utility.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@
 if is_transformers_imported():
     import transformers
 
-    SUPPORTED_LAYERS = [nn.Linear, transformers.modeling_utils.Conv1D]
+    SUPPORTED_LAYERS = [nn.Linear, transformers.pytorch_utils.Conv1D]
 else:
     SUPPORTED_LAYERS = [nn.Conv1d, nn.Linear]
 
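
This hunk (and the matching one in neural_compressor/transformers/quantization/utils.py below) only moves an import: recent transformers releases expose the GPT-2-style Conv1D layer from transformers.pytorch_utils and no longer re-export it from transformers.modeling_utils. If older transformers versions still need to be supported, a hedged fallback import along these lines would work; this is a sketch, not code from the commit:

try:
    from transformers.pytorch_utils import Conv1D  # current location
except ImportError:  # very old transformers only exposed it from modeling_utils
    from transformers.modeling_utils import Conv1D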

neural_compressor/transformers/models/modeling_auto.py

Lines changed: 5 additions & 3 deletions
@@ -393,7 +393,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         # index of the files.
         is_sharded = False
         sharded_metadata = None
-        if transformers.__version__ >= "4.50":
+        if parse(transformers.__version__) >= parse("4.50"):
            from transformers.modeling_utils import _get_resolved_checkpoint_files
 
            gguf_file = kwargs.pop("gguf_file", None)
@@ -635,8 +635,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             torch_dtype = torch.float32
         else:
             assert False, f'`torch_dtype` can be either `torch.dtype` or `"auto"`, but received {torch_dtype}'
-
-        dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
+        if parse(transformers.__version__) >= parse("4.56.0"):
+            dtype_orig = model_class._set_default_dtype(torch_dtype)
+        else:
+            dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
         if quantization_config.compute_dtype is None:
             if use_xpu:
                 quantization_config.compute_dtype = (
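
Besides the dtype-setter guard, the first hunk replaces a plain string comparison of transformers.__version__ with packaging's parse(). String comparison is lexicographic, so multi-digit release segments sort incorrectly; a quick illustration of why the change matters:

from packaging.version import parse

assert not ("4.100.0" >= "4.50")              # lexicographic: '1' sorts before '5'
assert parse("4.100.0") >= parse("4.50")      # release segments: 100 >= 50
assert not (parse("4.56.0.dev0") >= parse("4.56.0"))  # dev pre-releases sort earlier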

neural_compressor/transformers/quantization/utils.py

Lines changed: 1 addition & 1 deletion
@@ -579,7 +579,7 @@ def set_nontext_module_config(model, to_quant_block_names, config):
        set_nontext_module_config(model, to_quant_block_names, config)
 
    for n, m in model.named_modules():
-        if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
+        if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.pytorch_utils.Conv1D):
            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
                config.modules_to_not_convert.append(n)
                print(
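
For context, the loop this hunk touches collects Linear and Conv1D modules whose weight dimensions are not multiples of 32 into modules_to_not_convert, so they are skipped during quantization. A standalone sketch of the same check; the helper name and the multiple parameter are illustrative, not part of the repository:

from torch import nn
from transformers.pytorch_utils import Conv1D


def find_modules_to_not_convert(model: nn.Module, multiple: int = 32):
    # Mirror the check above: any Linear or Conv1D whose weight shape is not
    # divisible by `multiple` in both dimensions is left unquantized.
    skipped = []
    for name, module in model.named_modules():
        if isinstance(module, (nn.Linear, Conv1D)):
            if module.weight.shape[0] % multiple != 0 or module.weight.shape[1] % multiple != 0:
                skipped.append(name)
    return skipped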

test/3x/torch/algorithms/pt2e_quant/test_pt2e_w8a8.py

Lines changed: 19 additions & 16 deletions
@@ -75,24 +75,18 @@ def test_quantizer_on_llm(self):
 
         model_name = "facebook/opt-125m"
         model = AutoModelForCausalLM.from_pretrained(model_name)
+        model_config = model.config
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
+        inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         # example_inputs = (input_ids,)
         # model = export_model_for_pt2e_quant(model, example_inputs=example_inputs)
+        attention_mask = inputs.attention_mask
+        input_ids = inputs.input_ids
+
+
+        from transformers.integrations.executorch import export_with_dynamic_cache
         from transformers import DynamicCache
-        example_inputs = {
-            "input_ids": input_ids,
-            "attention_mask": None,
-            "past_key_values": DynamicCache(),
-            "use_cache": True,
-        }
-        with torch.no_grad():
-            ep = torch.export.export_for_training(
-                model,
-                (),
-                example_inputs,
-                strict=False,
-            )
+        ep = export_with_dynamic_cache(model, input_ids, attention_mask)
         model = ep.module()
         model._exported = True
 
@@ -102,15 +96,24 @@
         prepare_model = w8a8_static_quantizer.prepare(model)
         # calibrate
         for i in range(2):
-            prepare_model(**example_inputs)
+            prepare_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                past_key_values=DynamicCache(config=model_config),
+                use_cache=True,
+            )
         # convert
         converted_model = w8a8_static_quantizer.convert(prepare_model)
         # inference
         from torch._inductor import config
 
         config.freezing = True
         opt_model = torch.compile(converted_model)
-        out = opt_model(**example_inputs)
+        out = opt_model(input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=DynamicCache(config=model_config),
+            use_cache=True,
+        )
         assert out.logits is not None
 
     @patch("neural_compressor.torch.algorithms.pt2e_quant.core.logger.error")
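
The test now delegates the export to transformers' executorch integration and builds a fresh KV cache from the model config for every calibration and inference call (DynamicCache accumulates key/value states, so reusing one instance across calls would mix contexts). A condensed sketch of the new flow; export_with_dynamic_cache and DynamicCache(config=...) follow the diff above, the rest is standard transformers API:

from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
from transformers.integrations.executorch import export_with_dynamic_cache

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Export with a dynamic KV cache instead of hand-building example_inputs and
# calling torch.export.export_for_training directly.
ep = export_with_dynamic_cache(model, inputs.input_ids, inputs.attention_mask)
exported = ep.module()

# Each forward pass gets a fresh cache built from the model config.
out = exported(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    past_key_values=DynamicCache(config=model.config),
    use_cache=True,
)
print(out.logits.shape)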

test/3x/torch/quantization/test_pt2e_quant.py

Lines changed: 23 additions & 17 deletions
@@ -206,23 +206,19 @@ def test_prepare_and_convert_on_llm(self, force_not_import_ipex):
         model_name = "facebook/opt-125m"
         model = AutoModelForCausalLM.from_pretrained(model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
+        model = AutoModelForCausalLM.from_pretrained(model_name)
+        model_config = model.config
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         # example_inputs = (input_ids,)
-        # model = export(model, example_inputs=example_inputs)
+        # model = export_model_for_pt2e_quant(model, example_inputs=example_inputs)
+        attention_mask = inputs.attention_mask
+        input_ids = inputs.input_ids
+
+
+        from transformers.integrations.executorch import export_with_dynamic_cache
         from transformers import DynamicCache
-        example_inputs = {
-            "input_ids": input_ids,
-            "attention_mask": None,
-            "past_key_values": DynamicCache(),
-            "use_cache": True,
-        }
-        with torch.no_grad():
-            ep = torch.export.export_for_training(
-                model,
-                (),
-                example_inputs,
-                strict=False,
-            )
+        ep = export_with_dynamic_cache(model, input_ids, attention_mask)
         model = ep.module()
         model._exported = True
         model.dynamic_shapes = None
@@ -232,15 +228,25 @@ def test_prepare_and_convert_on_llm(self, force_not_import_ipex):
         prepare_model = prepare(model, quant_config)
         # calibrate
         for i in range(2):
-            prepare_model(**example_inputs)
+            prepare_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                past_key_values=DynamicCache(config=model_config),
+                use_cache=True,
+            )
         # convert
         converted_model = convert(prepare_model)
         # inference
         from torch._inductor import config
 
         config.freezing = True
         opt_model = torch.compile(converted_model)
-        out = opt_model(**example_inputs)
+        out = opt_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=DynamicCache(config=model_config),
+            use_cache=True,
+        )
         assert out.logits is not None
 
     @staticmethod
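
Downstream of the export change, the prepare/calibrate/convert/compile flow in both tests is unchanged; only the calibration inputs are built differently. A rough sketch of that tail end, reusing exported, inputs, and model from the previous sketch and assuming quant_config is a default INC static-quant config (its construction is not shown in this diff):

import torch
from transformers import DynamicCache
from neural_compressor.torch.quantization import convert, get_default_static_config, prepare

quant_config = get_default_static_config()  # assumed stand-in for the test's quant_config
prepare_model = prepare(exported, quant_config)

# Calibration: a few forward passes, each with a fresh dynamic cache.
for _ in range(2):
    prepare_model(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        past_key_values=DynamicCache(config=model.config),
        use_cache=True,
    )

converted_model = convert(prepare_model)

# Compile the converted graph with inductor weight freezing, as the tests do.
from torch._inductor import config as inductor_config

inductor_config.freezing = True
opt_model = torch.compile(converted_model)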
