From 0e30c2efc10875c3e667163e1d73e7e6de933468 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Thu, 15 May 2025 18:52:12 +0000
Subject: [PATCH 01/12] Upgrading onnx, onnxruntime and onnxscript

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 648d2ce4e..4ca821335 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,11 +28,11 @@ dependencies = [
     "multidict==6.0.4",
     "urllib3<2",
     "sentencepiece==0.2.0",
-    "onnx==1.16.0",
-    "onnxruntime==1.16.3",
+    "onnx==1.17.0",
+    "onnxruntime==1.22.0",
     "numpy==1.26.4",
     "protobuf==3.20.2",
-    "onnxscript==0.1.0.dev20240327",
+    "onnxscript==0.2.5",
     "pillow===10.4.0",
     "sympy",
     "tensorboard",

From c7e894bebd8396ed652066eb3594976f07323879 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Thu, 15 May 2025 19:48:38 +0000
Subject: [PATCH 02/12] Minor updates to the onnx and onnxruntime pins

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4ca821335..348632da0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,8 +28,8 @@ dependencies = [
     "multidict==6.0.4",
     "urllib3<2",
     "sentencepiece==0.2.0",
-    "onnx==1.17.0",
-    "onnxruntime==1.22.0",
+    "onnx==1.18.0",
+    "onnxruntime==1.22",
     "numpy==1.26.4",
     "protobuf==3.20.2",
     "onnxscript==0.2.5",

From 3d78714c8758b891970c65e522aa5d1fdc36f5e4 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Fri, 16 May 2025 09:00:23 +0000
Subject: [PATCH 03/12] Updating protobuf

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 348632da0..3be995b89 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
     "onnx==1.18.0",
     "onnxruntime==1.22",
     "numpy==1.26.4",
-    "protobuf==3.20.2",
+    "protobuf==4.25.1",
     "onnxscript==0.2.5",
     "pillow===10.4.0",
     "sympy",

From 36124e043812be3f5e1c689c3096d703c301caec Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Wed, 21 May 2025 12:15:02 +0000
Subject: [PATCH 04/12] Upgrading protobuf to 6.31.0

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3be995b89..7c2b2d072 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
     "onnx==1.18.0",
     "onnxruntime==1.22",
     "numpy==1.26.4",
-    "protobuf==4.25.1",
+    "protobuf==6.31.0",
     "onnxscript==0.2.5",
     "pillow===10.4.0",
     "sympy",

From 8ab8d782b2901e6ac0e0e839207604a6a8e5b3e3 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Thu, 22 May 2025 07:00:53 +0000
Subject: [PATCH 05/12] Fixing CI: updating the expected ONNX printer output

Signed-off-by: Dipankar Sarkar
---
 tests/peft/test_peft_onnx_transforms.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/peft/test_peft_onnx_transforms.py b/tests/peft/test_peft_onnx_transforms.py
index f8521deb1..0248dae3b 100644
--- a/tests/peft/test_peft_onnx_transforms.py
+++ b/tests/peft/test_peft_onnx_transforms.py
@@ -46,6 +46,7 @@ def test_adapter_weights_to_inputs_transform():
     out_onnx, transformed = AdapterWeightsToInputsTransform.apply(test_onnx, adapter_name=adapter_name)
     assert transformed
+
     assert (
         onnx.printer.to_text(out_onnx)
         == textwrap.dedent("""
@@ -53,11 +54,11 @@
        <
            ir_version: 8,
            opset_import: ["" : 17]
        >
-        test_adapter_weights (float[n,32] input, float[32,32] layer1.weight, float[32,32] layer2.weight) => (float[n,32] output, float[32,32] layer1.weight_RetainedState, float[32,32] layer2.weight_RetainedState) {
-            layer1output = MatMul (input, layer1.weight)
-            output = MatMul (layer1output, layer2.weight)
-            layer1.weight_RetainedState = Identity (layer1.weight)
-            layer2.weight_RetainedState = Identity (layer2.weight)
+        test_adapter_weights (float[n,32] input, float[32,32] "layer1.weight", float[32,32] "layer2.weight") => (float[n,32] output, float[32,32] "layer1.weight_RetainedState", float[32,32] "layer2.weight_RetainedState") {
+            layer1output = MatMul (input, "layer1.weight")
+            output = MatMul (layer1output, "layer2.weight")
+            ["layer1.weight_identity"] "layer1.weight_RetainedState" = Identity ("layer1.weight")
+            ["layer2.weight_identity"] "layer2.weight_RetainedState" = Identity ("layer2.weight")
        }
        """).strip()
     )

From 8161431c17123bed433cdc371f817f4ddebacccd Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Fri, 23 May 2025 08:52:11 +0000
Subject: [PATCH 06/12] Upgrading transformers to 4.51.3

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7c2b2d072..9700ea9db 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
 ]
 requires-python = ">=3.8,<3.11"
 dependencies = [
-    "transformers==4.50.0",
+    "transformers==4.51.3",
     "huggingface-hub==0.27.0",
     "hf_transfer==0.1.9",
     "peft==0.13.2",

From d41e948ad560f4170dcd2e89b175ff0774bd75e7 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Fri, 23 May 2025 10:46:49 +0000
Subject: [PATCH 07/12] Updating huggingface-hub to 0.30.0

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml                            |   2 +-
 tests/transformers/models/qnn_config.json |   0
 .../models/test_causal_lm_models.py        | 265 +++++++++---------
 3 files changed, 133 insertions(+), 134 deletions(-)
 create mode 100644 tests/transformers/models/qnn_config.json

diff --git a/pyproject.toml b/pyproject.toml
index 9700ea9db..b2009efbb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ classifiers = [
 requires-python = ">=3.8,<3.11"
 dependencies = [
     "transformers==4.51.3",
-    "huggingface-hub==0.27.0",
+    "huggingface-hub==0.30.0",
     "hf_transfer==0.1.9",
     "peft==0.13.2",
     "datasets==2.20.0",
diff --git a/tests/transformers/models/qnn_config.json b/tests/transformers/models/qnn_config.json
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index 29598f870..8e1b9d6c7 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -12,7 +12,6 @@
 import pytest
 from transformers import AutoModelForCausalLM
 
-from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
 from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers
 from QEfficient.utils import hf_download
@@ -22,40 +21,40 @@ from QEfficient.utils.run_utils import ApiRunner
 
 test_models_qaic = [
-    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    "gpt2",
-    "Salesforce/codegen-350M-mono",
-    "microsoft/Phi-3-mini-4k-instruct",
+    # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    # "gpt2",
+    # "Salesforce/codegen-350M-mono",
+    # "microsoft/Phi-3-mini-4k-instruct",
     "tiiuae/falcon-7b",
-    "Qwen/Qwen2-0.5B",
-    "bigcode/starcoder2-3b",
-    "Felladrin/Minueza-32M-Base",
-    "wtang06/mpt-125m-c4",
-    "hakurei/gpt-j-random-tinier",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "meta-llama/Llama-3.2-1B",
-    "unsloth/gemma-2b",
-    "unsloth/gemma-2-2b",
-    "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",  # AWQ model
-    "TheBloke/Llama-2-7B-GPTQ",  # GPTQ model
-    "ibm-granite/granite-20b-code-base",
-    # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
-    "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
-    "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
-    "ibm-granite/granite-3.1-2b-instruct",
-    "ibm-granite/granite-guardian-3.1-2b",
+    # "Qwen/Qwen2-0.5B",
+    # "bigcode/starcoder2-3b",
+    # "Felladrin/Minueza-32M-Base",
+    # "wtang06/mpt-125m-c4",
+    # "hakurei/gpt-j-random-tinier",
+    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    # "meta-llama/Llama-3.2-1B",
+    # "unsloth/gemma-2b",
+    # "unsloth/gemma-2-2b",
+    # "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",  # AWQ model
+    # "TheBloke/Llama-2-7B-GPTQ",  # GPTQ model
+    # "ibm-granite/granite-20b-code-base",
+    # # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
+    # "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
+    # "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
+    # "ibm-granite/granite-3.1-2b-instruct",
+    # "ibm-granite/granite-guardian-3.1-2b",
 ]
 
 test_models_qnn = [
-    "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "meta-llama/Llama-3.2-1B",
-    "unsloth/gemma-2b",
-    "ibm-granite/granite-guardian-3.1-2b",
+    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    # "meta-llama/Llama-3.2-1B",
+    # "unsloth/gemma-2b",
+    # "ibm-granite/granite-guardian-3.1-2b",
 ]
 
 spd_test_models = [
-    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    "Qwen/Qwen2-0.5B",
+    # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    # "Qwen/Qwen2-0.5B",
 ]
 
@@ -215,33 +214,33 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
 
 
 # FIXME: there should be a CB test here
-@pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x)
-def test_causal_lm_export_with_deprecated_api(model_name):
-    model_config = {"model_name": model_name}
-    model_config["n_layer"] = 1
-    model, _ = load_causal_lm_model(model_config)
-    tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
-    qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
-    new_api_onnx_model_path = qeff_model.export()
-    _, old_api_onnx_model_path = qualcomm_efficient_converter(
-        model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer
-    )
-
-    api_runner = ApiRunner(
-        batch_size=1,
-        tokenizer=tokenizer,
-        config=model.config,
-        prompt=Constants.INPUT_STR,
-        prompt_len=Constants.PROMPT_LEN,
-        ctx_len=Constants.CTX_LEN,
-    )
-
-    new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
-    old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
-
-    assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
-        "New API output does not match old API output for ONNX export function"
-    )
+# @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x)
+# def test_causal_lm_export_with_deprecated_api(model_name):
+#     model_config = {"model_name": model_name}
+#     model_config["n_layer"] = 1
+#     model, _ = load_causal_lm_model(model_config)
+#     tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
+#     qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
+#     new_api_onnx_model_path = qeff_model.export()
+#     _, old_api_onnx_model_path = qualcomm_efficient_converter(
+#         model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer
+#     )
+
+#     api_runner = ApiRunner(
+#         batch_size=1,
+#         tokenizer=tokenizer,
+#         config=model.config,
+#         prompt=Constants.INPUT_STR,
+#         prompt_len=Constants.PROMPT_LEN,
+#         ctx_len=Constants.CTX_LEN,
+#     )
+
+#     new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
+#     old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
+
+#     assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
+#         "New API output does not match old API output for ONNX export function"
+#     )
 
 
 @pytest.mark.on_qaic
@@ -260,84 +259,84 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)
 
 
-@pytest.mark.on_qaic
-@pytest.mark.qnn
-@pytest.mark.parametrize("model_name", test_models_qnn)
-def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name):
-    """
-    QNN Compilation Test
-    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
-    ``Mandatory`` Args:
-        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
-    """
-    if model_name == "microsoft/Phi-3-mini-4k-instruct":
-        n_layer = 2  # test only 2 layer models
-    else:
-        n_layer = 1
-
-    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-        model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path
-    )
-
-
-@pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
-@pytest.mark.on_qaic
-@pytest.mark.parametrize("model_name", spd_test_models)
-def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
-    """
-    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
-    ``Mandatory`` Args:
-        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
-    """
-
-    if model_name == "microsoft/Phi-3-mini-4k-instruct":
-        n_layer = 2  # test only 2 layer models
-    else:
-        n_layer = 1
-
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-        model_name=model_name, n_layer=n_layer, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS
-    )
-
-
-@pytest.mark.on_qaic
-def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
-    """
-    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
-    """
-    model_name = "gpt2"
-    prompt_len = 1
-
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
-
-
-@pytest.mark.on_qaic
-@pytest.mark.qnn
-def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn():
-    """
-    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
-    """
-    model_name = "gpt2"
-    prompt_len = 1
-
-    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-        model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path
-    )
-
-
-@pytest.mark.on_qaic
-def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100():
-    model_name = "gpt2"
-    n_layer = 1
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True)
-
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False)
+# @pytest.mark.on_qaic
+# @pytest.mark.qnn
+# @pytest.mark.parametrize("model_name", test_models_qnn)
+# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name):
+#     """
+#     QNN Compilation Test
+#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+#     ``Mandatory`` Args:
+#         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+#     """
+#     if model_name == "microsoft/Phi-3-mini-4k-instruct":
+#         n_layer = 2  # test only 2 layer models
+#     else:
+#         n_layer = 1
+
+#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+#         model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path
+#     )
+
+
+# @pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
+# @pytest.mark.on_qaic
+# @pytest.mark.parametrize("model_name", spd_test_models)
+# def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
+#     """
+#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+#     ``Mandatory`` Args:
+#         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+#     """
+
+#     if model_name == "microsoft/Phi-3-mini-4k-instruct":
+#         n_layer = 2  # test only 2 layer models
+#     else:
+#         n_layer = 1
+
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+#         model_name=model_name, n_layer=n_layer, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS
+#     )
+
+
+# @pytest.mark.on_qaic
+# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
+#     """
+#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
+#     """
+#     model_name = "gpt2"
+#     prompt_len = 1
+
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
+
+
+# @pytest.mark.on_qaic
+# @pytest.mark.qnn
+# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn():
+#     """
+#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
+#     """
+#     model_name = "gpt2"
+#     prompt_len = 1
+
+#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+#         model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path
+#     )
+
+
+# @pytest.mark.on_qaic
+# def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100():
+#     model_name = "gpt2"
+#     n_layer = 1
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True)
+
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False)
 
 
 @pytest.mark.on_qaic

From eec0631ee8354783cb9177eba184277f7449e68b Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Tue, 27 May 2025 06:36:03 +0000
Subject: [PATCH 08/12] Upgrading transformers to 4.52.3

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b2009efbb..7ebd9092e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
 ]
 requires-python = ">=3.8,<3.11"
 dependencies = [
-    "transformers==4.51.3",
+    "transformers==4.52.3",
     "huggingface-hub==0.30.0",
     "hf_transfer==0.1.9",
     "peft==0.13.2",

From a3ddf64b49d71753659d4e90c0c33c7fbabdd31e Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Tue, 27 May 2025 06:58:41 +0000
Subject: [PATCH 09/12] Cleanup 1: restoring the commented-out tests

Signed-off-by: Dipankar Sarkar
---
 .../models/test_causal_lm_models.py | 265 +++++++++---------
 1 file changed, 133 insertions(+), 132 deletions(-)

diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index 8e1b9d6c7..29598f870 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -12,6 +12,7 @@
 import pytest
 from transformers import AutoModelForCausalLM
 
+from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
 from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers
 from QEfficient.utils import hf_download
@@ -21,40 +22,40 @@ from QEfficient.utils.run_utils import ApiRunner
 
 test_models_qaic = [
-    # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    # "gpt2",
-    # "Salesforce/codegen-350M-mono",
-    # "microsoft/Phi-3-mini-4k-instruct",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "gpt2",
+    "Salesforce/codegen-350M-mono",
+    "microsoft/Phi-3-mini-4k-instruct",
     "tiiuae/falcon-7b",
-    # "Qwen/Qwen2-0.5B",
-    # "bigcode/starcoder2-3b",
-    # "Felladrin/Minueza-32M-Base",
-    # "wtang06/mpt-125m-c4",
-    # "hakurei/gpt-j-random-tinier",
-    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    # "meta-llama/Llama-3.2-1B",
-    # "unsloth/gemma-2b",
-    # "unsloth/gemma-2-2b",
-    # "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",  # AWQ model
-    # "TheBloke/Llama-2-7B-GPTQ",  # GPTQ model
-    # "ibm-granite/granite-20b-code-base",
-    # # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
-    # "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
-    # "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
-    # "ibm-granite/granite-3.1-2b-instruct",
-    # "ibm-granite/granite-guardian-3.1-2b",
+    "Qwen/Qwen2-0.5B",
+    "bigcode/starcoder2-3b",
+    "Felladrin/Minueza-32M-Base",
+    "wtang06/mpt-125m-c4",
+    "hakurei/gpt-j-random-tinier",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "meta-llama/Llama-3.2-1B",
+    "unsloth/gemma-2b",
+    "unsloth/gemma-2-2b",
+    "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",  # AWQ model
+    "TheBloke/Llama-2-7B-GPTQ",  # GPTQ model
+    "ibm-granite/granite-20b-code-base",
+    # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
+    "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
+    "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
+    "ibm-granite/granite-3.1-2b-instruct",
+    "ibm-granite/granite-guardian-3.1-2b",
 ]
 
 test_models_qnn = [
-    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    # "meta-llama/Llama-3.2-1B",
-    # "unsloth/gemma-2b",
-    # "ibm-granite/granite-guardian-3.1-2b",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "meta-llama/Llama-3.2-1B",
+    "unsloth/gemma-2b",
+    "ibm-granite/granite-guardian-3.1-2b",
 ]
 
 spd_test_models = [
-    # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    # "Qwen/Qwen2-0.5B",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "Qwen/Qwen2-0.5B",
 ]
 
@@ -214,33 +215,33 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
 
 
 # FIXME: there should be a CB test here
-# @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x)
-# def test_causal_lm_export_with_deprecated_api(model_name):
-#     model_config = {"model_name": model_name}
-#     model_config["n_layer"] = 1
-#     model, _ = load_causal_lm_model(model_config)
-#     tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
-#     qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
-#     new_api_onnx_model_path = qeff_model.export()
-#     _, old_api_onnx_model_path = qualcomm_efficient_converter(
-#         model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer
-#     )
-
-#     api_runner = ApiRunner(
-#         batch_size=1,
-#         tokenizer=tokenizer,
-#         config=model.config,
-#         prompt=Constants.INPUT_STR,
-#         prompt_len=Constants.PROMPT_LEN,
-#         ctx_len=Constants.CTX_LEN,
-#     )
-
-#     new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
-#     old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
-
-#     assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
-#         "New API output does not match old API output for ONNX export function"
-#     )
+@pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x)
+def test_causal_lm_export_with_deprecated_api(model_name):
+    model_config = {"model_name": model_name}
+    model_config["n_layer"] = 1
+    model, _ = load_causal_lm_model(model_config)
+    tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
+    qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
+    new_api_onnx_model_path = qeff_model.export()
+    _, old_api_onnx_model_path = qualcomm_efficient_converter(
+        model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer
+    )
+
+    api_runner = ApiRunner(
+        batch_size=1,
+        tokenizer=tokenizer,
+        config=model.config,
+        prompt=Constants.INPUT_STR,
+        prompt_len=Constants.PROMPT_LEN,
+        ctx_len=Constants.CTX_LEN,
+    )
+
+    new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
+    old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
+
+    assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
+        "New API output does not match old API output for ONNX export function"
+    )
 
 
 @pytest.mark.on_qaic
@@ -259,84 +260,84 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)
 
 
-# @pytest.mark.on_qaic
-# @pytest.mark.qnn
-# @pytest.mark.parametrize("model_name", test_models_qnn)
-# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name):
-#     """
-#     QNN Compilation Test
-#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
-#     ``Mandatory`` Args:
-#         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
-#     """
-#     if model_name == "microsoft/Phi-3-mini-4k-instruct":
-#         n_layer = 2  # test only 2 layer models
-#     else:
-#         n_layer = 1
-
-#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-#         model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path
-#     )
-
-
-# @pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
-# @pytest.mark.on_qaic
-# @pytest.mark.parametrize("model_name", spd_test_models)
-# def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
-#     """
-#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
-#     ``Mandatory`` Args:
-#         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
-#     """
-
-#     if model_name == "microsoft/Phi-3-mini-4k-instruct":
-#         n_layer = 2  # test only 2 layer models
-#     else:
-#         n_layer = 1
-
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-#         model_name=model_name, n_layer=n_layer, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS
-#     )
-
-
-# @pytest.mark.on_qaic
-# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
-#     """
-#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
-#     """
-#     model_name = "gpt2"
-#     prompt_len = 1
-
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
-
-
-# @pytest.mark.on_qaic
-# @pytest.mark.qnn
-# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn():
-#     """
-#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
-#     """
-#     model_name = "gpt2"
-#     prompt_len = 1
-
-#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-#         model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path
-#     )
-
-
-# @pytest.mark.on_qaic
-# def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100():
-#     model_name = "gpt2"
-#     n_layer = 1
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True)
-
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False)
+@pytest.mark.on_qaic
+@pytest.mark.qnn
+@pytest.mark.parametrize("model_name", test_models_qnn)
+def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name):
+    """
+    QNN Compilation Test
+    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+    """
+    if model_name == "microsoft/Phi-3-mini-4k-instruct":
+        n_layer = 2  # test only 2 layer models
+    else:
+        n_layer = 1
+
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+        model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path
+    )
+
+
+@pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
+@pytest.mark.on_qaic
+@pytest.mark.parametrize("model_name", spd_test_models)
+def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
+    """
+    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+    """
+
+    if model_name == "microsoft/Phi-3-mini-4k-instruct":
+        n_layer = 2  # test only 2 layer models
+    else:
+        n_layer = 1
+
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+        model_name=model_name, n_layer=n_layer, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS
+    )
+
+
+@pytest.mark.on_qaic
+def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
+    """
+    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
+    """
+    model_name = "gpt2"
+    prompt_len = 1
+
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
+
+
+@pytest.mark.on_qaic
+@pytest.mark.qnn
+def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn():
+    """
+    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
+    """
+    model_name = "gpt2"
+    prompt_len = 1
+
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+        model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path
+    )
+
+
+@pytest.mark.on_qaic
+def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100():
+    model_name = "gpt2"
+    n_layer = 1
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True)
+
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False)
 
 
 @pytest.mark.on_qaic

From b65bee03fd4e07ee83ceac7f7258da3016dd1497 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Thu, 5 Jun 2025 18:05:43 +0000
Subject: [PATCH 10/12] Reverting the transformers and huggingface-hub upgrades

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7ebd9092e..7c2b2d072 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,8 +19,8 @@ classifiers = [
 ]
 requires-python = ">=3.8,<3.11"
 dependencies = [
-    "transformers==4.52.3",
-    "huggingface-hub==0.30.0",
+    "transformers==4.50.0",
+    "huggingface-hub==0.27.0",
     "hf_transfer==0.1.9",
     "peft==0.13.2",
     "datasets==2.20.0",

From 8a1681126a0414e46837f0a20589ad7ffb6dd490 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Thu, 5 Jun 2025 18:14:50 +0000
Subject: [PATCH 11/12] Cleanup 2: removing the empty qnn_config.json

Signed-off-by: Dipankar Sarkar
---
 tests/transformers/models/qnn_config.json | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 tests/transformers/models/qnn_config.json

diff --git a/tests/transformers/models/qnn_config.json b/tests/transformers/models/qnn_config.json
deleted file mode 100644
index e69de29bb..000000000

From d067a6cf5d8e41800f205f378b1aca85653f6650 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Fri, 6 Jun 2025 05:21:37 +0000
Subject: [PATCH 12/12] Cleanup 3: restoring the hf_transfer dependency

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 7c2b2d072..ee760734c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ requires-python = ">=3.8,<3.11"
 dependencies = [
     "transformers==4.50.0",
     "huggingface-hub==0.27.0",
+    "hf_transfer==0.1.9",
     "peft==0.13.2",
     "datasets==2.20.0",
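
Notes on the series:

The PATCH 05 expectation changes because newer onnx releases print the model text differently: value names that are not bare identifiers (the dotted "layer1.weight"-style names above) come out quoted, and node names are emitted in square brackets. A small sketch of the quoting behavior, assuming onnx 1.18 is installed; the graph and names here are illustrative, not taken from the QEfficient test suite:

    import onnx
    from onnx import TensorProto, helper

    inp = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["n", 32])
    w1 = helper.make_tensor_value_info("layer1.weight", TensorProto.FLOAT, [32, 32])
    out = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["n", 32])

    # A single MatMul whose weight input has a dot in its name, mirroring the
    # adapter-weight naming exercised by test_adapter_weights_to_inputs_transform.
    node = helper.make_node("MatMul", ["input", "layer1.weight"], ["output"])
    graph = helper.make_graph([node], "demo", [inp, w1], [out])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 17)])

    # Under onnx 1.16 the name printed bare (layer1.weight); under 1.18 it
    # prints quoted ("layer1.weight"), which is what the dedented expected
    # string in the test had to track.
    print(onnx.printer.to_text(model))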
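The series lands with transformers and huggingface-hub back at their pre-series pins while the ONNX stack stays upgraded. A quick environment check against the final pins can catch a stale install; this is a hypothetical helper, not part of the series, with the expected versions read off the final pyproject.toml state above:

    from importlib.metadata import PackageNotFoundError, version

    # Final pins after PATCH 10-12. onnxruntime is pinned without a patch
    # digit ("1.22"), so a prefix match is used for it.
    expected = {
        "onnx": "1.18.0",
        "onnxruntime": "1.22",
        "onnxscript": "0.2.5",
        "protobuf": "6.31.0",
        "transformers": "4.50.0",
        "huggingface-hub": "0.27.0",  # dist name; metadata lookup normalizes - and _
    }

    for pkg, want in expected.items():
        try:
            got = version(pkg)
        except PackageNotFoundError:
            print(f"{pkg}: NOT INSTALLED (want {want})")
            continue
        ok = got == want or got.startswith(want + ".")
        status = "ok" if ok else f"MISMATCH (want {want})"
        print(f"{pkg}: {got} {status}")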