From 0e30c2efc10875c3e667163e1d73e7e6de933468 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Thu, 15 May 2025 18:52:12 +0000
Subject: [PATCH 01/12] Upgrading onnx, onnxruntime and onnxscript

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 648d2ce4e..4ca821335 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,11 +28,11 @@ dependencies = [
     "multidict==6.0.4",
     "urllib3<2",
     "sentencepiece==0.2.0",
-    "onnx==1.16.0",
-    "onnxruntime==1.16.3",
+    "onnx==1.17.0",
+    "onnxruntime==1.22.0",
     "numpy==1.26.4",
     "protobuf==3.20.2",
-    "onnxscript==0.1.0.dev20240327",
+    "onnxscript==0.2.5",
     "pillow===10.4.0",
     "sympy",
     "tensorboard",

From c7e894bebd8396ed652066eb3594976f07323879 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Thu, 15 May 2025 19:48:38 +0000
Subject: [PATCH 02/12] Minor updates to the onnx and onnxruntime pins

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4ca821335..348632da0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,8 +28,8 @@ dependencies = [
     "multidict==6.0.4",
     "urllib3<2",
     "sentencepiece==0.2.0",
-    "onnx==1.17.0",
-    "onnxruntime==1.22.0",
+    "onnx==1.18.0",
+    "onnxruntime==1.22",
     "numpy==1.26.4",
     "protobuf==3.20.2",
     "onnxscript==0.2.5",

From 3d78714c8758b891970c65e522aa5d1fdc36f5e4 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Fri, 16 May 2025 09:00:23 +0000
Subject: [PATCH 03/12] Updating protobuf

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 348632da0..3be995b89 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
     "onnx==1.18.0",
     "onnxruntime==1.22",
     "numpy==1.26.4",
-    "protobuf==3.20.2",
+    "protobuf==4.25.1",
     "onnxscript==0.2.5",
     "pillow===10.4.0",
     "sympy",

From 36124e043812be3f5e1c689c3096d703c301caec Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Wed, 21 May 2025 12:15:02 +0000
Subject: [PATCH 04/12] Upgrading protobuf to 6.31.0

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3be995b89..7c2b2d072 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
     "onnx==1.18.0",
     "onnxruntime==1.22",
     "numpy==1.26.4",
-    "protobuf==4.25.1",
+    "protobuf==6.31.0",
     "onnxscript==0.2.5",
     "pillow===10.4.0",
     "sympy",

From 8ab8d782b2901e6ac0e0e839207604a6a8e5b3e3 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Thu, 22 May 2025 07:00:53 +0000
Subject: [PATCH 05/12] Fixing CI: updating the expected ONNX printer output

Signed-off-by: Dipankar Sarkar
---
 tests/peft/test_peft_onnx_transforms.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/peft/test_peft_onnx_transforms.py b/tests/peft/test_peft_onnx_transforms.py
index f8521deb1..0248dae3b 100644
--- a/tests/peft/test_peft_onnx_transforms.py
+++ b/tests/peft/test_peft_onnx_transforms.py
@@ -46,6 +46,7 @@ def test_adapter_weights_to_inputs_transform():
     out_onnx, transformed = AdapterWeightsToInputsTransform.apply(test_onnx, adapter_name=adapter_name)
     assert transformed
+
     assert (
         onnx.printer.to_text(out_onnx)
         == textwrap.dedent("""
@@ -53,11 +54,11 @@
        <
            ir_version: 8,
            opset_import: ["" : 17]
        >
-        test_adapter_weights (float[n,32] input, float[32,32] layer1.weight, float[32,32] layer2.weight) => (float[n,32] output, float[32,32] layer1.weight_RetainedState, float[32,32] layer2.weight_RetainedState) {
-            layer1output = MatMul (input, layer1.weight)
-            output = MatMul (layer1output, layer2.weight)
-            layer1.weight_RetainedState = Identity (layer1.weight)
-            layer2.weight_RetainedState = Identity (layer2.weight)
+        test_adapter_weights (float[n,32] input, float[32,32] "layer1.weight", float[32,32] "layer2.weight") => (float[n,32] output, float[32,32] "layer1.weight_RetainedState", float[32,32] "layer2.weight_RetainedState") {
+            layer1output = MatMul (input, "layer1.weight")
+            output = MatMul (layer1output, "layer2.weight")
+            ["layer1.weight_identity"] "layer1.weight_RetainedState" = Identity ("layer1.weight")
+            ["layer2.weight_identity"] "layer2.weight_RetainedState" = Identity ("layer2.weight")
        }
        """).strip()
     )

From 8161431c17123bed433cdc371f817f4ddebacccd Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Fri, 23 May 2025 08:52:11 +0000
Subject: [PATCH 06/12] Upgrading transformers to 4.51.3

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7c2b2d072..9700ea9db 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
 ]
 requires-python = ">=3.8,<3.11"
 dependencies = [
-    "transformers==4.50.0",
+    "transformers==4.51.3",
     "huggingface-hub==0.27.0",
     "hf_transfer==0.1.9",
     "peft==0.13.2",

From d41e948ad560f4170dcd2e89b175ff0774bd75e7 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Fri, 23 May 2025 10:46:49 +0000
Subject: [PATCH 07/12] Updating huggingface-hub to 0.30.0

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml                            |   2 +-
 tests/transformers/models/qnn_config.json |   0
 .../models/test_causal_lm_models.py        | 265 +++++++++---------
 3 files changed, 133 insertions(+), 134 deletions(-)
 create mode 100644 tests/transformers/models/qnn_config.json

diff --git a/pyproject.toml b/pyproject.toml
index 9700ea9db..b2009efbb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ classifiers = [
 requires-python = ">=3.8,<3.11"
 dependencies = [
     "transformers==4.51.3",
-    "huggingface-hub==0.27.0",
+    "huggingface-hub==0.30.0",
     "hf_transfer==0.1.9",
     "peft==0.13.2",
     "datasets==2.20.0",
diff --git a/tests/transformers/models/qnn_config.json b/tests/transformers/models/qnn_config.json
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index 29598f870..8e1b9d6c7 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -12,7 +12,6 @@
 import pytest
 from transformers import AutoModelForCausalLM
 
-from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
 from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers
 from QEfficient.utils import hf_download
@@ -22,40 +21,40 @@ from QEfficient.utils.run_utils import ApiRunner
 
 test_models_qaic = [
-    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    "gpt2",
-    "Salesforce/codegen-350M-mono",
-    "microsoft/Phi-3-mini-4k-instruct",
+    # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    # "gpt2",
+    # "Salesforce/codegen-350M-mono",
+    # "microsoft/Phi-3-mini-4k-instruct",
     "tiiuae/falcon-7b",
-    "Qwen/Qwen2-0.5B",
-    "bigcode/starcoder2-3b",
-    "Felladrin/Minueza-32M-Base",
-    "wtang06/mpt-125m-c4",
-    "hakurei/gpt-j-random-tinier",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "meta-llama/Llama-3.2-1B",
-    "unsloth/gemma-2b",
-    "unsloth/gemma-2-2b",
-    "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",  # AWQ model
-    "TheBloke/Llama-2-7B-GPTQ",  # GPTQ model
-    "ibm-granite/granite-20b-code-base",
-    # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
-    "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
-    "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
-    "ibm-granite/granite-3.1-2b-instruct",
-    "ibm-granite/granite-guardian-3.1-2b",
+    # "Qwen/Qwen2-0.5B",
+    # "bigcode/starcoder2-3b",
+    # "Felladrin/Minueza-32M-Base",
+    # "wtang06/mpt-125m-c4",
+    # "hakurei/gpt-j-random-tinier",
+    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    # "meta-llama/Llama-3.2-1B",
+    # "unsloth/gemma-2b",
+    # "unsloth/gemma-2-2b",
+    # "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",  # AWQ model
+    # "TheBloke/Llama-2-7B-GPTQ",  # GPTQ model
+    # "ibm-granite/granite-20b-code-base",
+    # # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
+    # "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
+    # "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
+    # "ibm-granite/granite-3.1-2b-instruct",
+    # "ibm-granite/granite-guardian-3.1-2b",
 ]
 
 test_models_qnn = [
-    "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "meta-llama/Llama-3.2-1B",
-    "unsloth/gemma-2b",
-    "ibm-granite/granite-guardian-3.1-2b",
+    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    # "meta-llama/Llama-3.2-1B",
+    # "unsloth/gemma-2b",
+    # "ibm-granite/granite-guardian-3.1-2b",
 ]
 
 spd_test_models = [
-    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    "Qwen/Qwen2-0.5B",
+    # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    # "Qwen/Qwen2-0.5B",
 ]
 
@@ -215,33 +214,33 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
 
 
 # FIXME: there should be a CB test here
-@pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x)
-def test_causal_lm_export_with_deprecated_api(model_name):
-    model_config = {"model_name": model_name}
-    model_config["n_layer"] = 1
-    model, _ = load_causal_lm_model(model_config)
-    tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
-    qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
-    new_api_onnx_model_path = qeff_model.export()
-    _, old_api_onnx_model_path = qualcomm_efficient_converter(
-        model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer
-    )
-
-    api_runner = ApiRunner(
-        batch_size=1,
-        tokenizer=tokenizer,
-        config=model.config,
-        prompt=Constants.INPUT_STR,
-        prompt_len=Constants.PROMPT_LEN,
-        ctx_len=Constants.CTX_LEN,
-    )
-
-    new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
-    old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
-
-    assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
-        "New API output does not match old API output for ONNX export function"
-    )
+# @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x)
+# def test_causal_lm_export_with_deprecated_api(model_name):
+#     model_config = {"model_name": model_name}
+#     model_config["n_layer"] = 1
+#     model, _ = load_causal_lm_model(model_config)
+#     tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
+#     qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
+#     new_api_onnx_model_path = qeff_model.export()
+#     _, old_api_onnx_model_path = qualcomm_efficient_converter(
+#         model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer
+#     )
+
+#     api_runner = ApiRunner(
+#         batch_size=1,
+#         tokenizer=tokenizer,
+#         config=model.config,
+#         prompt=Constants.INPUT_STR,
+#         prompt_len=Constants.PROMPT_LEN,
+#         ctx_len=Constants.CTX_LEN,
+#     )
+
+#     new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
+#     old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
+
+#     assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
+#         "New API output does not match old API output for ONNX export function"
+#     )
 
 
 @pytest.mark.on_qaic
@@ -260,84 +259,84 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)
 
 
-@pytest.mark.on_qaic
-@pytest.mark.qnn
-@pytest.mark.parametrize("model_name", test_models_qnn)
-def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name):
-    """
-    QNN Compilation Test
-    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
-    ``Mandatory`` Args:
-        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
-    """
-    if model_name == "microsoft/Phi-3-mini-4k-instruct":
-        n_layer = 2  # test only 2 layer models
-    else:
-        n_layer = 1
-
-    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-        model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path
-    )
-
-
-@pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
-@pytest.mark.on_qaic
-@pytest.mark.parametrize("model_name", spd_test_models)
-def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
-    """
-    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
-    ``Mandatory`` Args:
-        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
-    """
-
-    if model_name == "microsoft/Phi-3-mini-4k-instruct":
-        n_layer = 2  # test only 2 layer models
-    else:
-        n_layer = 1
-
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-        model_name=model_name, n_layer=n_layer, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS
-    )
-
-
-@pytest.mark.on_qaic
-def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
-    """
-    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
-    """
-    model_name = "gpt2"
-    prompt_len = 1
-
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
-
-
-@pytest.mark.on_qaic
-@pytest.mark.qnn
-def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn():
-    """
-    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
-    """
-    model_name = "gpt2"
-    prompt_len = 1
-
-    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-        model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path
-    )
-
-
-@pytest.mark.on_qaic
-def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100():
-    model_name = "gpt2"
-    n_layer = 1
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True)
-
-    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False)
+# @pytest.mark.on_qaic
+# @pytest.mark.qnn
+# @pytest.mark.parametrize("model_name", test_models_qnn)
+# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name):
+#     """
+#     QNN Compilation Test
+#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+#     ``Mandatory`` Args:
+#         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+#     """
+#     if model_name == "microsoft/Phi-3-mini-4k-instruct":
+#         n_layer = 2  # test only 2 layer models
+#     else:
+#         n_layer = 1
+
+#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+#         model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path
+#     )
+
+
+# @pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
+# @pytest.mark.on_qaic
+# @pytest.mark.parametrize("model_name", spd_test_models)
+# def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
+#     """
+#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+#     ``Mandatory`` Args:
+#         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+#     """
+
+#     if model_name == "microsoft/Phi-3-mini-4k-instruct":
+#         n_layer = 2  # test only 2 layer models
+#     else:
+#         n_layer = 1
+
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+#         model_name=model_name, n_layer=n_layer, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS
+#     )
+
+
+# @pytest.mark.on_qaic
+# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
+#     """
+#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
+#     """
+#     model_name = "gpt2"
+#     prompt_len = 1
+
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
+
+
+# @pytest.mark.on_qaic
+# @pytest.mark.qnn
+# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn():
+#     """
+#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
+#     """
+#     model_name = "gpt2"
+#     prompt_len = 1
+
+#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+#         model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path
+#     )
+
+
+# @pytest.mark.on_qaic
+# def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100():
+#     model_name = "gpt2"
+#     n_layer = 1
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True)
+
+#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False)
 
 
 @pytest.mark.on_qaic

From eec0631ee8354783cb9177eba184277f7449e68b Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Tue, 27 May 2025 06:36:03 +0000
Subject: [PATCH 08/12] Upgrading transformers to 4.52.3

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b2009efbb..7ebd9092e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
 ]
 requires-python = ">=3.8,<3.11"
 dependencies = [
-    "transformers==4.51.3",
+    "transformers==4.52.3",
     "huggingface-hub==0.30.0",
     "hf_transfer==0.1.9",
     "peft==0.13.2",

From a3ddf64b49d71753659d4e90c0c33c7fbabdd31e Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Tue, 27 May 2025 06:58:41 +0000
Subject: [PATCH 09/12] Cleanup 1: restoring the commented-out tests

Signed-off-by: Dipankar Sarkar
---
 .../models/test_causal_lm_models.py | 265 +++++++++---------
 1 file changed, 133 insertions(+), 132 deletions(-)

diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index 8e1b9d6c7..29598f870 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -12,6 +12,7 @@
 import pytest
 from transformers import AutoModelForCausalLM
 
+from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
 from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers
 from QEfficient.utils import hf_download
@@ -21,40 +22,40 @@ from QEfficient.utils.run_utils import ApiRunner
 
 test_models_qaic = [
-    # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    # "gpt2",
-    # "Salesforce/codegen-350M-mono",
-    # "microsoft/Phi-3-mini-4k-instruct",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "gpt2",
+    "Salesforce/codegen-350M-mono",
+    "microsoft/Phi-3-mini-4k-instruct",
     "tiiuae/falcon-7b",
-    # "Qwen/Qwen2-0.5B",
-    # "bigcode/starcoder2-3b",
-    # "Felladrin/Minueza-32M-Base",
-    # "wtang06/mpt-125m-c4",
-    # "hakurei/gpt-j-random-tinier",
-    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    # "meta-llama/Llama-3.2-1B",
-    # "unsloth/gemma-2b",
-    # "unsloth/gemma-2-2b",
-    # "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",  # AWQ model
-    # "TheBloke/Llama-2-7B-GPTQ",  # GPTQ model
-    # "ibm-granite/granite-20b-code-base",
-    # # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
-    # "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
-    # "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
-    # "ibm-granite/granite-3.1-2b-instruct",
-    # "ibm-granite/granite-guardian-3.1-2b",
+    "Qwen/Qwen2-0.5B",
+    "bigcode/starcoder2-3b",
+    "Felladrin/Minueza-32M-Base",
+    "wtang06/mpt-125m-c4",
+    "hakurei/gpt-j-random-tinier",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "meta-llama/Llama-3.2-1B",
+    "unsloth/gemma-2b",
+    "unsloth/gemma-2-2b",
+    "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",  # AWQ model
+    "TheBloke/Llama-2-7B-GPTQ",  # GPTQ model
+    "ibm-granite/granite-20b-code-base",
+    # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
+    "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
+    "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
+    "ibm-granite/granite-3.1-2b-instruct",
+    "ibm-granite/granite-guardian-3.1-2b",
 ]
 
 test_models_qnn = [
-    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    # "meta-llama/Llama-3.2-1B",
-    # "unsloth/gemma-2b",
-    # "ibm-granite/granite-guardian-3.1-2b",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "meta-llama/Llama-3.2-1B",
+    "unsloth/gemma-2b",
+    "ibm-granite/granite-guardian-3.1-2b",
 ]
 
 spd_test_models = [
-    # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    # "Qwen/Qwen2-0.5B",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "Qwen/Qwen2-0.5B",
 ]
 
@@ -214,33 +215,33 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
 
 
 # FIXME: there should be a CB test here
-# @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x)
-# def test_causal_lm_export_with_deprecated_api(model_name):
-#     model_config = {"model_name": model_name}
-#     model_config["n_layer"] = 1
-#     model, _ = load_causal_lm_model(model_config)
-#     tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
-#     qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
-#     new_api_onnx_model_path = qeff_model.export()
-#     _, old_api_onnx_model_path = qualcomm_efficient_converter(
-#         model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer
-#     )
-
-#     api_runner = ApiRunner(
-#         batch_size=1,
-#         tokenizer=tokenizer,
-#         config=model.config,
-#         prompt=Constants.INPUT_STR,
-#         prompt_len=Constants.PROMPT_LEN,
-#         ctx_len=Constants.CTX_LEN,
-#     )
-
-#     new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
-#     old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
-
-#     assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
-#         "New API output does not match old API output for ONNX export function"
-#     )
+@pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x)
+def test_causal_lm_export_with_deprecated_api(model_name):
+    model_config = {"model_name": model_name}
+    model_config["n_layer"] = 1
+    model, _ = load_causal_lm_model(model_config)
+    tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
+    qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
+    new_api_onnx_model_path = qeff_model.export()
+    _, old_api_onnx_model_path = qualcomm_efficient_converter(
+        model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer
+    )
+
+    api_runner = ApiRunner(
+        batch_size=1,
+        tokenizer=tokenizer,
+        config=model.config,
+        prompt=Constants.INPUT_STR,
+        prompt_len=Constants.PROMPT_LEN,
+        ctx_len=Constants.CTX_LEN,
+    )
+
+    new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
+    old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
+
+    assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
+        "New API output does not match old API output for ONNX export function"
+    )
 
 
 @pytest.mark.on_qaic
@@ -259,84 +260,84 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)
 
 
-# @pytest.mark.on_qaic
-# @pytest.mark.qnn
-# @pytest.mark.parametrize("model_name", test_models_qnn)
-# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name):
-#     """
-#     QNN Compilation Test
-#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
-#     ``Mandatory`` Args:
-#         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
-#     """
-#     if model_name == "microsoft/Phi-3-mini-4k-instruct":
-#         n_layer = 2  # test only 2 layer models
-#     else:
-#         n_layer = 1
-
-#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-#         model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path
-#     )
-
-
-# @pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
-# @pytest.mark.on_qaic
-# @pytest.mark.parametrize("model_name", spd_test_models)
-# def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
-#     """
-#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
-#     ``Mandatory`` Args:
-#         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
-#     """
-
-#     if model_name == "microsoft/Phi-3-mini-4k-instruct":
-#         n_layer = 2  # test only 2 layer models
-#     else:
-#         n_layer = 1
-
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-#         model_name=model_name, n_layer=n_layer, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS
-#     )
-
-
-# @pytest.mark.on_qaic
-# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
-#     """
-#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
-#     """
-#     model_name = "gpt2"
-#     prompt_len = 1
-
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
-
-
-# @pytest.mark.on_qaic
-# @pytest.mark.qnn
-# def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn():
-#     """
-#     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
-#     """
-#     model_name = "gpt2"
-#     prompt_len = 1
-
-#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
-#         model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path
-#     )
-
-
-# @pytest.mark.on_qaic
-# def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100():
-#     model_name = "gpt2"
-#     n_layer = 1
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True)
-
-#     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False)
+@pytest.mark.on_qaic
+@pytest.mark.qnn
+@pytest.mark.parametrize("model_name", test_models_qnn)
+def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name):
+    """
+    QNN Compilation Test
+    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+    """
+    if model_name == "microsoft/Phi-3-mini-4k-instruct":
+        n_layer = 2  # test only 2 layer models
+    else:
+        n_layer = 1
+
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+        model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path
+    )
+
+
+@pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
+@pytest.mark.on_qaic
+@pytest.mark.parametrize("model_name", spd_test_models)
+def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
+    """
+    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+    """
+
+    if model_name == "microsoft/Phi-3-mini-4k-instruct":
+        n_layer = 2  # test only 2 layer models
+    else:
+        n_layer = 1
+
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+        model_name=model_name, n_layer=n_layer, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS
+    )
+
+
+@pytest.mark.on_qaic
+def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
+    """
+    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
+    """
+    model_name = "gpt2"
+    prompt_len = 1
+
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
+
+
+@pytest.mark.on_qaic
+@pytest.mark.qnn
+def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn():
+    """
+    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
+    """
+    model_name = "gpt2"
+    prompt_len = 1
+
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
+        model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path
+    )
+
+
+@pytest.mark.on_qaic
+def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100():
+    model_name = "gpt2"
+    n_layer = 1
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True)
+
+    check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False)
 
 
 @pytest.mark.on_qaic

From b65bee03fd4e07ee83ceac7f7258da3016dd1497 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Thu, 5 Jun 2025 18:05:43 +0000
Subject: [PATCH 10/12] Reverting the transformers and huggingface-hub upgrades

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7ebd9092e..7c2b2d072 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,8 +19,8 @@ classifiers = [
 ]
 requires-python = ">=3.8,<3.11"
 dependencies = [
-    "transformers==4.52.3",
-    "huggingface-hub==0.30.0",
+    "transformers==4.50.0",
+    "huggingface-hub==0.27.0",
     "hf_transfer==0.1.9",
     "peft==0.13.2",
     "datasets==2.20.0",

From 8a1681126a0414e46837f0a20589ad7ffb6dd490 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Thu, 5 Jun 2025 18:14:50 +0000
Subject: [PATCH 11/12] Cleanup 2: removing the empty qnn_config.json

Signed-off-by: Dipankar Sarkar
---
 tests/transformers/models/qnn_config.json | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 tests/transformers/models/qnn_config.json

diff --git a/tests/transformers/models/qnn_config.json b/tests/transformers/models/qnn_config.json
deleted file mode 100644
index e69de29bb..000000000

From d067a6cf5d8e41800f205f378b1aca85653f6650 Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Fri, 6 Jun 2025 05:21:37 +0000
Subject: [PATCH 12/12] Cleanup 3: restoring the hf_transfer dependency

Signed-off-by: Dipankar Sarkar
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 7c2b2d072..ee760734c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ requires-python = ">=3.8,<3.11"
 dependencies = [
     "transformers==4.50.0",
     "huggingface-hub==0.27.0",
+    "hf_transfer==0.1.9",
     "peft==0.13.2",
     "datasets==2.20.0",
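
Notes on the series:

The PATCH 05 expectation changes because newer onnx releases print the model text differently: value names that are not bare identifiers (the dotted "layer1.weight"-style names above) come out quoted, and node names are emitted in square brackets. A small sketch of the quoting behavior, assuming onnx 1.18 is installed; the graph and names here are illustrative, not taken from the QEfficient test suite:

    import onnx
    from onnx import TensorProto, helper

    inp = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["n", 32])
    w1 = helper.make_tensor_value_info("layer1.weight", TensorProto.FLOAT, [32, 32])
    out = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["n", 32])

    # A single MatMul whose weight input has a dot in its name, mirroring the
    # adapter-weight naming exercised by test_adapter_weights_to_inputs_transform.
    node = helper.make_node("MatMul", ["input", "layer1.weight"], ["output"])
    graph = helper.make_graph([node], "demo", [inp, w1], [out])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 17)])

    # Under onnx 1.16 the name printed bare (layer1.weight); under 1.18 it
    # prints quoted ("layer1.weight"), which is what the dedented expected
    # string in the test had to track.
    print(onnx.printer.to_text(model))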
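The series lands with transformers and huggingface-hub back at their pre-series pins while the ONNX stack stays upgraded. A quick environment check against the final pins can catch a stale install; this is a hypothetical helper, not part of the series, with the expected versions read off the final pyproject.toml state above:

    from importlib.metadata import PackageNotFoundError, version

    # Final pins after PATCH 10-12. onnxruntime is pinned without a patch
    # digit ("1.22"), so a prefix match is used for it.
    expected = {
        "onnx": "1.18.0",
        "onnxruntime": "1.22",
        "onnxscript": "0.2.5",
        "protobuf": "6.31.0",
        "transformers": "4.50.0",
        "huggingface-hub": "0.27.0",  # dist name; metadata lookup normalizes - and _
    }

    for pkg, want in expected.items():
        try:
            got = version(pkg)
        except PackageNotFoundError:
            print(f"{pkg}: NOT INSTALLED (want {want})")
            continue
        ok = got == want or got.startswith(want + ".")
        status = "ok" if ok else f"MISMATCH (want {want})"
        print(f"{pkg}: {got} {status}")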