diff --git a/QEfficient/utils/test_utils.py b/QEfficient/utils/test_utils.py index 2e17aa886..3d70ac4f3 100644 --- a/QEfficient/utils/test_utils.py +++ b/QEfficient/utils/test_utils.py @@ -150,3 +150,24 @@ def __call__( image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN query = query.replace("", image_tokens, 1) return query + + +class ModelConfig: + """ + Contains all the model types which are not default model like quantized models, external models, swiftkv models etc,. + """ + + QUANTIZED_MODELS = { + "neuralmagic/Qwen2-0.5B-Instruct-FP8", + "neuralmagic/Llama-3.2-3B-Instruct-FP8", + "TheBloke/Llama-2-7B-GPTQ", + "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + } + + EXTERNAL_MODELS = { + "hpcai-tech/grok-1", + } + + SWIFTKV_MODELS = { + "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", + } diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 103c04b73..50b0f3e42 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -59,7 +59,7 @@ pipeline { mkdir -p $PWD/Non_qaic && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_qaic && - pytest tests -m '(not cli) and (on_qaic) and (not multimodal) and (not qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml && + pytest tests -m '(not cli) and (on_qaic) and (not nightly) and (not multimodal) and (not qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml && junitparser merge tests/tests_log2.xml tests/tests_log.xml && deactivate" ''' @@ -97,7 +97,7 @@ pipeline { mkdir -p $PWD/cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/cli && - pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log3.xml && + pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log3.xml && junitparser merge tests/tests_log3.xml tests/tests_log.xml && deactivate" ''' @@ -126,7 +126,7 @@ pipeline { mkdir -p $PWD/Qnn_cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Qnn_cli && - pytest tests -m '(cli and qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log4.xml && + pytest tests -m '(cli and qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log4.xml && junitparser merge tests/tests_log4.xml tests/tests_log.xml && deactivate" ''' @@ -145,7 +145,7 @@ pipeline { mkdir -p $PWD/Qnn_non_cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Qnn_non_cli && - pytest tests -m '(not cli) and (qnn) and (on_qaic) and (not multimodal) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log5.xml && + pytest tests -m '(not cli) and (qnn) and (not nightly) and (on_qaic) and (not multimodal) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log5.xml && junitparser merge tests/tests_log5.xml tests/tests_log.xml && deactivate" ''' diff --git a/tests/cloud/conftest.py b/tests/cloud/conftest.py deleted file mode 100644 index 8b17297ac..000000000 --- a/tests/cloud/conftest.py +++ /dev/null @@ -1,328 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import json -import os -import shutil - -import pytest - -from QEfficient.utils import get_onnx_dir_name -from QEfficient.utils.constants import QEFF_MODELS_DIR -from QEfficient.utils.logging_utils import logger - - -def pytest_addoption(parser): - parser.addoption("--all", action="store_true", default=False, help="Run all test without skipping any test") - - -model_class_dict = {"gpt2": "GPT2LMHeadModel", "lu-vae/llama-68m-fft": "LlamaForCausalLM"} - - -class ModelSetup: - """ - ModelSetup is a set up class for all the High Level testing script, - which provides all neccessary objects needed for checking the flow and creation - of the HL API code. - """ - - def __init__( - self, - model_name, - num_cores, - prompt, - prompts_txt_file_path, - aic_enable_depth_first, - mos, - cache_dir, - hf_token, - batch_size, - prompt_len, - ctx_len, - mxfp6, - mxint8, - full_batch_size, - device_group, - enable_qnn, - qnn_config, - ): - """ - Initialization set up - ------ - param: model_name: str - param: num_cores: int - param: prompt: str - param: prompts_txt_file_path: str - param: aic_enable_depth_first: bool - param: mos: int - param: cache_dir: str - param: hf_token: str - param: batch_size: int - param: prompt_len: int - param: ctx_len: int - param: mxfp6: bool - param: mxint8: bool - param: full_batch_size: int - param: device_group: List[int] - param: enable_qnn: bool - param: qnn_config: str - """ - self.model_name = model_name - self.num_cores = num_cores - self.prompt = prompt - self.local_model_dir = None - self.prompts_txt_file_path = prompts_txt_file_path if prompts_txt_file_path is not None else None - self.aic_enable_depth_first = aic_enable_depth_first - self.mos = mos - self.cache_dir = cache_dir - self.hf_token = hf_token - self.batch_size = batch_size - self.prompt_len = prompt_len - self.ctx_len = ctx_len - self.generation_len = None - self.mxfp6 = mxfp6 - self.mxint8 = mxint8 - self.full_batch_size = full_batch_size - self.device_group = device_group - self.enable_qnn = enable_qnn - self.qnn_config = qnn_config - - def model_card_dir(self): - return str(os.path.join(QEFF_MODELS_DIR, str(self.model_name))) - - def qpc_base_dir_path(self): - base_dir_name = str( - f"qpc{'_qnn_' if self.enable_qnn else '_'}{self.num_cores}cores_{self.batch_size}bs_{self.prompt_len}pl_{self.ctx_len}cl_{self.mos}mos" - + f"{f'_{self.full_batch_size}fbs_' if self.full_batch_size is not None else '_'}" - + f"{len(self.device_group) if self.device_group is not None else 1}" - + "devices" - + ( - "_mxfp6_mxint8" - if (self.mxfp6 and self.mxint8) - else "_mxfp6" - if self.mxfp6 - else "_fp16_mxint8" - if self.mxint8 - else "_fp16" - ) - ) - return str(os.path.join(self.model_card_dir(), base_dir_name)) - - def qpc_dir_path(self): - return str(os.path.join(self.qpc_base_dir_path(), "qpcs")) - - def onnx_dir_name(self): - return get_onnx_dir_name(self.model_name, self.full_batch_size is not None) - - def onnx_dir_path(self): - return str(os.path.join(self.model_card_dir(), self.onnx_dir_name())) - - def onnx_model_path(self): - return [ - str(os.path.join(self.onnx_dir_path(), self.model_name.replace("/", "_") + "_kv_clipped_fp16.onnx")), - str(os.path.join(self.onnx_dir_path(), self.model_name.replace("/", "_") + "_kv.onnx")), - ] - - def model_hf_path(self): - return str(os.path.join(self.cache_dir, self.model_name)) - - def base_path_and_generated_onnx_path(self): - return 
str(self.onnx_dir_path()), str( - os.path.join(self.onnx_dir_path(), self.model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - ) - - def specialization_json_path(self): - return str(os.path.join(self.qpc_base_dir_path(), "specializations.json")) - - def custom_io_file_path(self): - if self.mxint8: - return str(os.path.join(self.onnx_dir_path(), "custom_io_int8.yaml")) - else: - return str(os.path.join(self.onnx_dir_path(), "custom_io_fp16.yaml")) - - -@pytest.fixture(scope="function") -def setup( - model_name, - num_cores, - prompt, - prompts_txt_file_path, - aic_enable_depth_first, - mos, - cache_dir, - hf_token, - batch_size, - prompt_len, - ctx_len, - mxfp6, - mxint8, - full_batch_size, - device_group, - enable_qnn, - qnn_config, -): - """ - It is a fixture or shared object of all testing script within or inner folder, - Args are coming from the dynamically generated tests method i.e, pytest_generate_tests via testing script or method - -------- - Args: same as set up initialization - Return: model_setup class object - """ - model_setup = ModelSetup( - model_name, - num_cores, - prompt, - prompts_txt_file_path, - bool(aic_enable_depth_first), - mos, - cache_dir, - hf_token, - batch_size, - prompt_len, - ctx_len, - bool(mxfp6), - bool(mxint8), - full_batch_size, - device_group, - enable_qnn, - qnn_config, - ) - - yield model_setup - del model_setup - - -def pytest_generate_tests(metafunc): - """ - pytest_generate_tests hook is used to create our own input parametrization, - It generates all the test cases of different combination of input parameters which are read from the json file, - and passed to each testing script module. - ----------- - Ref: https://docs.pytest.org/en/7.3.x/how-to/parametrize.html - """ - json_file = "tests/cloud/high_level_testing.json" - with open(json_file, "r") as file: - json_data = json.load(file) - - metafunc.parametrize("model_name", json_data["model_name"], ids=lambda x: "model_name=" + str(x)) - metafunc.parametrize("num_cores", json_data["num_cores"], ids=lambda x: "num_cores=" + str(x)) - metafunc.parametrize("prompt", json_data["prompt"], ids=lambda x: "prompt=" + str(x)) - metafunc.parametrize( - "prompts_txt_file_path", json_data["prompts_txt_file_path"], ids=lambda x: "prompts_txt_file_path=" + str(x) - ) - metafunc.parametrize( - "aic_enable_depth_first", json_data["aic_enable_depth_first"], ids=lambda x: "aic_enable_depth_first=" + str(x) - ) - metafunc.parametrize("mos", json_data["mos"], ids=lambda x: "mos=" + str(x)) - metafunc.parametrize("cache_dir", [None], ids=lambda x: "cache_dir=" + str(x)) - metafunc.parametrize("hf_token", json_data["hf_token"], ids=lambda x: "hf_token=" + str(x)) - metafunc.parametrize("batch_size", json_data["batch_size"], ids=lambda x: "batch_size=" + str(x)) - metafunc.parametrize("prompt_len", json_data["prompt_len"], ids=lambda x: "prompt_len=" + str(x)) - metafunc.parametrize("ctx_len", json_data["ctx_len"], ids=lambda x: "ctx_len=" + str(x)) - metafunc.parametrize("mxfp6", json_data["mxfp6"], ids=lambda x: "mxfp6=" + str(x)) - metafunc.parametrize("mxint8", json_data["mxint8"], ids=lambda x: "mxint8=" + str(x)) - metafunc.parametrize("full_batch_size", json_data["full_batch_size"], ids=lambda x: "full_batch_size=" + str(x)) - metafunc.parametrize("device_group", json_data["device_group"], ids=lambda x: "device_group=" + str(x)) - metafunc.parametrize("enable_qnn", json_data["enable_qnn"], ids=lambda x: "enable_qnn=" + str(x)) - metafunc.parametrize("qnn_config", json_data["qnn_config"], ids=lambda x: 
"qnn_config=" + str(x)) - - -def pytest_collection_modifyitems(config, items): - """ - pytest_collection_modifyitems is pytest a hook, - which is used to re-order the execution order of the testing script/methods - with various combination of inputs. - called after collection has been performed, may filter or re-order the items in-place. - Parameters: - items (List[_pytest.nodes.Item]) list of item objects - ---------- - Ref: https://docs.pytest.org/en/4.6.x/reference.html#collection-hooks - """ - run_first = ["test_export", "test_infer"] - modules_name = {item.module.__name__ for item in items} - cloud_modules = [] - non_cloud_modules = [] - for module in modules_name: - if module in run_first: - cloud_modules.append(module) - else: - non_cloud_modules.append(module) - - if len(cloud_modules) > 1: - modules = {item: item.module.__name__ for item in items} - items[:] = sorted(items, key=lambda x: run_first.index(modules[x]) if modules[x] in run_first else len(items)) - - non_cloud_tests = [] - - for itm in items: - if modules[itm] not in cloud_modules: - non_cloud_tests.append(itm) - - num_cloud_tests = len(items) - len(non_cloud_tests) - num_cloud_test_cases = num_cloud_tests // len(cloud_modules) - final_items = [] - - for i in range(num_cloud_test_cases): - for j in range(len(cloud_modules)): - final_items.append(items[i + j * num_cloud_test_cases]) - - final_items.extend(non_cloud_tests) - items[:] = final_items - - if config.getoption("--all"): - return - - first_model = items[0].callspec.params["model_name"] if hasattr(items[0], "callspec") else None - - for item in items: - if item.module.__name__ in ["test_export", "test_compile_and_execute", "test_infer"]: - if hasattr(item, "callspec"): - params = item.callspec.params - if not params["enable_qnn"] and params["qnn_config"] is not None: - item.add_marker( - pytest.mark.skip(reason="Skipping because same as enable_qnn = false and qnn_config = None") - ) - if params["enable_qnn"]: - item.add_marker(pytest.mark.qnn) - - if item.module.__name__ in ["test_export", "test_compile_and_execute"]: - if hasattr(item, "callspec"): - params = item.callspec.params - if params["model_name"] != first_model: - item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - if params["prompt_len"] == 2: - item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - - if item.module.__name__ in ["test_infer"]: - if hasattr(item, "callspec"): - params = item.callspec.params - if params["prompt_len"] == 2 and params["model_name"] != first_model: - item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - - -def qeff_models_clean_up(): - if os.path.exists(QEFF_MODELS_DIR): - shutil.rmtree(QEFF_MODELS_DIR) - logger.info(f"\n.............Cleaned up {QEFF_MODELS_DIR}") - - -@pytest.fixture -def clean_up_after_test(): - yield - qeff_models_clean_up() - - -def pytest_sessionstart(session): - logger.info("PYTEST Session Starting ...") - qeff_models_clean_up() - - -def pytest_sessionfinish(session, exitstatus): - inside_worker = getattr(session.config, "workerinput", None) - if inside_worker is None: - qeff_models_clean_up() - logger.info("...PYTEST Session Ended.") diff --git a/tests/cloud/high_level_testing.json b/tests/cloud/high_level_testing.json deleted file mode 100644 index d30382dc6..000000000 --- a/tests/cloud/high_level_testing.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "license": "SEE LICENSE IN LICENSE FILE", - "model_name" : ["gpt2"], - "num_cores" : [16], - "prompt" : ["My name is"], - 
"prompts_txt_file_path" : ["examples/prompts.txt"], - "aic_enable_depth_first" : [1], - "mos" : [1], - "cache_dir" : [null], - "hf_token" : [null], - "batch_size" : [1], - "prompt_len" : [32], - "ctx_len" : [128], - "mxfp6" : [1], - "mxint8" : [1], - "device_group" : [null], - "full_batch_size" : [null,3], - "enable_qnn" : [false, true], - "qnn_config" : [null, "QEfficient/compile/qnn_config.json"] -} diff --git a/tests/cloud/test_compile_and_execute.py b/tests/cloud/test_compile_and_execute.py deleted file mode 100644 index 341d63bb7..000000000 --- a/tests/cloud/test_compile_and_execute.py +++ /dev/null @@ -1,80 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import os - -import pytest -import yaml - -import QEfficient -from QEfficient.cloud.execute import main as execute -from QEfficient.cloud.export import get_onnx_model_path - - -@pytest.mark.on_qaic -@pytest.mark.cli -def test_compile(setup, mocker): - """ - test_compile is a HL compile api testing function, - checks compile api code flow, object creations, internal api calls, internal returns. - --------- - Parameters: - setup: is a fixture defined in conftest.py module. - mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. - """ - ms = setup - onnx_model_path = get_onnx_model_path( - model_name=ms.model_name, - cache_dir=ms.cache_dir, - hf_token=ms.hf_token, - full_batch_size=ms.full_batch_size, - local_model_dir=ms.local_model_dir, - ) - - base_key = "past_key." - base_value = "past_value." - precision = "float16" - - data = [] - - for i in range(12): - data.append({"IOName": f"{base_key}{i}", "Precision": precision}) - data.append({"IOName": f"{base_value}{i}", "Precision": precision}) - - for i in range(12): - data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision}) - data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision}) - - with open(((onnx_model_path.parent) / "custom_io.yaml"), "w") as file: - yaml.dump(data, file) - - qpc_path = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(ms.qpc_dir_path()), - num_cores=ms.num_cores, - device_group=ms.device_group, - custom_io_file_path=(onnx_model_path.parent) / "custom_io.yaml", - aic_enable_depth_first=ms.aic_enable_depth_first, - mos=ms.mos, - batch_size=ms.batch_size, - prompt_len=ms.prompt_len, - ctx_len=ms.ctx_len, - mxfp6=ms.mxfp6, - mxint8=ms.mxint8, - full_batch_size=ms.full_batch_size, - enable_qnn=ms.enable_qnn, - ) - - execute( - model_name=ms.model_name, - qpc_path=qpc_path, - prompt=ms.prompt, - prompts_txt_file_path=ms.prompts_txt_file_path, - generation_len=ms.generation_len, - hf_token=ms.hf_token, - full_batch_size=ms.full_batch_size, - ) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py deleted file mode 100644 index df5b12f5e..000000000 --- a/tests/cloud/test_export.py +++ /dev/null @@ -1,31 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - - -import pytest - -from QEfficient.cloud.export import main as export - - -@pytest.mark.cli -def test_export(setup, mocker): - """ - test_export is a HL export api testing function, - checks export api code flow, object creations, internal api calls, internal returns. - --------- - Parameters: - setup: is a fixture defined in conftest.py module. - mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. - """ - ms = setup - - export( - model_name=ms.model_name, - hf_token=ms.hf_token, - local_model_dir=ms.local_model_dir, - full_batch_size=ms.full_batch_size, - ) diff --git a/tests/cloud/test_export_compile_execute.py b/tests/cloud/test_export_compile_execute.py new file mode 100644 index 000000000..7cac59da7 --- /dev/null +++ b/tests/cloud/test_export_compile_execute.py @@ -0,0 +1,115 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + + +import os + +import pytest +import yaml + +import QEfficient +from QEfficient.cloud.execute import main as execute +from QEfficient.cloud.export import main as export + + +def check_export_compile_execute(mocker, model_name, full_batch_size=None, enable_qnn=False): + check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") + load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.execute, "load_hf_tokenizer") + cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.execute, "cloud_ai_100_exec_kv") + + # Export model + export( + model_name=model_name, + full_batch_size=full_batch_size, + ) + + check_and_assign_cache_dir_spy.assert_called_once() + get_onnx_model_path_spy.assert_called_once() + + onnx_model_path = get_onnx_model_path_spy.spy_return + + assert os.path.isfile(onnx_model_path) + + base_key = "past_key." + base_value = "past_value." 
+ precision = "float16" + + data = [] + + for i in range(12): + data.append({"IOName": f"{base_key}{i}", "Precision": precision}) + data.append({"IOName": f"{base_value}{i}", "Precision": precision}) + + for i in range(12): + data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision}) + data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision}) + + with open(((onnx_model_path.parent) / "custom_io.yaml"), "w") as file: + yaml.dump(data, file) + # Compile model + qpc_path = QEfficient.compile( + onnx_path=onnx_model_path, + qpc_path=os.path.dirname(onnx_model_path), + num_cores=16, + device_group=None, + custom_io_file_path=(onnx_model_path.parent) / "custom_io.yaml", + aic_enable_depth_first=True, + mos=1, + batch_size=1, + prompt_len=32, + ctx_len=128, + mxfp6=True, + mxint8=True, + full_batch_size=full_batch_size, + enable_qnn=enable_qnn, + ) + + assert os.path.isdir(qpc_path) + + # Execute model + execute( + model_name=model_name, + qpc_path=qpc_path, + prompt="My name is", + prompts_txt_file_path="examples/prompts.txt", + generation_len=20, + full_batch_size=full_batch_size, + ) + + load_hf_tokenizer_spy.assert_called_once() + cloud_ai_100_exec_kv_spy.assert_called_once() + + +@pytest.mark.on_qaic +@pytest.mark.cli +def test_export_compile_execute(mocker): + # testing export -> compile -> infer without full_batch_size + check_export_compile_execute(mocker, model_name="gpt2") + + +@pytest.mark.on_qaic +@pytest.mark.cli +def test_export_compile_execute_fbs(mocker): + # testing export -> compile -> infer with full_batch_size + check_export_compile_execute(mocker, model_name="gpt2", full_batch_size=3) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.cli +def test_export_compile_execute_qnn(mocker): + # testing export -> compile -> infer without full_batch_size in QNN environment + check_export_compile_execute(mocker, model_name="gpt2", enable_qnn=True) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.cli +def test_export_compile_execute_qnn_fbs(mocker): + # testing export -> compile -> infer with full_batch_size in QNN environment + check_export_compile_execute(mocker, model_name="gpt2", full_batch_size=3, enable_qnn=True) diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 396d9609d..9addc0a7b 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -5,16 +5,51 @@ # # ----------------------------------------------------------------------------- - import pytest +import QEfficient from QEfficient.cloud.infer import main as infer +def check_infer( + mocker, model_name, prompt="My name is", full_batch_size=None, enable_qnn=False, image_url=None, generation_len=20 +): + check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.infer, "check_and_assign_cache_dir") + qeff_model_load_spy = mocker.spy(QEfficient.cloud.infer.QEFFCommonLoader, "from_pretrained") + load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer") + execute_vlm_model_spy = mocker.spy(QEfficient.cloud.infer, "execute_vlm_model") + + infer( + model_name=model_name, + num_cores=16, + prompt=prompt, + local_model_dir=None, + prompts_txt_file_path="examples/prompts.txt", + aic_enable_depth_first=True, + mos=1, + hf_token=None, + batch_size=1, + prompt_len=32, + ctx_len=128, + generation_len=generation_len, + mxfp6=True, + mxint8=True, + full_batch_size=full_batch_size, + enable_qnn=enable_qnn, + image_url=image_url, + ) + + check_and_assign_cache_dir_spy.assert_called_once() + 
qeff_model_load_spy.assert_called_once() + if image_url is not None: + execute_vlm_model_spy.assert_called_once() + else: + load_hf_tokenizer_spy.assert_called_once() + + @pytest.mark.on_qaic @pytest.mark.cli -@pytest.mark.usefixtures("clean_up_after_test") -def test_infer(setup, mocker): +def test_infer(mocker): """ test_infer is a HL infer api testing function, checks infer api code flow, object creations, internal api calls, internal returns. @@ -26,22 +61,41 @@ def test_infer(setup, mocker): Ref: https://docs.pytest.org/en/7.1.x/how-to/fixtures.html Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html """ - ms = setup - infer( - model_name=ms.model_name, - num_cores=ms.num_cores, - prompt=ms.prompt, - local_model_dir=ms.local_model_dir, - prompts_txt_file_path=ms.prompts_txt_file_path, - aic_enable_depth_first=ms.aic_enable_depth_first, - mos=ms.mos, - hf_token=ms.hf_token, - batch_size=ms.batch_size, - prompt_len=ms.prompt_len, - ctx_len=ms.ctx_len, - generation_len=ms.generation_len, - mxfp6=ms.mxfp6, - mxint8=ms.mxint8, - full_batch_size=ms.full_batch_size, - enable_qnn=ms.enable_qnn, + # testing infer without full_batch_size + check_infer(mocker, model_name="lu-vae/llama-68m-fft") + + +@pytest.mark.on_qaic +@pytest.mark.cli +def test_infer_fbs(mocker): + # testing infer with full_batch_size + check_infer(mocker, model_name="lu-vae/llama-68m-fft", full_batch_size=3) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.qnn +def test_infer_qnn(mocker): + # testing infer without full_batch_size in QNN environment + check_infer(mocker, model_name="lu-vae/llama-68m-fft", enable_qnn=True) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.qnn +def test_infer_qnn_fbs(mocker): + # testing infer with full_batch_size in QNN environment + check_infer(mocker, model_name="lu-vae/llama-68m-fft", full_batch_size=3, enable_qnn=True) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.multimodal +def test_infer_vlm(mocker): + # testing infer for MM models + check_infer( + mocker, + model_name="llava-hf/llava-1.5-7b-hf", + prompt="Describe the image.", + image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", ) diff --git a/tests/cloud/test_infer_vlm.py b/tests/cloud/test_infer_vlm.py deleted file mode 100644 index 94adb3f36..000000000 --- a/tests/cloud/test_infer_vlm.py +++ /dev/null @@ -1,41 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import pytest - -from QEfficient.cloud.infer import main as infer - - -@pytest.mark.on_qaic -@pytest.mark.cli -@pytest.mark.multimodal -@pytest.mark.usefixtures("clean_up_after_test") -def test_vlm_cli(setup, mocker): - ms = setup - # Taking some values from setup fixture and assigning other's based on model's requirement. - # For example, mxint8 is not required for VLM models, so assigning False. 
- infer( - model_name="llava-hf/llava-1.5-7b-hf", - num_cores=ms.num_cores, - prompt="Describe the image.", - prompts_txt_file_path=None, - aic_enable_depth_first=ms.aic_enable_depth_first, - mos=ms.mos, - batch_size=1, - full_batch_size=None, - prompt_len=1024, - ctx_len=2048, - generation_len=20, - mxfp6=False, - mxint8=False, - local_model_dir=None, - cache_dir=None, - hf_token=ms.hf_token, - enable_qnn=False, - qnn_config=None, - image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", - ) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..ba0f341fe --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,65 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import json +import os +import shutil + +import pytest +from transformers import AutoConfig + +from QEfficient.utils.constants import QEFF_MODELS_DIR +from QEfficient.utils.logging_utils import logger +from QEfficient.utils.test_utils import ModelConfig + + +def get_custom_model_config_dict(configs): + """ + Converts a list of custom model configuration dictionaries into a dictionary + mapping model names to their corresponding AutoConfig objects. + + Args: + configs (List[Dict]): A list of dictionaries, each containing model configuration parameters. + + Returns: + Dict[str, AutoConfig]: A dictionary where keys are model names and values are AutoConfig objects. + """ + config_dict = {} + for config in configs: + model_name = config["model_name"] + config_dict[model_name] = AutoConfig.from_pretrained( + model_name, + trust_remote_code=config["model_name"] in ModelConfig.EXTERNAL_MODELS, + **config.get("additional_params", {}), + ) + return config_dict + + +# Pytest fixture to load custom model configs from a JSON file +@pytest.fixture(scope="session") +def custom_causal_model_config_dict(): + with open("tests/transformers/models/custom_tiny_model_configs.json", "r") as f: + custom_model_configs_data = json.load(f) + return get_custom_model_config_dict(custom_model_configs_data) + + +def qeff_models_clean_up(): + if os.path.exists(QEFF_MODELS_DIR): + shutil.rmtree(QEFF_MODELS_DIR) + logger.info(f"\n.............Cleaned up {QEFF_MODELS_DIR}") + + +def pytest_sessionstart(session): + logger.info("PYTEST Session Starting ...") + qeff_models_clean_up() + + +def pytest_sessionfinish(session, exitstatus): + inside_worker = getattr(session.config, "workerinput", None) + if inside_worker is None: + qeff_models_clean_up() + logger.info("...PYTEST Session Ended.") diff --git a/tests/transformers/models/custom_tiny_model_configs.json b/tests/transformers/models/custom_tiny_model_configs.json new file mode 100644 index 000000000..2da2c95a3 --- /dev/null +++ b/tests/transformers/models/custom_tiny_model_configs.json @@ -0,0 +1,317 @@ +[ + { + "model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "gpt2", + "model_type": "gpt2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 
256, + "vocab_size": 50257, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Salesforce/codegen-350M-mono", + "model_type": "codegen", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 4, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 51200, + "num_key_value_heads": 1, + "rotary_dim": 16 + } + }, + { + "model_name": "microsoft/Phi-3-mini-4k-instruct", + "model_type": "phi3", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32064, + "num_key_value_heads": 1 + } + }, + { + "model_name": "tiiuae/falcon-7b", + "model_type": "falcon", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 65024, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Qwen/Qwen2-0.5B", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936, + "num_key_value_heads": 1 + } + }, + { + "model_name": "bigcode/starcoder2-3b", + "model_type": "starcoder2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49152, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Felladrin/Minueza-32M-Base", + "model_type": "mistral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32002, + "num_key_value_heads": 1 + } + }, + { + "model_name": "wtang06/mpt-125m-c4", + "model_type": "mpt", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50368 + } + }, + { + "model_name": "hakurei/gpt-j-random-tinier", + "model_type": "gptj", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50400, + "num_key_value_heads": 1, + "rotary_dim": 16 + } + }, + { + "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_type": "mixtral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "meta-llama/Llama-3.2-1B", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + }, + { + "model_name": "unsloth/gemma-2b", + "model_type": "gemma", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "unsloth/gemma-2-2b", + "model_type": "gemma2", + 
"additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32003 + } + }, + { + "model_name": "TheBloke/Llama-2-7B-GPTQ", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000 + } + }, + { + "model_name": "ibm-granite/granite-20b-code-base", + "model_type": "gpt_bigcode", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49152, + "num_key_value_heads": 1, + "activation_function": "gelu", + "architectures": [ + "GPTBigCodeForCausalLM" + ] + } + }, + { + "model_name": "neuralmagic/Llama-3.2-3B-Instruct-FP8", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256 + } + }, + { + "model_name": "neuralmagic/Qwen2-0.5B-Instruct-FP8", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936 + } + }, + { + "model_name": "ibm-granite/granite-3.1-2b-instruct", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + }, + { + "model_name": "ibm-granite/granite-guardian-3.1-2b", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + }, + { + "model_name": "hpcai-tech/grok-1", + "model_type": null, + "additional_params":{ + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 131072, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", + "model_type": null, + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "hidden_size": 256, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_layers": 1, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + } +] diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 3195c4828..77354ee23 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -5,13 +5,14 @@ # # ----------------------------------------------------------------------------- +import copy import os from typing import Optional import numpy as np 
import pytest import torch -from transformers import AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM @@ -21,9 +22,9 @@ from QEfficient.utils.constants import Constants, QnnConstants from QEfficient.utils.device_utils import get_available_device_id from QEfficient.utils.run_utils import ApiRunner +from QEfficient.utils.test_utils import ModelConfig -extrenal_models = {"hpcai-tech/grok-1"} -test_models_qaic = [ +test_models_causal = [ "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "gpt2", "Salesforce/codegen-350M-mono", @@ -47,6 +48,7 @@ "ibm-granite/granite-3.1-2b-instruct", "ibm-granite/granite-guardian-3.1-2b", "hpcai-tech/grok-1", + "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", ] test_models_qnn = [ @@ -56,35 +58,73 @@ "ibm-granite/granite-guardian-3.1-2b", ] -spd_test_models = [ +test_models_spd = [ "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "Qwen/Qwen2-0.5B", ] -def load_causal_lm_model(model_config): +def get_custom_n_layers(model_name): + """ + Function to set number layers of the variuos types of models such as swiftkv models and others + -------- + + :model_name: str + + :return n_layer + """ + if model_name in {"microsoft/Phi-3-mini-4k-instruct", "neuralmagic/Qwen2-0.5B-Instruct-FP8"}: + return 2 + elif model_name in ModelConfig.SWIFTKV_MODELS: + return None + return 1 + + +def load_causal_lm_model(model_name, n_layer=1, config=None): """ Function to load model from huggingface and transform to KV model -------- - :model_config: Dict + :model_name: str + :n_layer: int + :config: Autoconfig :return model_hf, params """ + torch.manual_seed(42) model_path = hf_download( - repo_id=model_config["model_name"], + repo_id=model_name, ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], ) - model_hf = AutoModelForCausalLM.from_pretrained( - model_path, - use_cache=True, - num_hidden_layers=model_config["n_layer"], - attn_implementation="eager", - low_cpu_mem_usage=False, - trust_remote_code=model_config["model_name"] in extrenal_models, - ) - # Convert to FP32 if model is in BF16 - if getattr(model_hf.config, "torch_dtype", None) == torch.bfloat16: + if config is None: # If custom config is not provided, load the model config from Hugging Face + if n_layer is not None: + # If n_layer is specified, load the model with that many layers + model_hf = AutoModelForCausalLM.from_pretrained( + model_path, + use_cache=True, + num_hidden_layers=n_layer, + attn_implementation="eager", + low_cpu_mem_usage=False, + trust_remote_code=model_name in ModelConfig.EXTERNAL_MODELS, + ) + else: + # If n_layer is not specified, load the model without specifying the number of layers + model_hf = AutoModelForCausalLM.from_pretrained( + model_path, + use_cache=True, + attn_implementation="eager", + low_cpu_mem_usage=False, + trust_remote_code=model_name in ModelConfig.EXTERNAL_MODELS, + ) + else: # If custom config is provided, load the model using the config + model_hf = AutoModelForCausalLM.from_config( + config, + attn_implementation="eager", + trust_remote_code=model_name in ModelConfig.EXTERNAL_MODELS, + ) + # Convert to FP32 if model is in BF16 or in FP16 + torch_dtype = getattr(model_hf.config, "torch_dtype", None) + if torch_dtype == torch.bfloat16 or torch_dtype == torch.float16: model_hf = model_hf.to(torch.float32) params = sum(p.numel() for p in model_hf.parameters()) @@ -101,6 +141,7 @@ 
def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( prefill_only: Optional[bool] = None, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + config: Optional[AutoConfig] = None, ): """ Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. @@ -111,10 +152,10 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( :n_layers (int): Number of layers for the Model. """ replace_transformers_quantizers() - model_config = {"model_name": model_name} - model_config["n_layer"] = n_layer - - model_hf, _ = load_causal_lm_model(model_config) + if config is None: + model_hf, _ = load_causal_lm_model(model_name, n_layer=n_layer) + else: + model_hf, _ = load_causal_lm_model(model_name, config=config) tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) config = model_hf.config @@ -128,16 +169,19 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.CTX_LEN, ) - pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) + if model_name not in ModelConfig.SWIFTKV_MODELS: + pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) is_tlm = False if num_speculative_tokens is None else True - qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm, pretrained_model_name_or_path=model_name) - + qeff_model = QEFFAutoModelForCausalLM( + copy.deepcopy(model_hf), is_tlm=is_tlm, pretrained_model_name_or_path=model_name + ) pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) - assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( - "Tokens don't match for HF PyTorch model output and KV PyTorch model output" - ) + if model_name not in ModelConfig.SWIFTKV_MODELS: + assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( + "Tokens don't match for HF PyTorch model output and KV PyTorch model output" + ) onnx_model_path = qeff_model.export() ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm) @@ -147,7 +191,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, @@ -174,8 +217,8 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) if prefill_only is not None: return + # testing for CB models - model_hf, _ = load_causal_lm_model(model_config) full_batch_size = 4 fbs_prompts = Constants.INPUT_STR * 4 api_runner = ApiRunner( @@ -188,8 +231,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( full_batch_size, ) - pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf) - pytorch_hf_tokens = np.vstack(pytorch_hf_tokens) + if model_name not in ModelConfig.SWIFTKV_MODELS: + pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf) + pytorch_hf_tokens = np.vstack(pytorch_hf_tokens) qeff_model = QEFFAutoModelForCausalLM( model_hf, continuous_batching=True, is_tlm=is_tlm, pretrained_model_name_or_path=model_name @@ -213,21 +257,28 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ) exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) - assert all( - [ - all(pt_token[:24] == cloud_token[:24]) - for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) - ] - ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." 
+ if model_name in ModelConfig.SWIFTKV_MODELS: + assert all( + [ + all(ort_token[:24] == cloud_token[:24]) + for ort_token, cloud_token in zip(ort_tokens, exec_info_fbs.generated_ids) + ] + ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." + else: + assert all( + [ + all(pt_token[:24] == cloud_token[:24]) + for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) + ] + ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) # FIXME: there should be a CB test here @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x) def test_causal_lm_export_with_deprecated_api(model_name): - model_config = {"model_name": model_name} - model_config["n_layer"] = 1 - model, _ = load_causal_lm_model(model_config) + model, _ = load_causal_lm_model(model_name, n_layer=1) tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name) new_api_onnx_model_path = qeff_model.export() @@ -253,58 +304,106 @@ def test_causal_lm_export_with_deprecated_api(model_name): @pytest.mark.on_qaic -@pytest.mark.parametrize("model_name", test_models_qaic) +@pytest.mark.regular +@pytest.mark.parametrize("model_name", test_models_causal) +def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, custom_causal_model_config_dict): + """ + Test function to validate the dummy PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + config = custom_causal_model_config_dict.get(model_name) + + if model_name in ModelConfig.QUANTIZED_MODELS: + n_layer = get_custom_n_layers(model_name) + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer) + else: + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, config=config) + + +@pytest.mark.nightly +@pytest.mark.on_qaic +@pytest.mark.parametrize("model_name", test_models_causal) def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - if model_name in {"microsoft/Phi-3-mini-4k-instruct", "neuralmagic/Qwen2-0.5B-Instruct-FP8"}: - n_layer = 2 # test only 2 layer models - else: - n_layer = 1 + n_layer = get_custom_n_layers(model_name) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) +@pytest.mark.on_qaic +@pytest.mark.regular +@pytest.mark.qnn +@pytest.mark.parametrize("model_name", test_models_qnn) +def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, custom_causal_model_config_dict): + """ + QNN Setup + Test function to validate the dummy PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 
+ ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + config = custom_causal_model_config_dict.get(model_name) + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name, enable_qnn=True, qnn_config=qnn_config_json_path, config=config + ) + + +@pytest.mark.nightly @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.parametrize("model_name", test_models_qnn) def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): """ - QNN Compilation Test + QNN Setup Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - if model_name == "microsoft/Phi-3-mini-4k-instruct": - n_layer = 2 # test only 2 layer models - else: - n_layer = 1 - qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + n_layer = get_custom_n_layers(model_name) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path ) -@pytest.mark.skip() # remove when the SDK 1.20.0 issue solved for compiling this model +@pytest.mark.regular @pytest.mark.on_qaic -@pytest.mark.parametrize("model_name", spd_test_models) -def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): +@pytest.mark.qnn +@pytest.mark.parametrize("model_name", test_models_spd) +def test_custom_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, custom_causal_model_config_dict): """ - Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + Test function to validate the dummy PyTorch model for speculative decoding, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ + config = custom_causal_model_config_dict.get(model_name) - if model_name == "microsoft/Phi-3-mini-4k-instruct": - n_layer = 2 # test only 2 layer models - else: - n_layer = 1 + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, + num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS, + config=config, + ) + + +@pytest.mark.nightly +@pytest.mark.on_qaic +@pytest.mark.parametrize("model_name", test_models_spd) +def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): + """ + Test function to validate the PyTorch model for speculative decoding, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + n_layer = get_custom_n_layers(model_name) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, n_layer=n_layer, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS
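)


Note (illustrative, not part of the patch): the new "regular" tests rely on the config-driven branch added to load_causal_lm_model, which builds a tiny random-weight model from an AutoConfig produced by the custom_causal_model_config_dict fixture instead of downloading full checkpoints. A minimal sketch of that path is below, assuming transformers and torch are installed and the Hugging Face hub is reachable for the base config; the model name and parameter values simply mirror the TinyLlama entry in tests/transformers/models/custom_tiny_model_configs.json.

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM

    # Tiny llama-style config mirroring the TinyLlama entry in
    # custom_tiny_model_configs.json (1 layer, 2 heads, hidden size 64).
    config = AutoConfig.from_pretrained(
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        max_position_embeddings=128,
        num_hidden_layers=1,
        num_attention_heads=2,
        hidden_size=64,
        intermediate_size=256,
        vocab_size=32000,
        num_key_value_heads=1,
    )

    # Random-weight model built from the config (no checkpoint download),
    # matching the `config is not None` branch of load_causal_lm_model.
    torch.manual_seed(42)
    model = AutoModelForCausalLM.from_config(config, attn_implementation="eager")
    print(sum(p.numel() for p in model.parameters()), "parameters")

Such a tiny model exercises the same export/compile/execute flow as the full checkpoint while keeping the @pytest.mark.regular runs fast; the full-size models remain covered by the @pytest.mark.nightly variants, which the updated Jenkins stages exclude via the '(not nightly)' marker expression.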