From b9fd90fa07f8c8dce0da3e6f75138719f1bf9a22 Mon Sep 17 00:00:00 2001 From: Akshay Sonawane Date: Thu, 2 Apr 2026 22:46:45 +0000 Subject: [PATCH 1/8] Add webgpu recipes for qwen3.5 models --- Qwen-Qwen3.5-0.8B/builtin/optimize.py | 5 +- .../builtin/webgpu/embedding.json | 58 ++++++++++++ Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json | 33 +++++++ Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json | 89 +++++++++++++++++++ Qwen-Qwen3.5-2B/builtin/optimize.py | 5 +- Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json | 58 ++++++++++++ Qwen-Qwen3.5-2B/builtin/webgpu/text.json | 33 +++++++ Qwen-Qwen3.5-2B/builtin/webgpu/vision.json | 89 +++++++++++++++++++ Qwen-Qwen3.5-4B/builtin/optimize.py | 5 +- Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json | 58 ++++++++++++ Qwen-Qwen3.5-4B/builtin/webgpu/text.json | 33 +++++++ Qwen-Qwen3.5-4B/builtin/webgpu/vision.json | 89 +++++++++++++++++++ Qwen-Qwen3.5-9B/builtin/optimize.py | 5 +- Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json | 58 ++++++++++++ Qwen-Qwen3.5-9B/builtin/webgpu/text.json | 33 +++++++ Qwen-Qwen3.5-9B/builtin/webgpu/vision.json | 89 +++++++++++++++++++ 16 files changed, 736 insertions(+), 4 deletions(-) create mode 100644 Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json create mode 100644 Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json create mode 100644 Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json create mode 100644 Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json create mode 100644 Qwen-Qwen3.5-2B/builtin/webgpu/text.json create mode 100644 Qwen-Qwen3.5-2B/builtin/webgpu/vision.json create mode 100644 Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json create mode 100644 Qwen-Qwen3.5-4B/builtin/webgpu/text.json create mode 100644 Qwen-Qwen3.5-4B/builtin/webgpu/vision.json create mode 100644 Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json create mode 100644 Qwen-Qwen3.5-9B/builtin/webgpu/text.json create mode 100644 Qwen-Qwen3.5-9B/builtin/webgpu/vision.json diff --git a/Qwen-Qwen3.5-0.8B/builtin/optimize.py b/Qwen-Qwen3.5-0.8B/builtin/optimize.py index 245f41e5..a6998590 100644 --- a/Qwen-Qwen3.5-0.8B/builtin/optimize.py +++ b/Qwen-Qwen3.5-0.8B/builtin/optimize.py @@ -46,6 +46,9 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): vision_provider_options = [ {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}} ] + elif device == "webgpu": + provider_options = [{"webgpu": {}}] + vision_provider_options = [{"webgpu": {}}] else: provider_options = [] vision_provider_options = [] @@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR): def main(): parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models") - parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu") + parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu") parser.add_argument("--config-dir", default="cpu_and_mobile") parser.add_argument("--skip-export", action="store_true") parser.add_argument("--models-dir", default=None) diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json new file mode 100644 index 00000000..854afde8 --- /dev/null +++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json @@ -0,0 +1,58 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-0.8B", + "model_loader": "get_embedding_model", + "model_script": "user_script.py", + "io_config": "get_embedding_io_config", + "dummy_inputs_func": "get_embedding_dummy_inputs" + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "use_dynamo_exporter": false + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "", + "opt_level": 1, + "only_onnxruntime": true + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "gemm2mm": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "GemmToMatMulAdd" + } + ] + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true, + "external_data_name": "embedding.onnx.data" + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/embedding.onnx" +} diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json new file mode 100644 index 00000000..8faa45b3 --- /dev/null +++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json @@ -0,0 +1,33 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-0.8B" + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "int4_block_size": 32, + "extra_options": { + "filename": "text.onnx", + "prune_lm_head": true + } + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/text.onnx" +} diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json new file mode 100644 index 00000000..2223890b --- /dev/null +++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json @@ -0,0 +1,89 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-0.8B", + "model_loader": "get_vision_model", + "model_script": "user_script.py", + "io_config": "get_vision_io_config", + "dummy_inputs_func": "get_vision_dummy_inputs" + }, + "passes": { + "c": { + "type": "OnnxConversion", + "use_dynamo_exporter": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "PackedAttentionToLoopMHA" + }, + { + "surgeon": "ReciprocalMulToDiv" + }, + { + "surgeon": "RenameOutputDims", + "output_idx": 0, + "dim_idx": 0, + "dim_name": "num_logical_patches" + } + ] + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "vit", + "opt_level": 2, + "only_onnxruntime": true + }, + "dedup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateSubgraphInitializers" + } + ] + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "op_block_list": [ + "LayerNormalization", + "Range" + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + }, + "cleanup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateNodes" + } + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/vision.onnx" +} diff --git a/Qwen-Qwen3.5-2B/builtin/optimize.py b/Qwen-Qwen3.5-2B/builtin/optimize.py index 245f41e5..a6998590 100644 --- a/Qwen-Qwen3.5-2B/builtin/optimize.py +++ b/Qwen-Qwen3.5-2B/builtin/optimize.py @@ -46,6 +46,9 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): vision_provider_options = [ {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}} ] + elif device == "webgpu": + provider_options = [{"webgpu": {}}] + vision_provider_options = [{"webgpu": {}}] else: provider_options = [] vision_provider_options = [] @@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR): def main(): parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models") - parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu") + parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu") parser.add_argument("--config-dir", default="cpu_and_mobile") parser.add_argument("--skip-export", action="store_true") parser.add_argument("--models-dir", default=None) diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json new file mode 100644 index 00000000..fd89671a --- /dev/null +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json @@ -0,0 +1,58 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-2B", + "model_loader": "get_embedding_model", + "model_script": "user_script.py", + "io_config": "get_embedding_io_config", + "dummy_inputs_func": "get_embedding_dummy_inputs" + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "use_dynamo_exporter": false + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "", + "opt_level": 1, + "only_onnxruntime": true + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "gemm2mm": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "GemmToMatMulAdd" + } + ] + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true, + "external_data_name": "embedding.onnx.data" + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/embedding.onnx" +} diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json new file mode 100644 index 00000000..62f57815 --- /dev/null +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json @@ -0,0 +1,33 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-2B" + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "int4_block_size": 32, + "extra_options": { + "filename": "text.onnx", + "prune_lm_head": true + } + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/text.onnx" +} diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json new file mode 100644 index 00000000..f1933eb3 --- /dev/null +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json @@ -0,0 +1,89 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-2B", + "model_loader": "get_vision_model", + "model_script": "user_script.py", + "io_config": "get_vision_io_config", + "dummy_inputs_func": "get_vision_dummy_inputs" + }, + "passes": { + "c": { + "type": "OnnxConversion", + "use_dynamo_exporter": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "PackedAttentionToLoopMHA" + }, + { + "surgeon": "ReciprocalMulToDiv" + }, + { + "surgeon": "RenameOutputDims", + "output_idx": 0, + "dim_idx": 0, + "dim_name": "num_logical_patches" + } + ] + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "vit", + "opt_level": 2, + "only_onnxruntime": true + }, + "dedup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateSubgraphInitializers" + } + ] + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "op_block_list": [ + "LayerNormalization", + "Range" + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + }, + "cleanup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateNodes" + } + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/vision.onnx" +} diff --git a/Qwen-Qwen3.5-4B/builtin/optimize.py b/Qwen-Qwen3.5-4B/builtin/optimize.py index 245f41e5..a6998590 100644 --- a/Qwen-Qwen3.5-4B/builtin/optimize.py +++ b/Qwen-Qwen3.5-4B/builtin/optimize.py @@ -46,6 +46,9 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): vision_provider_options = [ {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}} ] + elif device == "webgpu": + provider_options = [{"webgpu": {}}] + vision_provider_options = [{"webgpu": {}}] else: provider_options = [] vision_provider_options = [] @@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR): def main(): parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models") - parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu") + parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu") parser.add_argument("--config-dir", default="cpu_and_mobile") parser.add_argument("--skip-export", action="store_true") parser.add_argument("--models-dir", default=None) diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json new file mode 100644 index 00000000..950cd40b --- /dev/null +++ b/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json @@ -0,0 +1,58 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-4B", + "model_loader": "get_embedding_model", + "model_script": "user_script.py", + "io_config": "get_embedding_io_config", + "dummy_inputs_func": "get_embedding_dummy_inputs" + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "use_dynamo_exporter": false + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "", + "opt_level": 1, + "only_onnxruntime": true + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "gemm2mm": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "GemmToMatMulAdd" + } + ] + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true, + "external_data_name": "embedding.onnx.data" + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/embedding.onnx" +} diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json new file mode 100644 index 00000000..c8cfbd31 --- /dev/null +++ b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json @@ -0,0 +1,33 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-4B" + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "int4_block_size": 32, + "extra_options": { + "filename": "text.onnx", + "prune_lm_head": true + } + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/text.onnx" +} diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json new file mode 100644 index 00000000..21782186 --- /dev/null +++ b/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json @@ -0,0 +1,89 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-4B", + "model_loader": "get_vision_model", + "model_script": "user_script.py", + "io_config": "get_vision_io_config", + "dummy_inputs_func": "get_vision_dummy_inputs" + }, + "passes": { + "c": { + "type": "OnnxConversion", + "use_dynamo_exporter": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "PackedAttentionToLoopMHA" + }, + { + "surgeon": "ReciprocalMulToDiv" + }, + { + "surgeon": "RenameOutputDims", + "output_idx": 0, + "dim_idx": 0, + "dim_name": "num_logical_patches" + } + ] + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "vit", + "opt_level": 2, + "only_onnxruntime": true + }, + "dedup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateSubgraphInitializers" + } + ] + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "op_block_list": [ + "LayerNormalization", + "Range" + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + }, + "cleanup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateNodes" + } + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/vision.onnx" +} diff --git a/Qwen-Qwen3.5-9B/builtin/optimize.py b/Qwen-Qwen3.5-9B/builtin/optimize.py index 245f41e5..a6998590 100644 --- a/Qwen-Qwen3.5-9B/builtin/optimize.py +++ b/Qwen-Qwen3.5-9B/builtin/optimize.py @@ -46,6 +46,9 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): vision_provider_options = [ {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}} ] + elif device == "webgpu": + provider_options = [{"webgpu": {}}] + vision_provider_options = [{"webgpu": {}}] else: provider_options = [] vision_provider_options = [] @@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR): def main(): parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models") - parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu") + parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu") parser.add_argument("--config-dir", default="cpu_and_mobile") parser.add_argument("--skip-export", action="store_true") parser.add_argument("--models-dir", default=None) diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json new file mode 100644 index 00000000..95ee0c99 --- /dev/null +++ b/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json @@ -0,0 +1,58 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-9B", + "model_loader": "get_embedding_model", + "model_script": "user_script.py", + "io_config": "get_embedding_io_config", + "dummy_inputs_func": "get_embedding_dummy_inputs" + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "use_dynamo_exporter": false + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "", + "opt_level": 1, + "only_onnxruntime": true + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "gemm2mm": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "GemmToMatMulAdd" + } + ] + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true, + "external_data_name": "embedding.onnx.data" + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/embedding.onnx" +} diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json new file mode 100644 index 00000000..aef39302 --- /dev/null +++ b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json @@ -0,0 +1,33 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-9B" + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "int4_block_size": 32, + "extra_options": { + "filename": "text.onnx", + "prune_lm_head": true + } + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/text.onnx" +} diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json new file mode 100644 index 00000000..5fbba4e2 --- /dev/null +++ b/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json @@ -0,0 +1,89 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-9B", + "model_loader": "get_vision_model", + "model_script": "user_script.py", + "io_config": "get_vision_io_config", + "dummy_inputs_func": "get_vision_dummy_inputs" + }, + "passes": { + "c": { + "type": "OnnxConversion", + "use_dynamo_exporter": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "PackedAttentionToLoopMHA" + }, + { + "surgeon": "ReciprocalMulToDiv" + }, + { + "surgeon": "RenameOutputDims", + "output_idx": 0, + "dim_idx": 0, + "dim_name": "num_logical_patches" + } + ] + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "vit", + "opt_level": 2, + "only_onnxruntime": true + }, + "dedup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateSubgraphInitializers" + } + ] + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "op_block_list": [ + "LayerNormalization", + "Range" + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + }, + "cleanup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateNodes" + } + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/vision.onnx" +} From d1cd7fc93a257c47086654afcf431e1aac395c96 Mon Sep 17 00:00:00 2001 From: Akshay Sonawane Date: Thu, 2 Apr 2026 23:14:57 +0000 Subject: [PATCH 2/8] Update session options --- Qwen-Qwen3.5-0.8B/builtin/optimize.py | 2 ++ Qwen-Qwen3.5-2B/builtin/optimize.py | 2 ++ Qwen-Qwen3.5-4B/builtin/optimize.py | 2 ++ Qwen-Qwen3.5-9B/builtin/optimize.py | 2 ++ 4 files changed, 8 insertions(+) diff --git a/Qwen-Qwen3.5-0.8B/builtin/optimize.py b/Qwen-Qwen3.5-0.8B/builtin/optimize.py index a6998590..a73685c5 100644 --- a/Qwen-Qwen3.5-0.8B/builtin/optimize.py +++ b/Qwen-Qwen3.5-0.8B/builtin/optimize.py @@ -56,6 +56,8 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options} vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options} + config["model"]["decoder"]["session_options"] = session_options + config["model"]["embedding"] = { "filename": "embedding.onnx", "inputs": {"input_ids": "input_ids", "image_features": "image_features"}, diff --git a/Qwen-Qwen3.5-2B/builtin/optimize.py b/Qwen-Qwen3.5-2B/builtin/optimize.py index a6998590..a73685c5 100644 --- a/Qwen-Qwen3.5-2B/builtin/optimize.py +++ b/Qwen-Qwen3.5-2B/builtin/optimize.py @@ -56,6 +56,8 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options} vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options} + config["model"]["decoder"]["session_options"] = session_options + config["model"]["embedding"] = { "filename": "embedding.onnx", "inputs": {"input_ids": "input_ids", "image_features": "image_features"}, diff --git a/Qwen-Qwen3.5-4B/builtin/optimize.py b/Qwen-Qwen3.5-4B/builtin/optimize.py index a6998590..a73685c5 100644 --- a/Qwen-Qwen3.5-4B/builtin/optimize.py +++ b/Qwen-Qwen3.5-4B/builtin/optimize.py @@ -56,6 +56,8 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options} vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options} + config["model"]["decoder"]["session_options"] = session_options + config["model"]["embedding"] = { "filename": "embedding.onnx", "inputs": {"input_ids": "input_ids", "image_features": "image_features"}, diff --git a/Qwen-Qwen3.5-9B/builtin/optimize.py b/Qwen-Qwen3.5-9B/builtin/optimize.py index a6998590..a73685c5 100644 --- a/Qwen-Qwen3.5-9B/builtin/optimize.py +++ b/Qwen-Qwen3.5-9B/builtin/optimize.py @@ -56,6 +56,8 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options} vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options} + config["model"]["decoder"]["session_options"] = session_options + config["model"]["embedding"] = { "filename": "embedding.onnx", "inputs": {"input_ids": "input_ids", "image_features": "image_features"}, From 5e11fd7ed8e2ef6f663cd6e46357d2b30614c6bd Mon Sep 17 00:00:00 2001 From: Akshay Sonawane Date: Mon, 6 Apr 2026 20:16:08 -0700 Subject: [PATCH 3/8] Update block_size for cpu models --- Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json | 1 + Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json | 1 + Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json | 1 + Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json | 1 + 4 files changed, 4 insertions(+) diff --git a/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json index a139e9a2..f5f7d772 100644 --- a/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json +++ b/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json @@ -7,6 +7,7 @@ "m": { "type": "ModelBuilder", "precision": "int4", + "int4_block_size": 128, "int4_accuracy_level": 4, "extra_options": { "filename": "text.onnx" diff --git a/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json index c1adffbf..d166d66a 100644 --- a/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json +++ b/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json @@ -7,6 +7,7 @@ "m": { "type": "ModelBuilder", "precision": "int4", + "int4_block_size": 128, "int4_accuracy_level": 4, "extra_options": { "filename": "text.onnx" diff --git a/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json index 2429198b..9fe3898b 100644 --- a/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json +++ b/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json @@ -7,6 +7,7 @@ "m": { "type": "ModelBuilder", "precision": "int4", + "int4_block_size": 128, "int4_accuracy_level": 4, "extra_options": { "filename": "text.onnx" diff --git a/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json index d82c9035..404733e6 100644 --- a/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json +++ b/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json @@ -7,6 +7,7 @@ "m": { "type": "ModelBuilder", "precision": "int4", + "int4_block_size": 128, "int4_accuracy_level": 4, "extra_options": { "filename": "text.onnx" From e7552a082eec48c9da9d91a5be914f09ce73c317 Mon Sep 17 00:00:00 2001 From: Akshay Sonawane Date: Wed, 8 Apr 2026 12:38:42 -0700 Subject: [PATCH 4/8] Do not override context_length --- Qwen-Qwen3.5-0.8B/builtin/optimize.py | 2 -- Qwen-Qwen3.5-2B/builtin/optimize.py | 2 -- Qwen-Qwen3.5-4B/builtin/optimize.py | 2 -- Qwen-Qwen3.5-9B/builtin/optimize.py | 2 -- 4 files changed, 8 deletions(-) diff --git a/Qwen-Qwen3.5-0.8B/builtin/optimize.py b/Qwen-Qwen3.5-0.8B/builtin/optimize.py index a73685c5..c3b9d9d3 100644 --- a/Qwen-Qwen3.5-0.8B/builtin/optimize.py +++ b/Qwen-Qwen3.5-0.8B/builtin/optimize.py @@ -77,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): } config["model"]["bos_token_id"] = 248044 - config["model"]["context_length"] = 4096 config["model"]["eos_token_id"] = [248044] config["model"]["pad_token_id"] = 248044 config["model"]["image_token_id"] = 248056 config["model"]["video_token_id"] = 248057 config["model"]["vision_start_token_id"] = 248053 - config["search"]["max_length"] = 4096 config["search"]["top_k"] = 1 if config["search"].get("top_p") is None: config["search"]["top_p"] = 1.0 diff --git a/Qwen-Qwen3.5-2B/builtin/optimize.py b/Qwen-Qwen3.5-2B/builtin/optimize.py index a73685c5..c3b9d9d3 100644 --- a/Qwen-Qwen3.5-2B/builtin/optimize.py +++ b/Qwen-Qwen3.5-2B/builtin/optimize.py @@ -77,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): } config["model"]["bos_token_id"] = 248044 - config["model"]["context_length"] = 4096 config["model"]["eos_token_id"] = [248044] config["model"]["pad_token_id"] = 248044 config["model"]["image_token_id"] = 248056 config["model"]["video_token_id"] = 248057 config["model"]["vision_start_token_id"] = 248053 - config["search"]["max_length"] = 4096 config["search"]["top_k"] = 1 if config["search"].get("top_p") is None: config["search"]["top_p"] = 1.0 diff --git a/Qwen-Qwen3.5-4B/builtin/optimize.py b/Qwen-Qwen3.5-4B/builtin/optimize.py index a73685c5..c3b9d9d3 100644 --- a/Qwen-Qwen3.5-4B/builtin/optimize.py +++ b/Qwen-Qwen3.5-4B/builtin/optimize.py @@ -77,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): } config["model"]["bos_token_id"] = 248044 - config["model"]["context_length"] = 4096 config["model"]["eos_token_id"] = [248044] config["model"]["pad_token_id"] = 248044 config["model"]["image_token_id"] = 248056 config["model"]["video_token_id"] = 248057 config["model"]["vision_start_token_id"] = 248053 - config["search"]["max_length"] = 4096 config["search"]["top_k"] = 1 if config["search"].get("top_p") is None: config["search"]["top_p"] = 1.0 diff --git a/Qwen-Qwen3.5-9B/builtin/optimize.py b/Qwen-Qwen3.5-9B/builtin/optimize.py index a73685c5..c3b9d9d3 100644 --- a/Qwen-Qwen3.5-9B/builtin/optimize.py +++ b/Qwen-Qwen3.5-9B/builtin/optimize.py @@ -77,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): } config["model"]["bos_token_id"] = 248044 - config["model"]["context_length"] = 4096 config["model"]["eos_token_id"] = [248044] config["model"]["pad_token_id"] = 248044 config["model"]["image_token_id"] = 248056 config["model"]["video_token_id"] = 248057 config["model"]["vision_start_token_id"] = 248053 - config["search"]["max_length"] = 4096 config["search"]["top_k"] = 1 if config["search"].get("top_p") is None: config["search"]["top_p"] = 1.0 From 4f12db20722f684a9366973f3cf0c64bd1836c7e Mon Sep 17 00:00:00 2001 From: Akshay Sonawane Date: Wed, 22 Apr 2026 22:42:03 +0000 Subject: [PATCH 5/8] Remove engine from webgpu --- Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json | 13 ------------- Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json | 14 -------------- Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json | 13 ------------- Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json | 13 ------------- Qwen-Qwen3.5-2B/builtin/webgpu/text.json | 14 -------------- Qwen-Qwen3.5-2B/builtin/webgpu/vision.json | 13 ------------- Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json | 13 ------------- Qwen-Qwen3.5-4B/builtin/webgpu/text.json | 14 -------------- Qwen-Qwen3.5-4B/builtin/webgpu/vision.json | 13 ------------- Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json | 13 ------------- Qwen-Qwen3.5-9B/builtin/webgpu/text.json | 14 -------------- Qwen-Qwen3.5-9B/builtin/webgpu/vision.json | 13 ------------- 12 files changed, 160 deletions(-) diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json index 854afde8..291aa464 100644 --- a/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json +++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json @@ -40,19 +40,6 @@ "external_data_name": "embedding.onnx.data" } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/embedding.onnx" } diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json index 8faa45b3..c8647228 100644 --- a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json +++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json @@ -8,26 +8,12 @@ "type": "ModelBuilder", "precision": "int4", "int4_accuracy_level": 4, - "int4_block_size": 32, "extra_options": { "filename": "text.onnx", "prune_lm_head": true } } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/text.onnx" } diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json index 2223890b..6e6ef5fc 100644 --- a/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json +++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json @@ -71,19 +71,6 @@ "external_data_name": "vision.onnx.data" } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/vision.onnx" } diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json index fd89671a..4490a781 100644 --- a/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json @@ -40,19 +40,6 @@ "external_data_name": "embedding.onnx.data" } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/embedding.onnx" } diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json index 62f57815..3b23c40f 100644 --- a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json @@ -8,26 +8,12 @@ "type": "ModelBuilder", "precision": "int4", "int4_accuracy_level": 4, - "int4_block_size": 32, "extra_options": { "filename": "text.onnx", "prune_lm_head": true } } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/text.onnx" } diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json index f1933eb3..bded6bf3 100644 --- a/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json @@ -71,19 +71,6 @@ "external_data_name": "vision.onnx.data" } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/vision.onnx" } diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json index 950cd40b..97e64916 100644 --- a/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json +++ b/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json @@ -40,19 +40,6 @@ "external_data_name": "embedding.onnx.data" } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/embedding.onnx" } diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json index c8cfbd31..47aef6e3 100644 --- a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json +++ b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json @@ -8,26 +8,12 @@ "type": "ModelBuilder", "precision": "int4", "int4_accuracy_level": 4, - "int4_block_size": 32, "extra_options": { "filename": "text.onnx", "prune_lm_head": true } } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/text.onnx" } diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json index 21782186..1389d3da 100644 --- a/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json +++ b/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json @@ -71,19 +71,6 @@ "external_data_name": "vision.onnx.data" } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/vision.onnx" } diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json index 95ee0c99..0389dc30 100644 --- a/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json +++ b/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json @@ -40,19 +40,6 @@ "external_data_name": "embedding.onnx.data" } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/embedding.onnx" } diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json index aef39302..38808e3f 100644 --- a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json +++ b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json @@ -8,26 +8,12 @@ "type": "ModelBuilder", "precision": "int4", "int4_accuracy_level": 4, - "int4_block_size": 32, "extra_options": { "filename": "text.onnx", "prune_lm_head": true } } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/text.onnx" } diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json index 5fbba4e2..dfe0772f 100644 --- a/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json +++ b/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json @@ -71,19 +71,6 @@ "external_data_name": "vision.onnx.data" } }, - "engine": { - "target": { - "type": "LocalSystem", - "accelerators": [ - { - "device": "gpu", - "execution_providers": [ - "WebGpuExecutionProvider" - ] - } - ] - } - }, "no_artifacts": true, "output_dir": "webgpu/models/vision.onnx" } From f8e6859b4dd621a08fed7252ab91104e3726b1c6 Mon Sep 17 00:00:00 2001 From: Akshay Sonawane Date: Wed, 22 Apr 2026 23:07:05 +0000 Subject: [PATCH 6/8] remove int4_block_size --- Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json | 1 - Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json | 1 - Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json | 1 - Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json | 1 - 4 files changed, 4 deletions(-) diff --git a/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json index f5f7d772..a139e9a2 100644 --- a/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json +++ b/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json @@ -7,7 +7,6 @@ "m": { "type": "ModelBuilder", "precision": "int4", - "int4_block_size": 128, "int4_accuracy_level": 4, "extra_options": { "filename": "text.onnx" diff --git a/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json index d166d66a..c1adffbf 100644 --- a/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json +++ b/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json @@ -7,7 +7,6 @@ "m": { "type": "ModelBuilder", "precision": "int4", - "int4_block_size": 128, "int4_accuracy_level": 4, "extra_options": { "filename": "text.onnx" diff --git a/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json index 9fe3898b..2429198b 100644 --- a/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json +++ b/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json @@ -7,7 +7,6 @@ "m": { "type": "ModelBuilder", "precision": "int4", - "int4_block_size": 128, "int4_accuracy_level": 4, "extra_options": { "filename": "text.onnx" diff --git a/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json index 404733e6..d82c9035 100644 --- a/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json +++ b/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json @@ -7,7 +7,6 @@ "m": { "type": "ModelBuilder", "precision": "int4", - "int4_block_size": 128, "int4_accuracy_level": 4, "extra_options": { "filename": "text.onnx" From ebf81572f871a25719e5bb3603c9f8f21279e432 Mon Sep 17 00:00:00 2001 From: Akshay Sonawane Date: Wed, 22 Apr 2026 23:36:19 +0000 Subject: [PATCH 7/8] Update recipes --- Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json | 3 ++- Qwen-Qwen3.5-2B/builtin/webgpu/text.json | 5 ++++- Qwen-Qwen3.5-4B/builtin/webgpu/text.json | 5 ++++- Qwen-Qwen3.5-9B/builtin/webgpu/text.json | 3 ++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json index c8647228..de121e31 100644 --- a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json +++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json @@ -10,7 +10,8 @@ "int4_accuracy_level": 4, "extra_options": { "filename": "text.onnx", - "prune_lm_head": true + "prune_lm_head": true, + "quant_mode": "default" } } }, diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json index 3b23c40f..7e1accfc 100644 --- a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json @@ -8,9 +8,12 @@ "type": "ModelBuilder", "precision": "int4", "int4_accuracy_level": 4, + "int4_algo_config": "k_quant_mixed", + "int4_block_size": 128, "extra_options": { "filename": "text.onnx", - "prune_lm_head": true + "prune_lm_head": true, + "quant_mode": "int4" } } }, diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json index 47aef6e3..6e79cba5 100644 --- a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json +++ b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json @@ -8,9 +8,12 @@ "type": "ModelBuilder", "precision": "int4", "int4_accuracy_level": 4, + "int4_block_size": 32, + "int4_algo_config": "rtn", "extra_options": { "filename": "text.onnx", - "prune_lm_head": true + "prune_lm_head": true, + "quant_mode": "int4" } } }, diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json index 38808e3f..9f8b9488 100644 --- a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json +++ b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json @@ -10,7 +10,8 @@ "int4_accuracy_level": 4, "extra_options": { "filename": "text.onnx", - "prune_lm_head": true + "prune_lm_head": true, + "quant_mode": "int4" } } }, From 02903aa8c1810bbffa4beb2b7f33d5104d89729e Mon Sep 17 00:00:00 2001 From: Akshay Sonawane Date: Thu, 23 Apr 2026 22:07:04 +0000 Subject: [PATCH 8/8] Update block_size --- Qwen-Qwen3.5-2B/builtin/webgpu/text.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json index 7e1accfc..a7976f7d 100644 --- a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json @@ -9,7 +9,7 @@ "precision": "int4", "int4_accuracy_level": 4, "int4_algo_config": "k_quant_mixed", - "int4_block_size": 128, + "int4_block_size": 32, "extra_options": { "filename": "text.onnx", "prune_lm_head": true,