From b9fd90fa07f8c8dce0da3e6f75138719f1bf9a22 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Thu, 2 Apr 2026 22:46:45 +0000
Subject: [PATCH 1/8] Add WebGPU recipes for Qwen3.5 models

---
 Qwen-Qwen3.5-0.8B/builtin/optimize.py         |  5 +-
 .../builtin/webgpu/embedding.json             | 58 ++++++++++++
 Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json    | 33 +++++++
 Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json  | 89 +++++++++++++++++++
 Qwen-Qwen3.5-2B/builtin/optimize.py           |  5 +-
 Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json | 58 ++++++++++++
 Qwen-Qwen3.5-2B/builtin/webgpu/text.json      | 33 +++++++
 Qwen-Qwen3.5-2B/builtin/webgpu/vision.json    | 89 +++++++++++++++++++
 Qwen-Qwen3.5-4B/builtin/optimize.py           |  5 +-
 Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json | 58 ++++++++++++
 Qwen-Qwen3.5-4B/builtin/webgpu/text.json      | 33 +++++++
 Qwen-Qwen3.5-4B/builtin/webgpu/vision.json    | 89 +++++++++++++++++++
 Qwen-Qwen3.5-9B/builtin/optimize.py           |  5 +-
 Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json | 58 ++++++++++++
 Qwen-Qwen3.5-9B/builtin/webgpu/text.json      | 33 +++++++
 Qwen-Qwen3.5-9B/builtin/webgpu/vision.json    | 89 +++++++++++++++++++
 16 files changed, 736 insertions(+), 4 deletions(-)
 create mode 100644 Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json
 create mode 100644 Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
 create mode 100644 Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json
 create mode 100644 Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json
 create mode 100644 Qwen-Qwen3.5-2B/builtin/webgpu/text.json
 create mode 100644 Qwen-Qwen3.5-2B/builtin/webgpu/vision.json
 create mode 100644 Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json
 create mode 100644 Qwen-Qwen3.5-4B/builtin/webgpu/text.json
 create mode 100644 Qwen-Qwen3.5-4B/builtin/webgpu/vision.json
 create mode 100644 Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json
 create mode 100644 Qwen-Qwen3.5-9B/builtin/webgpu/text.json
 create mode 100644 Qwen-Qwen3.5-9B/builtin/webgpu/vision.json

diff --git a/Qwen-Qwen3.5-0.8B/builtin/optimize.py b/Qwen-Qwen3.5-0.8B/builtin/optimize.py
index 245f41e5..a6998590 100644
--- a/Qwen-Qwen3.5-0.8B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-0.8B/builtin/optimize.py
@@ -46,6 +46,9 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
         vision_provider_options = [
             {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}}
         ]
+    elif device == "webgpu":
+        provider_options = [{"webgpu": {}}]
+        vision_provider_options = [{"webgpu": {}}]
     else:
         provider_options = []
         vision_provider_options = []
@@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR):
 
 def main():
     parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models")
-    parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu")
+    parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu")
     parser.add_argument("--config-dir", default="cpu_and_mobile")
     parser.add_argument("--skip-export", action="store_true")
     parser.add_argument("--models-dir", default=None)
diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json
new file mode 100644
index 00000000..854afde8
--- /dev/null
+++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json
@@ -0,0 +1,58 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "Qwen/Qwen3.5-0.8B",
+        "model_loader": "get_embedding_model",
+        "model_script": "user_script.py",
+        "io_config": "get_embedding_io_config",
+        "dummy_inputs_func": "get_embedding_dummy_inputs"
+    },
+    "passes": {
+        "convert": {
+            "type": "OnnxConversion",
+            "use_dynamo_exporter": false
+        },
+        "ort": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "",
+            "opt_level": 1,
+            "only_onnxruntime": true
+        },
+        "cast": {
+            "type": "OnnxPeepholeOptimizer",
+            "onnxscript_optimize": false,
+            "onnxoptimizer_optimize": false,
+            "fuse_reshape_operations": false,
+            "fix_com_microsoft_opset": true,
+            "cast_chain_elimination": true
+        },
+        "gemm2mm": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "GemmToMatMulAdd"
+                }
+            ]
+        },
+        "fp16": {
+            "type": "OnnxFloatToFloat16",
+            "save_as_external_data": true,
+            "external_data_name": "embedding.onnx.data"
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/embedding.onnx"
+}
diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
new file mode 100644
index 00000000..8faa45b3
--- /dev/null
+++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
@@ -0,0 +1,33 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "Qwen/Qwen3.5-0.8B"
+    },
+    "passes": {
+        "m": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "int4_accuracy_level": 4,
+            "int4_block_size": 32,
+            "extra_options": {
+                "filename": "text.onnx",
+                "prune_lm_head": true
+            }
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/text.onnx"
+}
diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json
new file mode 100644
index 00000000..2223890b
--- /dev/null
+++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json
@@ -0,0 +1,89 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "Qwen/Qwen3.5-0.8B",
+        "model_loader": "get_vision_model",
+        "model_script": "user_script.py",
+        "io_config": "get_vision_io_config",
+        "dummy_inputs_func": "get_vision_dummy_inputs"
+    },
+    "passes": {
+        "c": {
+            "type": "OnnxConversion",
+            "use_dynamo_exporter": true
+        },
+        "gs": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "PackedAttentionToLoopMHA"
+                },
+                {
+                    "surgeon": "ReciprocalMulToDiv"
+                },
+                {
+                    "surgeon": "RenameOutputDims",
+                    "output_idx": 0,
+                    "dim_idx": 0,
+                    "dim_name": "num_logical_patches"
+                }
+            ]
+        },
+        "ort": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "vit",
+            "opt_level": 2,
+            "only_onnxruntime": true
+        },
+        "dedup": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "DeduplicateSubgraphInitializers"
+                }
+            ]
+        },
+        "cast": {
+            "type": "OnnxPeepholeOptimizer",
+            "onnxscript_optimize": false,
+            "onnxoptimizer_optimize": false,
+            "fuse_reshape_operations": false,
+            "fix_com_microsoft_opset": true,
+            "cast_chain_elimination": true
+        },
+        "fp16": {
+            "type": "OnnxFloatToFloat16",
+            "op_block_list": [
+                "LayerNormalization",
+                "Range"
+            ],
+            "save_as_external_data": true,
+            "external_data_name": "vision.onnx.data"
+        },
+        "cleanup": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "DeduplicateNodes"
+                }
+            ],
+            "save_as_external_data": true,
+            "external_data_name": "vision.onnx.data"
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/vision.onnx"
+}
diff --git a/Qwen-Qwen3.5-2B/builtin/optimize.py b/Qwen-Qwen3.5-2B/builtin/optimize.py
index 245f41e5..a6998590 100644
--- a/Qwen-Qwen3.5-2B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-2B/builtin/optimize.py
@@ -46,6 +46,9 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
         vision_provider_options = [
             {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}}
         ]
+    elif device == "webgpu":
+        provider_options = [{"webgpu": {}}]
+        vision_provider_options = [{"webgpu": {}}]
     else:
         provider_options = []
         vision_provider_options = []
@@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR):
 
 def main():
     parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models")
-    parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu")
+    parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu")
     parser.add_argument("--config-dir", default="cpu_and_mobile")
     parser.add_argument("--skip-export", action="store_true")
     parser.add_argument("--models-dir", default=None)
diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json
new file mode 100644
index 00000000..fd89671a
--- /dev/null
+++ b/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json
@@ -0,0 +1,58 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "Qwen/Qwen3.5-2B",
+        "model_loader": "get_embedding_model",
+        "model_script": "user_script.py",
+        "io_config": "get_embedding_io_config",
+        "dummy_inputs_func": "get_embedding_dummy_inputs"
+    },
+    "passes": {
+        "convert": {
+            "type": "OnnxConversion",
+            "use_dynamo_exporter": false
+        },
+        "ort": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "",
+            "opt_level": 1,
+            "only_onnxruntime": true
+        },
+        "cast": {
+            "type": "OnnxPeepholeOptimizer",
+            "onnxscript_optimize": false,
+            "onnxoptimizer_optimize": false,
+            "fuse_reshape_operations": false,
+            "fix_com_microsoft_opset": true,
+            "cast_chain_elimination": true
+        },
+        "gemm2mm": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "GemmToMatMulAdd"
+                }
+            ]
+        },
+        "fp16": {
+            "type": "OnnxFloatToFloat16",
+            "save_as_external_data": true,
+            "external_data_name": "embedding.onnx.data"
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/embedding.onnx"
+}
diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
new file mode 100644
index 00000000..62f57815
--- /dev/null
+++ b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
@@ -0,0 +1,33 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "Qwen/Qwen3.5-2B"
+    },
+    "passes": {
+        "m": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "int4_accuracy_level": 4,
+            "int4_block_size": 32,
+            "extra_options": {
+                "filename": "text.onnx",
+                "prune_lm_head": true
+            }
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/text.onnx"
+}
diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json
new file mode 100644
index 00000000..f1933eb3
--- /dev/null
+++ b/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json
@@ -0,0 +1,89 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "Qwen/Qwen3.5-2B",
+        "model_loader": "get_vision_model",
+        "model_script": "user_script.py",
+        "io_config": "get_vision_io_config",
+        "dummy_inputs_func": "get_vision_dummy_inputs"
+    },
+    "passes": {
+        "c": {
+            "type": "OnnxConversion",
+            "use_dynamo_exporter": true
+        },
+        "gs": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "PackedAttentionToLoopMHA"
+                },
+                {
+                    "surgeon": "ReciprocalMulToDiv"
+                },
+                {
+                    "surgeon": "RenameOutputDims",
+                    "output_idx": 0,
+                    "dim_idx": 0,
+                    "dim_name": "num_logical_patches"
+                }
+            ]
+        },
+        "ort": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "vit",
+            "opt_level": 2,
+            "only_onnxruntime": true
+        },
+        "dedup": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "DeduplicateSubgraphInitializers"
+                }
+            ]
+        },
+        "cast": {
+            "type": "OnnxPeepholeOptimizer",
+            "onnxscript_optimize": false,
+            "onnxoptimizer_optimize": false,
+            "fuse_reshape_operations": false,
+            "fix_com_microsoft_opset": true,
+            "cast_chain_elimination": true
+        },
+        "fp16": {
+            "type": "OnnxFloatToFloat16",
+            "op_block_list": [
+                "LayerNormalization",
+                "Range"
+            ],
+            "save_as_external_data": true,
+            "external_data_name": "vision.onnx.data"
+        },
+        "cleanup": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "DeduplicateNodes"
+                }
+            ],
+            "save_as_external_data": true,
+            "external_data_name": "vision.onnx.data"
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/vision.onnx"
+}
diff --git a/Qwen-Qwen3.5-4B/builtin/optimize.py b/Qwen-Qwen3.5-4B/builtin/optimize.py
index 245f41e5..a6998590 100644
--- a/Qwen-Qwen3.5-4B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-4B/builtin/optimize.py
@@ -46,6 +46,9 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
         vision_provider_options = [
             {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}}
         ]
+    elif device == "webgpu":
+        provider_options = [{"webgpu": {}}]
+        vision_provider_options = [{"webgpu": {}}]
     else:
         provider_options = []
         vision_provider_options = []
@@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR):
 
 def main():
     parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models")
-    parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu")
+    parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu")
     parser.add_argument("--config-dir", default="cpu_and_mobile")
     parser.add_argument("--skip-export", action="store_true")
     parser.add_argument("--models-dir", default=None)
diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json
new file mode 100644
index 00000000..950cd40b
--- /dev/null
+++ b/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json
@@ -0,0 +1,58 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "Qwen/Qwen3.5-4B",
+        "model_loader": "get_embedding_model",
+        "model_script": "user_script.py",
+        "io_config": "get_embedding_io_config",
+        "dummy_inputs_func": "get_embedding_dummy_inputs"
+    },
+    "passes": {
+        "convert": {
+            "type": "OnnxConversion",
+            "use_dynamo_exporter": false
+        },
+        "ort": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "",
+            "opt_level": 1,
+            "only_onnxruntime": true
+        },
+        "cast": {
+            "type": "OnnxPeepholeOptimizer",
+            "onnxscript_optimize": false,
+            "onnxoptimizer_optimize": false,
+            "fuse_reshape_operations": false,
+            "fix_com_microsoft_opset": true,
+            "cast_chain_elimination": true
+        },
+        "gemm2mm": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "GemmToMatMulAdd"
+                }
+            ]
+        },
+        "fp16": {
+            "type": "OnnxFloatToFloat16",
+            "save_as_external_data": true,
+            "external_data_name": "embedding.onnx.data"
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/embedding.onnx"
+}
diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json
new file mode 100644
index 00000000..c8cfbd31
--- /dev/null
+++ b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json
@@ -0,0 +1,33 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "Qwen/Qwen3.5-4B"
+    },
+    "passes": {
+        "m": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "int4_accuracy_level": 4,
+            "int4_block_size": 32,
+            "extra_options": {
+                "filename": "text.onnx",
+                "prune_lm_head": true
+            }
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/text.onnx"
+}
diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json
new file mode 100644
index 00000000..21782186
--- /dev/null
+++ b/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json
@@ -0,0 +1,89 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "Qwen/Qwen3.5-4B",
+        "model_loader": "get_vision_model",
+        "model_script": "user_script.py",
+        "io_config": "get_vision_io_config",
+        "dummy_inputs_func": "get_vision_dummy_inputs"
+    },
+    "passes": {
+        "c": {
+            "type": "OnnxConversion",
+            "use_dynamo_exporter": true
+        },
+        "gs": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "PackedAttentionToLoopMHA"
+                },
+                {
+                    "surgeon": "ReciprocalMulToDiv"
+                },
+                {
+                    "surgeon": "RenameOutputDims",
+                    "output_idx": 0,
+                    "dim_idx": 0,
+                    "dim_name": "num_logical_patches"
+                }
+            ]
+        },
+        "ort": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "vit",
+            "opt_level": 2,
+            "only_onnxruntime": true
+        },
+        "dedup": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "DeduplicateSubgraphInitializers"
+                }
+            ]
+        },
+        "cast": {
+            "type": "OnnxPeepholeOptimizer",
+            "onnxscript_optimize": false,
+            "onnxoptimizer_optimize": false,
+            "fuse_reshape_operations": false,
+            "fix_com_microsoft_opset": true,
+            "cast_chain_elimination": true
+        },
+        "fp16": {
+            "type": "OnnxFloatToFloat16",
+            "op_block_list": [
+                "LayerNormalization",
+                "Range"
+            ],
+            "save_as_external_data": true,
+            "external_data_name": "vision.onnx.data"
+        },
+        "cleanup": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "DeduplicateNodes"
+                }
+            ],
+            "save_as_external_data": true,
+            "external_data_name": "vision.onnx.data"
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/vision.onnx"
+}
diff --git a/Qwen-Qwen3.5-9B/builtin/optimize.py b/Qwen-Qwen3.5-9B/builtin/optimize.py
index 245f41e5..a6998590 100644
--- a/Qwen-Qwen3.5-9B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-9B/builtin/optimize.py
@@ -46,6 +46,9 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
         vision_provider_options = [
             {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}}
         ]
+    elif device == "webgpu":
+        provider_options = [{"webgpu": {}}]
+        vision_provider_options = [{"webgpu": {}}]
     else:
         provider_options = []
         vision_provider_options = []
@@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR):
 
 def main():
     parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models")
-    parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu")
+    parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu")
     parser.add_argument("--config-dir", default="cpu_and_mobile")
     parser.add_argument("--skip-export", action="store_true")
     parser.add_argument("--models-dir", default=None)
diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json
new file mode 100644
index 00000000..95ee0c99
--- /dev/null
+++ b/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json
@@ -0,0 +1,58 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "Qwen/Qwen3.5-9B",
+        "model_loader": "get_embedding_model",
+        "model_script": "user_script.py",
+        "io_config": "get_embedding_io_config",
+        "dummy_inputs_func": "get_embedding_dummy_inputs"
+    },
+    "passes": {
+        "convert": {
+            "type": "OnnxConversion",
+            "use_dynamo_exporter": false
+        },
+        "ort": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "",
+            "opt_level": 1,
+            "only_onnxruntime": true
+        },
+        "cast": {
+            "type": "OnnxPeepholeOptimizer",
+            "onnxscript_optimize": false,
+            "onnxoptimizer_optimize": false,
+            "fuse_reshape_operations": false,
+            "fix_com_microsoft_opset": true,
+            "cast_chain_elimination": true
+        },
+        "gemm2mm": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "GemmToMatMulAdd"
+                }
+            ]
+        },
+        "fp16": {
+            "type": "OnnxFloatToFloat16",
+            "save_as_external_data": true,
+            "external_data_name": "embedding.onnx.data"
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/embedding.onnx"
+}
diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json
new file mode 100644
index 00000000..aef39302
--- /dev/null
+++ b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json
@@ -0,0 +1,33 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "Qwen/Qwen3.5-9B"
+    },
+    "passes": {
+        "m": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "int4_accuracy_level": 4,
+            "int4_block_size": 32,
+            "extra_options": {
+                "filename": "text.onnx",
+                "prune_lm_head": true
+            }
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/text.onnx"
+}
diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json
new file mode 100644
index 00000000..5fbba4e2
--- /dev/null
+++ b/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json
@@ -0,0 +1,89 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "Qwen/Qwen3.5-9B",
+        "model_loader": "get_vision_model",
+        "model_script": "user_script.py",
+        "io_config": "get_vision_io_config",
+        "dummy_inputs_func": "get_vision_dummy_inputs"
+    },
+    "passes": {
+        "c": {
+            "type": "OnnxConversion",
+            "use_dynamo_exporter": true
+        },
+        "gs": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "PackedAttentionToLoopMHA"
+                },
+                {
+                    "surgeon": "ReciprocalMulToDiv"
+                },
+                {
+                    "surgeon": "RenameOutputDims",
+                    "output_idx": 0,
+                    "dim_idx": 0,
+                    "dim_name": "num_logical_patches"
+                }
+            ]
+        },
+        "ort": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "vit",
+            "opt_level": 2,
+            "only_onnxruntime": true
+        },
+        "dedup": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "DeduplicateSubgraphInitializers"
+                }
+            ]
+        },
+        "cast": {
+            "type": "OnnxPeepholeOptimizer",
+            "onnxscript_optimize": false,
+            "onnxoptimizer_optimize": false,
+            "fuse_reshape_operations": false,
+            "fix_com_microsoft_opset": true,
+            "cast_chain_elimination": true
+        },
+        "fp16": {
+            "type": "OnnxFloatToFloat16",
+            "op_block_list": [
+                "LayerNormalization",
+                "Range"
+            ],
+            "save_as_external_data": true,
+            "external_data_name": "vision.onnx.data"
+        },
+        "cleanup": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {
+                    "surgeon": "DeduplicateNodes"
+                }
+            ],
+            "save_as_external_data": true,
+            "external_data_name": "vision.onnx.data"
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "WebGpuExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "webgpu/models/vision.onnx"
+}

From d1cd7fc93a257c47086654afcf431e1aac395c96 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Thu, 2 Apr 2026 23:14:57 +0000
Subject: [PATCH 2/8] Set decoder session options in genai config

---
 Qwen-Qwen3.5-0.8B/builtin/optimize.py | 2 ++
 Qwen-Qwen3.5-2B/builtin/optimize.py   | 2 ++
 Qwen-Qwen3.5-4B/builtin/optimize.py   | 2 ++
 Qwen-Qwen3.5-9B/builtin/optimize.py   | 2 ++
 4 files changed, 8 insertions(+)

diff --git a/Qwen-Qwen3.5-0.8B/builtin/optimize.py b/Qwen-Qwen3.5-0.8B/builtin/optimize.py
index a6998590..a73685c5 100644
--- a/Qwen-Qwen3.5-0.8B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-0.8B/builtin/optimize.py
@@ -56,6 +56,8 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
     session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options}
     vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options}
 
+    config["model"]["decoder"]["session_options"] = session_options
+
     config["model"]["embedding"] = {
         "filename": "embedding.onnx",
         "inputs": {"input_ids": "input_ids", "image_features": "image_features"},
diff --git a/Qwen-Qwen3.5-2B/builtin/optimize.py b/Qwen-Qwen3.5-2B/builtin/optimize.py
index a6998590..a73685c5 100644
--- a/Qwen-Qwen3.5-2B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-2B/builtin/optimize.py
@@ -56,6 +56,8 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
     session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options}
     vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options}
 
+    config["model"]["decoder"]["session_options"] = session_options
+
     config["model"]["embedding"] = {
         "filename": "embedding.onnx",
         "inputs": {"input_ids": "input_ids", "image_features": "image_features"},
diff --git a/Qwen-Qwen3.5-4B/builtin/optimize.py b/Qwen-Qwen3.5-4B/builtin/optimize.py
index a6998590..a73685c5 100644
--- a/Qwen-Qwen3.5-4B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-4B/builtin/optimize.py
@@ -56,6 +56,8 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
     session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options}
     vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options}
 
+    config["model"]["decoder"]["session_options"] = session_options
+
     config["model"]["embedding"] = {
         "filename": "embedding.onnx",
         "inputs": {"input_ids": "input_ids", "image_features": "image_features"},
diff --git a/Qwen-Qwen3.5-9B/builtin/optimize.py b/Qwen-Qwen3.5-9B/builtin/optimize.py
index a6998590..a73685c5 100644
--- a/Qwen-Qwen3.5-9B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-9B/builtin/optimize.py
@@ -56,6 +56,8 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
     session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options}
     vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options}
 
+    config["model"]["decoder"]["session_options"] = session_options
+
     config["model"]["embedding"] = {
         "filename": "embedding.onnx",
         "inputs": {"input_ids": "input_ids", "image_features": "image_features"},

From 5e11fd7ed8e2ef6f663cd6e46357d2b30614c6bd Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Mon, 6 Apr 2026 20:16:08 -0700
Subject: [PATCH 3/8] Set int4_block_size to 128 for cpu models

---
 Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json | 1 +
 Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json   | 1 +
 Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json   | 1 +
 Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json   | 1 +
 4 files changed, 4 insertions(+)

diff --git a/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json
index a139e9a2..f5f7d772 100644
--- a/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json
+++ b/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json
@@ -7,6 +7,7 @@
         "m": {
             "type": "ModelBuilder",
             "precision": "int4",
+            "int4_block_size": 128,
             "int4_accuracy_level": 4,
             "extra_options": {
                 "filename": "text.onnx"
diff --git a/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json
index c1adffbf..d166d66a 100644
--- a/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json
+++ b/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json
@@ -7,6 +7,7 @@
         "m": {
             "type": "ModelBuilder",
             "precision": "int4",
+            "int4_block_size": 128,
             "int4_accuracy_level": 4,
             "extra_options": {
                 "filename": "text.onnx"
diff --git a/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json
index 2429198b..9fe3898b 100644
--- a/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json
+++ b/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json
@@ -7,6 +7,7 @@
         "m": {
             "type": "ModelBuilder",
             "precision": "int4",
+            "int4_block_size": 128,
             "int4_accuracy_level": 4,
             "extra_options": {
                 "filename": "text.onnx"
diff --git a/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json
index d82c9035..404733e6 100644
--- a/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json
+++ b/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json
@@ -7,6 +7,7 @@
         "m": {
             "type": "ModelBuilder",
             "precision": "int4",
+            "int4_block_size": 128,
             "int4_accuracy_level": 4,
             "extra_options": {
                 "filename": "text.onnx"

From e7552a082eec48c9da9d91a5be914f09ce73c317 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 8 Apr 2026 12:38:42 -0700
Subject: [PATCH 4/8] Do not override context_length or max_length

---
 Qwen-Qwen3.5-0.8B/builtin/optimize.py | 2 --
 Qwen-Qwen3.5-2B/builtin/optimize.py   | 2 --
 Qwen-Qwen3.5-4B/builtin/optimize.py   | 2 --
 Qwen-Qwen3.5-9B/builtin/optimize.py   | 2 --
 4 files changed, 8 deletions(-)

diff --git a/Qwen-Qwen3.5-0.8B/builtin/optimize.py b/Qwen-Qwen3.5-0.8B/builtin/optimize.py
index a73685c5..c3b9d9d3 100644
--- a/Qwen-Qwen3.5-0.8B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-0.8B/builtin/optimize.py
@@ -77,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
     }
 
     config["model"]["bos_token_id"] = 248044
-    config["model"]["context_length"] = 4096
     config["model"]["eos_token_id"] = [248044]
     config["model"]["pad_token_id"] = 248044
     config["model"]["image_token_id"] = 248056
     config["model"]["video_token_id"] = 248057
     config["model"]["vision_start_token_id"] = 248053
 
-    config["search"]["max_length"] = 4096
     config["search"]["top_k"] = 1
     if config["search"].get("top_p") is None:
         config["search"]["top_p"] = 1.0
diff --git a/Qwen-Qwen3.5-2B/builtin/optimize.py b/Qwen-Qwen3.5-2B/builtin/optimize.py
index a73685c5..c3b9d9d3 100644
--- a/Qwen-Qwen3.5-2B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-2B/builtin/optimize.py
@@ -77,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
     }
 
     config["model"]["bos_token_id"] = 248044
-    config["model"]["context_length"] = 4096
     config["model"]["eos_token_id"] = [248044]
     config["model"]["pad_token_id"] = 248044
     config["model"]["image_token_id"] = 248056
     config["model"]["video_token_id"] = 248057
     config["model"]["vision_start_token_id"] = 248053
 
-    config["search"]["max_length"] = 4096
     config["search"]["top_k"] = 1
     if config["search"].get("top_p") is None:
         config["search"]["top_p"] = 1.0
diff --git a/Qwen-Qwen3.5-4B/builtin/optimize.py b/Qwen-Qwen3.5-4B/builtin/optimize.py
index a73685c5..c3b9d9d3 100644
--- a/Qwen-Qwen3.5-4B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-4B/builtin/optimize.py
@@ -77,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
     }
 
     config["model"]["bos_token_id"] = 248044
-    config["model"]["context_length"] = 4096
     config["model"]["eos_token_id"] = [248044]
     config["model"]["pad_token_id"] = 248044
     config["model"]["image_token_id"] = 248056
     config["model"]["video_token_id"] = 248057
     config["model"]["vision_start_token_id"] = 248053
 
-    config["search"]["max_length"] = 4096
     config["search"]["top_k"] = 1
     if config["search"].get("top_p") is None:
         config["search"]["top_p"] = 1.0
diff --git a/Qwen-Qwen3.5-9B/builtin/optimize.py b/Qwen-Qwen3.5-9B/builtin/optimize.py
index a73685c5..c3b9d9d3 100644
--- a/Qwen-Qwen3.5-9B/builtin/optimize.py
+++ b/Qwen-Qwen3.5-9B/builtin/optimize.py
@@ -77,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
     }
 
     config["model"]["bos_token_id"] = 248044
-    config["model"]["context_length"] = 4096
     config["model"]["eos_token_id"] = [248044]
     config["model"]["pad_token_id"] = 248044
     config["model"]["image_token_id"] = 248056
     config["model"]["video_token_id"] = 248057
     config["model"]["vision_start_token_id"] = 248053
 
-    config["search"]["max_length"] = 4096
     config["search"]["top_k"] = 1
     if config["search"].get("top_p") is None:
         config["search"]["top_p"] = 1.0

From 4f12db20722f684a9366973f3cf0c64bd1836c7e Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 22 Apr 2026 22:42:03 +0000
Subject: [PATCH 5/8] Remove engine from webgpu recipes and int4_block_size from text.json

---
 Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json | 13 -------------
 Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json      | 14 --------------
 Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json    | 13 -------------
 Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json   | 13 -------------
 Qwen-Qwen3.5-2B/builtin/webgpu/text.json        | 14 --------------
 Qwen-Qwen3.5-2B/builtin/webgpu/vision.json      | 13 -------------
 Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json   | 13 -------------
 Qwen-Qwen3.5-4B/builtin/webgpu/text.json        | 14 --------------
 Qwen-Qwen3.5-4B/builtin/webgpu/vision.json      | 13 -------------
 Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json   | 13 -------------
 Qwen-Qwen3.5-9B/builtin/webgpu/text.json        | 14 --------------
 Qwen-Qwen3.5-9B/builtin/webgpu/vision.json      | 13 -------------
 12 files changed, 160 deletions(-)

diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json
index 854afde8..291aa464 100644
--- a/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json
+++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json
@@ -40,19 +40,6 @@
             "external_data_name": "embedding.onnx.data"
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/embedding.onnx"
 }
diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
index 8faa45b3..c8647228 100644
--- a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
+++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
@@ -8,26 +8,12 @@
             "type": "ModelBuilder",
             "precision": "int4",
             "int4_accuracy_level": 4,
-            "int4_block_size": 32,
             "extra_options": {
                 "filename": "text.onnx",
                 "prune_lm_head": true
             }
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/text.onnx"
 }
diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json
index 2223890b..6e6ef5fc 100644
--- a/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json
+++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json
@@ -71,19 +71,6 @@
             "external_data_name": "vision.onnx.data"
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/vision.onnx"
 }
diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json
index fd89671a..4490a781 100644
--- a/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json
+++ b/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json
@@ -40,19 +40,6 @@
             "external_data_name": "embedding.onnx.data"
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/embedding.onnx"
 }
diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
index 62f57815..3b23c40f 100644
--- a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
+++ b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
@@ -8,26 +8,12 @@
             "type": "ModelBuilder",
             "precision": "int4",
             "int4_accuracy_level": 4,
-            "int4_block_size": 32,
             "extra_options": {
                 "filename": "text.onnx",
                 "prune_lm_head": true
             }
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/text.onnx"
 }
diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json
index f1933eb3..bded6bf3 100644
--- a/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json
+++ b/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json
@@ -71,19 +71,6 @@
             "external_data_name": "vision.onnx.data"
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/vision.onnx"
 }
diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json
index 950cd40b..97e64916 100644
--- a/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json
+++ b/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json
@@ -40,19 +40,6 @@
             "external_data_name": "embedding.onnx.data"
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/embedding.onnx"
 }
diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json
index c8cfbd31..47aef6e3 100644
--- a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json
+++ b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json
@@ -8,26 +8,12 @@
             "type": "ModelBuilder",
             "precision": "int4",
             "int4_accuracy_level": 4,
-            "int4_block_size": 32,
             "extra_options": {
                 "filename": "text.onnx",
                 "prune_lm_head": true
             }
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/text.onnx"
 }
diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json
index 21782186..1389d3da 100644
--- a/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json
+++ b/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json
@@ -71,19 +71,6 @@
             "external_data_name": "vision.onnx.data"
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/vision.onnx"
 }
diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json
index 95ee0c99..0389dc30 100644
--- a/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json
+++ b/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json
@@ -40,19 +40,6 @@
             "external_data_name": "embedding.onnx.data"
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/embedding.onnx"
 }
diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json
index aef39302..38808e3f 100644
--- a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json
+++ b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json
@@ -8,26 +8,12 @@
             "type": "ModelBuilder",
             "precision": "int4",
             "int4_accuracy_level": 4,
-            "int4_block_size": 32,
             "extra_options": {
                 "filename": "text.onnx",
                 "prune_lm_head": true
             }
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/text.onnx"
 }
diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json
index 5fbba4e2..dfe0772f 100644
--- a/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json
+++ b/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json
@@ -71,19 +71,6 @@
             "external_data_name": "vision.onnx.data"
         }
     },
-    "engine": {
-        "target": {
-            "type": "LocalSystem",
-            "accelerators": [
-                {
-                    "device": "gpu",
-                    "execution_providers": [
-                        "WebGpuExecutionProvider"
-                    ]
-                }
-            ]
-        }
-    },
     "no_artifacts": true,
     "output_dir": "webgpu/models/vision.onnx"
 }

From f8e6859b4dd621a08fed7252ab91104e3726b1c6 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 22 Apr 2026 23:07:05 +0000
Subject: [PATCH 6/8] Remove int4_block_size from cpu_and_mobile text recipes

---
 Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json | 1 -
 Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json   | 1 -
 Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json   | 1 -
 Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json   | 1 -
 4 files changed, 4 deletions(-)

diff --git a/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json
index f5f7d772..a139e9a2 100644
--- a/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json
+++ b/Qwen-Qwen3.5-0.8B/builtin/cpu_and_mobile/text.json
@@ -7,7 +7,6 @@
         "m": {
             "type": "ModelBuilder",
             "precision": "int4",
-            "int4_block_size": 128,
             "int4_accuracy_level": 4,
             "extra_options": {
                 "filename": "text.onnx"
diff --git a/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json
index d166d66a..c1adffbf 100644
--- a/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json
+++ b/Qwen-Qwen3.5-2B/builtin/cpu_and_mobile/text.json
@@ -7,7 +7,6 @@
         "m": {
             "type": "ModelBuilder",
             "precision": "int4",
-            "int4_block_size": 128,
             "int4_accuracy_level": 4,
             "extra_options": {
                 "filename": "text.onnx"
diff --git a/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json
index 9fe3898b..2429198b 100644
--- a/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json
+++ b/Qwen-Qwen3.5-4B/builtin/cpu_and_mobile/text.json
@@ -7,7 +7,6 @@
         "m": {
             "type": "ModelBuilder",
             "precision": "int4",
-            "int4_block_size": 128,
             "int4_accuracy_level": 4,
             "extra_options": {
                 "filename": "text.onnx"
diff --git a/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json b/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json
index 404733e6..d82c9035 100644
--- a/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json
+++ b/Qwen-Qwen3.5-9B/builtin/cpu_and_mobile/text.json
@@ -7,7 +7,6 @@
         "m": {
             "type": "ModelBuilder",
             "precision": "int4",
-            "int4_block_size": 128,
             "int4_accuracy_level": 4,
             "extra_options": {
                 "filename": "text.onnx"

From ebf81572f871a25719e5bb3603c9f8f21279e432 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 22 Apr 2026 23:36:19 +0000
Subject: [PATCH 7/8] Set quant_mode and int4 quantization options in webgpu text recipes

---
 Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json | 3 ++-
 Qwen-Qwen3.5-2B/builtin/webgpu/text.json   | 5 ++++-
 Qwen-Qwen3.5-4B/builtin/webgpu/text.json   | 5 ++++-
 Qwen-Qwen3.5-9B/builtin/webgpu/text.json   | 3 ++-
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
index c8647228..de121e31 100644
--- a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
+++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
@@ -10,7 +10,8 @@
             "int4_accuracy_level": 4,
             "extra_options": {
                 "filename": "text.onnx",
-                "prune_lm_head": true
+                "prune_lm_head": true,
+                "quant_mode": "default"
             }
         }
     },
diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
index 3b23c40f..7e1accfc 100644
--- a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
+++ b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
@@ -8,9 +8,12 @@
             "type": "ModelBuilder",
             "precision": "int4",
             "int4_accuracy_level": 4,
+            "int4_algo_config": "k_quant_mixed",
+            "int4_block_size": 128,
             "extra_options": {
                 "filename": "text.onnx",
-                "prune_lm_head": true
+                "prune_lm_head": true,
+                "quant_mode": "int4"
             }
         }
     },
diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json
index 47aef6e3..6e79cba5 100644
--- a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json
+++ b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json
@@ -8,9 +8,12 @@
             "type": "ModelBuilder",
             "precision": "int4",
             "int4_accuracy_level": 4,
+            "int4_block_size": 32,
+            "int4_algo_config": "rtn",
             "extra_options": {
                 "filename": "text.onnx",
-                "prune_lm_head": true
+                "prune_lm_head": true,
+                "quant_mode": "int4"
             }
         }
     },
diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json
index 38808e3f..9f8b9488 100644
--- a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json
+++ b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json
@@ -10,7 +10,8 @@
             "int4_accuracy_level": 4,
             "extra_options": {
                 "filename": "text.onnx",
-                "prune_lm_head": true
+                "prune_lm_head": true,
+                "quant_mode": "int4"
             }
         }
     },

From 02903aa8c1810bbffa4beb2b7f33d5104d89729e Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Thu, 23 Apr 2026 22:07:04 +0000
Subject: [PATCH 8/8] Reduce int4_block_size to 32 for Qwen3.5-2B webgpu text recipe

---
 Qwen-Qwen3.5-2B/builtin/webgpu/text.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
index 7e1accfc..a7976f7d 100644
--- a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
+++ b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json
@@ -9,7 +9,7 @@
             "precision": "int4",
             "int4_accuracy_level": 4,
             "int4_algo_config": "k_quant_mixed",
-            "int4_block_size": 128,
+            "int4_block_size": 32,
             "extra_options": {
                 "filename": "text.onnx",
                 "prune_lm_head": true,