Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions Qwen-Qwen3.5-0.8B/builtin/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,18 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
vision_provider_options = [
{"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}}
]
elif device == "webgpu":
provider_options = [{"webgpu": {}}]
vision_provider_options = [{"webgpu": {}}]
else:
provider_options = []
vision_provider_options = []

session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options}
vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options}

config["model"]["decoder"]["session_options"] = session_options

config["model"]["embedding"] = {
"filename": "embedding.onnx",
"inputs": {"input_ids": "input_ids", "image_features": "image_features"},
Expand All @@ -72,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
}

config["model"]["bos_token_id"] = 248044
config["model"]["context_length"] = 4096
config["model"]["eos_token_id"] = [248044]
config["model"]["pad_token_id"] = 248044
config["model"]["image_token_id"] = 248056
config["model"]["video_token_id"] = 248057
config["model"]["vision_start_token_id"] = 248053

config["search"]["max_length"] = 4096
config["search"]["top_k"] = 1
if config["search"].get("top_p") is None:
config["search"]["top_p"] = 1.0
Expand Down Expand Up @@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR):

def main():
parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models")
parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu")
parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu")
parser.add_argument("--config-dir", default="cpu_and_mobile")
parser.add_argument("--skip-export", action="store_true")
parser.add_argument("--models-dir", default=None)
Expand Down
45 changes: 45 additions & 0 deletions Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"input_model": {
"type": "PyTorchModel",
"model_path": "Qwen/Qwen3.5-0.8B",
"model_loader": "get_embedding_model",
"model_script": "user_script.py",
"io_config": "get_embedding_io_config",
"dummy_inputs_func": "get_embedding_dummy_inputs"
},
"passes": {
"convert": {
"type": "OnnxConversion",
"use_dynamo_exporter": false
},
"ort": {
"type": "OrtTransformersOptimization",
"model_type": "",
"opt_level": 1,
"only_onnxruntime": true
},
"cast": {
"type": "OnnxPeepholeOptimizer",
"onnxscript_optimize": false,
"onnxoptimizer_optimize": false,
"fuse_reshape_operations": false,
"fix_com_microsoft_opset": true,
"cast_chain_elimination": true
},
"gemm2mm": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "GemmToMatMulAdd"
}
]
},
"fp16": {
"type": "OnnxFloatToFloat16",
"save_as_external_data": true,
"external_data_name": "embedding.onnx.data"
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/embedding.onnx"
}
20 changes: 20 additions & 0 deletions Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen3.5-0.8B"
},
"passes": {
"m": {
"type": "ModelBuilder",
"precision": "int4",
"int4_accuracy_level": 4,
"extra_options": {
"filename": "text.onnx",
"prune_lm_head": true,
"quant_mode": "default"
}
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/text.onnx"
}
76 changes: 76 additions & 0 deletions Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{
"input_model": {
"type": "PyTorchModel",
"model_path": "Qwen/Qwen3.5-0.8B",
"model_loader": "get_vision_model",
"model_script": "user_script.py",
"io_config": "get_vision_io_config",
"dummy_inputs_func": "get_vision_dummy_inputs"
},
"passes": {
"c": {
"type": "OnnxConversion",
"use_dynamo_exporter": true
},
"gs": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "PackedAttentionToLoopMHA"
},
{
"surgeon": "ReciprocalMulToDiv"
},
{
"surgeon": "RenameOutputDims",
"output_idx": 0,
"dim_idx": 0,
"dim_name": "num_logical_patches"
}
]
},
"ort": {
"type": "OrtTransformersOptimization",
"model_type": "vit",
"opt_level": 2,
"only_onnxruntime": true
},
"dedup": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "DeduplicateSubgraphInitializers"
}
]
},
"cast": {
"type": "OnnxPeepholeOptimizer",
"onnxscript_optimize": false,
"onnxoptimizer_optimize": false,
"fuse_reshape_operations": false,
"fix_com_microsoft_opset": true,
"cast_chain_elimination": true
},
"fp16": {
"type": "OnnxFloatToFloat16",
"op_block_list": [
"LayerNormalization",
"Range"
],
"save_as_external_data": true,
"external_data_name": "vision.onnx.data"
},
"cleanup": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "DeduplicateNodes"
}
],
"save_as_external_data": true,
"external_data_name": "vision.onnx.data"
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/vision.onnx"
}
9 changes: 6 additions & 3 deletions Qwen-Qwen3.5-2B/builtin/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,18 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
vision_provider_options = [
{"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}}
]
elif device == "webgpu":
provider_options = [{"webgpu": {}}]
vision_provider_options = [{"webgpu": {}}]
else:
provider_options = []
vision_provider_options = []

session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options}
vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options}

config["model"]["decoder"]["session_options"] = session_options

config["model"]["embedding"] = {
"filename": "embedding.onnx",
"inputs": {"input_ids": "input_ids", "image_features": "image_features"},
Expand All @@ -72,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
}

config["model"]["bos_token_id"] = 248044
config["model"]["context_length"] = 4096
config["model"]["eos_token_id"] = [248044]
config["model"]["pad_token_id"] = 248044
config["model"]["image_token_id"] = 248056
config["model"]["video_token_id"] = 248057
config["model"]["vision_start_token_id"] = 248053

config["search"]["max_length"] = 4096
config["search"]["top_k"] = 1
if config["search"].get("top_p") is None:
config["search"]["top_p"] = 1.0
Expand Down Expand Up @@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR):

def main():
parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models")
parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu")
parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu")
parser.add_argument("--config-dir", default="cpu_and_mobile")
parser.add_argument("--skip-export", action="store_true")
parser.add_argument("--models-dir", default=None)
Expand Down
45 changes: 45 additions & 0 deletions Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"input_model": {
"type": "PyTorchModel",
"model_path": "Qwen/Qwen3.5-2B",
"model_loader": "get_embedding_model",
"model_script": "user_script.py",
"io_config": "get_embedding_io_config",
"dummy_inputs_func": "get_embedding_dummy_inputs"
},
"passes": {
"convert": {
"type": "OnnxConversion",
"use_dynamo_exporter": false
},
"ort": {
"type": "OrtTransformersOptimization",
"model_type": "",
"opt_level": 1,
"only_onnxruntime": true
},
"cast": {
"type": "OnnxPeepholeOptimizer",
"onnxscript_optimize": false,
"onnxoptimizer_optimize": false,
"fuse_reshape_operations": false,
"fix_com_microsoft_opset": true,
"cast_chain_elimination": true
},
"gemm2mm": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "GemmToMatMulAdd"
}
]
},
"fp16": {
"type": "OnnxFloatToFloat16",
"save_as_external_data": true,
"external_data_name": "embedding.onnx.data"
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/embedding.onnx"
}
22 changes: 22 additions & 0 deletions Qwen-Qwen3.5-2B/builtin/webgpu/text.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen3.5-2B"
},
"passes": {
"m": {
"type": "ModelBuilder",
"precision": "int4",
"int4_accuracy_level": 4,
"int4_algo_config": "k_quant_mixed",
"int4_block_size": 128,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

webgpu needs bs=32

"extra_options": {
"filename": "text.onnx",
"prune_lm_head": true,
"quant_mode": "int4"
}
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/text.onnx"
}
76 changes: 76 additions & 0 deletions Qwen-Qwen3.5-2B/builtin/webgpu/vision.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{
"input_model": {
"type": "PyTorchModel",
"model_path": "Qwen/Qwen3.5-2B",
"model_loader": "get_vision_model",
"model_script": "user_script.py",
"io_config": "get_vision_io_config",
"dummy_inputs_func": "get_vision_dummy_inputs"
},
"passes": {
"c": {
"type": "OnnxConversion",
"use_dynamo_exporter": true
},
"gs": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "PackedAttentionToLoopMHA"
},
{
"surgeon": "ReciprocalMulToDiv"
},
{
"surgeon": "RenameOutputDims",
"output_idx": 0,
"dim_idx": 0,
"dim_name": "num_logical_patches"
}
]
},
"ort": {
"type": "OrtTransformersOptimization",
"model_type": "vit",
"opt_level": 2,
"only_onnxruntime": true
},
"dedup": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "DeduplicateSubgraphInitializers"
}
]
},
"cast": {
"type": "OnnxPeepholeOptimizer",
"onnxscript_optimize": false,
"onnxoptimizer_optimize": false,
"fuse_reshape_operations": false,
"fix_com_microsoft_opset": true,
"cast_chain_elimination": true
},
"fp16": {
"type": "OnnxFloatToFloat16",
"op_block_list": [
"LayerNormalization",
"Range"
],
"save_as_external_data": true,
"external_data_name": "vision.onnx.data"
},
"cleanup": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "DeduplicateNodes"
}
],
"save_as_external_data": true,
"external_data_name": "vision.onnx.data"
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/vision.onnx"
}
Loading
Loading