diff --git a/Qwen-Qwen3.5-0.8B/builtin/optimize.py b/Qwen-Qwen3.5-0.8B/builtin/optimize.py index aab36840..d86fee7b 100644 --- a/Qwen-Qwen3.5-0.8B/builtin/optimize.py +++ b/Qwen-Qwen3.5-0.8B/builtin/optimize.py @@ -46,12 +46,17 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): vision_provider_options = [ {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}} ] + elif device == "webgpu": + provider_options = [{"webgpu": {}}] + vision_provider_options = [{"webgpu": {}}] else: provider_options = [] vision_provider_options = [] vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options} + config["model"]["decoder"]["session_options"] = session_options + config["model"]["embedding"] = { "filename": "embedding.onnx", "inputs": {"input_ids": "input_ids", "image_features": "image_features"}, @@ -142,7 +147,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR): def main(): parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models") - parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu") + parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu") parser.add_argument("--config-dir", default="cpu_and_mobile") parser.add_argument("--skip-export", action="store_true") parser.add_argument("--models-dir", default=None) diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json new file mode 100644 index 00000000..291aa464 --- /dev/null +++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json @@ -0,0 +1,45 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-0.8B", + "model_loader": "get_embedding_model", + "model_script": "user_script.py", + "io_config": "get_embedding_io_config", + "dummy_inputs_func": "get_embedding_dummy_inputs" + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "use_dynamo_exporter": false + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "", + "opt_level": 1, + "only_onnxruntime": true + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "gemm2mm": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "GemmToMatMulAdd" + } + ] + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true, + "external_data_name": "embedding.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/embedding.onnx" +} diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json new file mode 100644 index 00000000..de121e31 --- /dev/null +++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json @@ -0,0 +1,20 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-0.8B" + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "extra_options": { + "filename": "text.onnx", + "prune_lm_head": true, + "quant_mode": "default" + } + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/text.onnx" +} diff --git a/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json new file mode 100644 index 00000000..6e6ef5fc --- /dev/null +++ b/Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json @@ -0,0 +1,76 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-0.8B", + "model_loader": "get_vision_model", + "model_script": "user_script.py", + "io_config": "get_vision_io_config", + "dummy_inputs_func": "get_vision_dummy_inputs" + }, + "passes": { + "c": { + "type": "OnnxConversion", + "use_dynamo_exporter": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "PackedAttentionToLoopMHA" + }, + { + "surgeon": "ReciprocalMulToDiv" + }, + { + "surgeon": "RenameOutputDims", + "output_idx": 0, + "dim_idx": 0, + "dim_name": "num_logical_patches" + } + ] + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "vit", + "opt_level": 2, + "only_onnxruntime": true + }, + "dedup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateSubgraphInitializers" + } + ] + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "op_block_list": [ + "LayerNormalization", + "Range" + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + }, + "cleanup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateNodes" + } + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/vision.onnx" +} diff --git a/Qwen-Qwen3.5-2B/builtin/optimize.py b/Qwen-Qwen3.5-2B/builtin/optimize.py index aab36840..d86fee7b 100644 --- a/Qwen-Qwen3.5-2B/builtin/optimize.py +++ b/Qwen-Qwen3.5-2B/builtin/optimize.py @@ -46,12 +46,17 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): vision_provider_options = [ {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}} ] + elif device == "webgpu": + provider_options = [{"webgpu": {}}] + vision_provider_options = [{"webgpu": {}}] else: provider_options = [] vision_provider_options = [] vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options} + config["model"]["decoder"]["session_options"] = session_options + config["model"]["embedding"] = { "filename": "embedding.onnx", "inputs": {"input_ids": "input_ids", "image_features": "image_features"}, @@ -142,7 +147,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR): def main(): parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models") - parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu") + parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu") parser.add_argument("--config-dir", default="cpu_and_mobile") parser.add_argument("--skip-export", action="store_true") parser.add_argument("--models-dir", default=None) diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json new file mode 100644 index 00000000..4490a781 --- /dev/null +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json @@ -0,0 +1,45 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-2B", + "model_loader": "get_embedding_model", + "model_script": "user_script.py", + "io_config": "get_embedding_io_config", + "dummy_inputs_func": "get_embedding_dummy_inputs" + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "use_dynamo_exporter": false + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "", + "opt_level": 1, + "only_onnxruntime": true + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "gemm2mm": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "GemmToMatMulAdd" + } + ] + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true, + "external_data_name": "embedding.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/embedding.onnx" +} diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/text.json b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json new file mode 100644 index 00000000..a7976f7d --- /dev/null +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/text.json @@ -0,0 +1,22 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-2B" + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "int4_algo_config": "k_quant_mixed", + "int4_block_size": 32, + "extra_options": { + "filename": "text.onnx", + "prune_lm_head": true, + "quant_mode": "int4" + } + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/text.onnx" +} diff --git a/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json new file mode 100644 index 00000000..bded6bf3 --- /dev/null +++ b/Qwen-Qwen3.5-2B/builtin/webgpu/vision.json @@ -0,0 +1,76 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-2B", + "model_loader": "get_vision_model", + "model_script": "user_script.py", + "io_config": "get_vision_io_config", + "dummy_inputs_func": "get_vision_dummy_inputs" + }, + "passes": { + "c": { + "type": "OnnxConversion", + "use_dynamo_exporter": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "PackedAttentionToLoopMHA" + }, + { + "surgeon": "ReciprocalMulToDiv" + }, + { + "surgeon": "RenameOutputDims", + "output_idx": 0, + "dim_idx": 0, + "dim_name": "num_logical_patches" + } + ] + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "vit", + "opt_level": 2, + "only_onnxruntime": true + }, + "dedup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateSubgraphInitializers" + } + ] + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "op_block_list": [ + "LayerNormalization", + "Range" + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + }, + "cleanup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateNodes" + } + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/vision.onnx" +} diff --git a/Qwen-Qwen3.5-4B/builtin/optimize.py b/Qwen-Qwen3.5-4B/builtin/optimize.py index aab36840..d86fee7b 100644 --- a/Qwen-Qwen3.5-4B/builtin/optimize.py +++ b/Qwen-Qwen3.5-4B/builtin/optimize.py @@ -46,12 +46,17 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): vision_provider_options = [ {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}} ] + elif device == "webgpu": + provider_options = [{"webgpu": {}}] + vision_provider_options = [{"webgpu": {}}] else: provider_options = [] vision_provider_options = [] vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options} + config["model"]["decoder"]["session_options"] = session_options + config["model"]["embedding"] = { "filename": "embedding.onnx", "inputs": {"input_ids": "input_ids", "image_features": "image_features"}, @@ -142,7 +147,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR): def main(): parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models") - parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu") + parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu") parser.add_argument("--config-dir", default="cpu_and_mobile") parser.add_argument("--skip-export", action="store_true") parser.add_argument("--models-dir", default=None) diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json new file mode 100644 index 00000000..97e64916 --- /dev/null +++ b/Qwen-Qwen3.5-4B/builtin/webgpu/embedding.json @@ -0,0 +1,45 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-4B", + "model_loader": "get_embedding_model", + "model_script": "user_script.py", + "io_config": "get_embedding_io_config", + "dummy_inputs_func": "get_embedding_dummy_inputs" + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "use_dynamo_exporter": false + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "", + "opt_level": 1, + "only_onnxruntime": true + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "gemm2mm": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "GemmToMatMulAdd" + } + ] + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true, + "external_data_name": "embedding.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/embedding.onnx" +} diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/text.json b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json new file mode 100644 index 00000000..6e79cba5 --- /dev/null +++ b/Qwen-Qwen3.5-4B/builtin/webgpu/text.json @@ -0,0 +1,22 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-4B" + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "int4_block_size": 32, + "int4_algo_config": "rtn", + "extra_options": { + "filename": "text.onnx", + "prune_lm_head": true, + "quant_mode": "int4" + } + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/text.onnx" +} diff --git a/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json new file mode 100644 index 00000000..1389d3da --- /dev/null +++ b/Qwen-Qwen3.5-4B/builtin/webgpu/vision.json @@ -0,0 +1,76 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-4B", + "model_loader": "get_vision_model", + "model_script": "user_script.py", + "io_config": "get_vision_io_config", + "dummy_inputs_func": "get_vision_dummy_inputs" + }, + "passes": { + "c": { + "type": "OnnxConversion", + "use_dynamo_exporter": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "PackedAttentionToLoopMHA" + }, + { + "surgeon": "ReciprocalMulToDiv" + }, + { + "surgeon": "RenameOutputDims", + "output_idx": 0, + "dim_idx": 0, + "dim_name": "num_logical_patches" + } + ] + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "vit", + "opt_level": 2, + "only_onnxruntime": true + }, + "dedup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateSubgraphInitializers" + } + ] + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "op_block_list": [ + "LayerNormalization", + "Range" + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + }, + "cleanup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateNodes" + } + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/vision.onnx" +} diff --git a/Qwen-Qwen3.5-9B/builtin/optimize.py b/Qwen-Qwen3.5-9B/builtin/optimize.py index aab36840..d86fee7b 100644 --- a/Qwen-Qwen3.5-9B/builtin/optimize.py +++ b/Qwen-Qwen3.5-9B/builtin/optimize.py @@ -46,12 +46,17 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): vision_provider_options = [ {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}} ] + elif device == "webgpu": + provider_options = [{"webgpu": {}}] + vision_provider_options = [{"webgpu": {}}] else: provider_options = [] vision_provider_options = [] vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options} + config["model"]["decoder"]["session_options"] = session_options + config["model"]["embedding"] = { "filename": "embedding.onnx", "inputs": {"input_ids": "input_ids", "image_features": "image_features"}, @@ -142,7 +147,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR): def main(): parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models") - parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu") + parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu") parser.add_argument("--config-dir", default="cpu_and_mobile") parser.add_argument("--skip-export", action="store_true") parser.add_argument("--models-dir", default=None) diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json b/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json new file mode 100644 index 00000000..0389dc30 --- /dev/null +++ b/Qwen-Qwen3.5-9B/builtin/webgpu/embedding.json @@ -0,0 +1,45 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-9B", + "model_loader": "get_embedding_model", + "model_script": "user_script.py", + "io_config": "get_embedding_io_config", + "dummy_inputs_func": "get_embedding_dummy_inputs" + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "use_dynamo_exporter": false + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "", + "opt_level": 1, + "only_onnxruntime": true + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "gemm2mm": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "GemmToMatMulAdd" + } + ] + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true, + "external_data_name": "embedding.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/embedding.onnx" +} diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/text.json b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json new file mode 100644 index 00000000..9f8b9488 --- /dev/null +++ b/Qwen-Qwen3.5-9B/builtin/webgpu/text.json @@ -0,0 +1,20 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen3.5-9B" + }, + "passes": { + "m": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "extra_options": { + "filename": "text.onnx", + "prune_lm_head": true, + "quant_mode": "int4" + } + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/text.onnx" +} diff --git a/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json b/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json new file mode 100644 index 00000000..dfe0772f --- /dev/null +++ b/Qwen-Qwen3.5-9B/builtin/webgpu/vision.json @@ -0,0 +1,76 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "Qwen/Qwen3.5-9B", + "model_loader": "get_vision_model", + "model_script": "user_script.py", + "io_config": "get_vision_io_config", + "dummy_inputs_func": "get_vision_dummy_inputs" + }, + "passes": { + "c": { + "type": "OnnxConversion", + "use_dynamo_exporter": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "PackedAttentionToLoopMHA" + }, + { + "surgeon": "ReciprocalMulToDiv" + }, + { + "surgeon": "RenameOutputDims", + "output_idx": 0, + "dim_idx": 0, + "dim_name": "num_logical_patches" + } + ] + }, + "ort": { + "type": "OrtTransformersOptimization", + "model_type": "vit", + "opt_level": 2, + "only_onnxruntime": true + }, + "dedup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateSubgraphInitializers" + } + ] + }, + "cast": { + "type": "OnnxPeepholeOptimizer", + "onnxscript_optimize": false, + "onnxoptimizer_optimize": false, + "fuse_reshape_operations": false, + "fix_com_microsoft_opset": true, + "cast_chain_elimination": true + }, + "fp16": { + "type": "OnnxFloatToFloat16", + "op_block_list": [ + "LayerNormalization", + "Range" + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + }, + "cleanup": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "DeduplicateNodes" + } + ], + "save_as_external_data": true, + "external_data_name": "vision.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/vision.onnx" +}