Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions Qwen-Qwen3.5-0.8B/builtin/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,18 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
vision_provider_options = [
{"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}}
]
elif device == "webgpu":
provider_options = [{"webgpu": {}}]
vision_provider_options = [{"webgpu": {}}]
else:
provider_options = []
vision_provider_options = []

session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options}
vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options}

config["model"]["decoder"]["session_options"] = session_options

config["model"]["embedding"] = {
"filename": "embedding.onnx",
"inputs": {"input_ids": "input_ids", "image_features": "image_features"},
Expand All @@ -72,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
}

config["model"]["bos_token_id"] = 248044
config["model"]["context_length"] = 4096
config["model"]["eos_token_id"] = [248044]
config["model"]["pad_token_id"] = 248044
config["model"]["image_token_id"] = 248056
config["model"]["video_token_id"] = 248057
config["model"]["vision_start_token_id"] = 248053

config["search"]["max_length"] = 4096
config["search"]["top_k"] = 1
if config["search"].get("top_p") is None:
config["search"]["top_p"] = 1.0
Expand Down Expand Up @@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR):

def main():
parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models")
parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu")
parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu")
parser.add_argument("--config-dir", default="cpu_and_mobile")
parser.add_argument("--skip-export", action="store_true")
parser.add_argument("--models-dir", default=None)
Expand Down
45 changes: 45 additions & 0 deletions Qwen-Qwen3.5-0.8B/builtin/webgpu/embedding.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"input_model": {
"type": "PyTorchModel",
"model_path": "Qwen/Qwen3.5-0.8B",
"model_loader": "get_embedding_model",
"model_script": "user_script.py",
"io_config": "get_embedding_io_config",
"dummy_inputs_func": "get_embedding_dummy_inputs"
},
"passes": {
"convert": {
"type": "OnnxConversion",
"use_dynamo_exporter": false
},
"ort": {
"type": "OrtTransformersOptimization",
"model_type": "",
"opt_level": 1,
"only_onnxruntime": true
},
"cast": {
"type": "OnnxPeepholeOptimizer",
"onnxscript_optimize": false,
"onnxoptimizer_optimize": false,
"fuse_reshape_operations": false,
"fix_com_microsoft_opset": true,
"cast_chain_elimination": true
},
"gemm2mm": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "GemmToMatMulAdd"
}
]
},
"fp16": {
"type": "OnnxFloatToFloat16",
"save_as_external_data": true,
"external_data_name": "embedding.onnx.data"
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/embedding.onnx"
}
20 changes: 20 additions & 0 deletions Qwen-Qwen3.5-0.8B/builtin/webgpu/text.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen3.5-0.8B"
},
"passes": {
"m": {
"type": "ModelBuilder",
"precision": "int4",
"int4_accuracy_level": 4,
"extra_options": {
"filename": "text.onnx",
"prune_lm_head": true,
"quant_mode": "default"
}
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/text.onnx"
}
76 changes: 76 additions & 0 deletions Qwen-Qwen3.5-0.8B/builtin/webgpu/vision.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{
"input_model": {
"type": "PyTorchModel",
"model_path": "Qwen/Qwen3.5-0.8B",
"model_loader": "get_vision_model",
"model_script": "user_script.py",
"io_config": "get_vision_io_config",
"dummy_inputs_func": "get_vision_dummy_inputs"
},
"passes": {
"c": {
"type": "OnnxConversion",
"use_dynamo_exporter": true
},
"gs": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "PackedAttentionToLoopMHA"
},
{
"surgeon": "ReciprocalMulToDiv"
},
{
"surgeon": "RenameOutputDims",
"output_idx": 0,
"dim_idx": 0,
"dim_name": "num_logical_patches"
}
]
},
"ort": {
"type": "OrtTransformersOptimization",
"model_type": "vit",
"opt_level": 2,
"only_onnxruntime": true
},
"dedup": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "DeduplicateSubgraphInitializers"
}
]
},
"cast": {
"type": "OnnxPeepholeOptimizer",
"onnxscript_optimize": false,
"onnxoptimizer_optimize": false,
"fuse_reshape_operations": false,
"fix_com_microsoft_opset": true,
"cast_chain_elimination": true
},
"fp16": {
"type": "OnnxFloatToFloat16",
"op_block_list": [
"LayerNormalization",
"Range"
],
"save_as_external_data": true,
"external_data_name": "vision.onnx.data"
},
"cleanup": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "DeduplicateNodes"
}
],
"save_as_external_data": true,
"external_data_name": "vision.onnx.data"
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/vision.onnx"
}
9 changes: 6 additions & 3 deletions Qwen-Qwen3.5-2B/builtin/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,18 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
vision_provider_options = [
{"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}}
]
elif device == "webgpu":
provider_options = [{"webgpu": {}}]
vision_provider_options = [{"webgpu": {}}]
else:
provider_options = []
vision_provider_options = []

session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options}
vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options}

config["model"]["decoder"]["session_options"] = session_options

config["model"]["embedding"] = {
"filename": "embedding.onnx",
"inputs": {"input_ids": "input_ids", "image_features": "image_features"},
Expand All @@ -72,14 +77,12 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
}

config["model"]["bos_token_id"] = 248044
config["model"]["context_length"] = 4096
config["model"]["eos_token_id"] = [248044]
config["model"]["pad_token_id"] = 248044
config["model"]["image_token_id"] = 248056
config["model"]["video_token_id"] = 248057
config["model"]["vision_start_token_id"] = 248053

config["search"]["max_length"] = 4096
config["search"]["top_k"] = 1
if config["search"].get("top_p") is None:
config["search"]["top_p"] = 1.0
Expand Down Expand Up @@ -145,7 +148,7 @@ def fix_tokenizer(output_dir: str = MODELS_DIR):

def main():
parser = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models")
parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu")
parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu")
parser.add_argument("--config-dir", default="cpu_and_mobile")
parser.add_argument("--skip-export", action="store_true")
parser.add_argument("--models-dir", default=None)
Expand Down
45 changes: 45 additions & 0 deletions Qwen-Qwen3.5-2B/builtin/webgpu/embedding.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"input_model": {
"type": "PyTorchModel",
"model_path": "Qwen/Qwen3.5-2B",
"model_loader": "get_embedding_model",
"model_script": "user_script.py",
"io_config": "get_embedding_io_config",
"dummy_inputs_func": "get_embedding_dummy_inputs"
},
"passes": {
"convert": {
"type": "OnnxConversion",
"use_dynamo_exporter": false
},
"ort": {
"type": "OrtTransformersOptimization",
"model_type": "",
"opt_level": 1,
"only_onnxruntime": true
},
"cast": {
"type": "OnnxPeepholeOptimizer",
"onnxscript_optimize": false,
"onnxoptimizer_optimize": false,
"fuse_reshape_operations": false,
"fix_com_microsoft_opset": true,
"cast_chain_elimination": true
},
"gemm2mm": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "GemmToMatMulAdd"
}
]
},
"fp16": {
"type": "OnnxFloatToFloat16",
"save_as_external_data": true,
"external_data_name": "embedding.onnx.data"
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/embedding.onnx"
}
22 changes: 22 additions & 0 deletions Qwen-Qwen3.5-2B/builtin/webgpu/text.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen3.5-2B"
},
"passes": {
"m": {
"type": "ModelBuilder",
"precision": "int4",
"int4_accuracy_level": 4,
"int4_algo_config": "k_quant_mixed",
"int4_block_size": 128,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

webgpu needs bs=32

"extra_options": {
"filename": "text.onnx",
"prune_lm_head": true,
"quant_mode": "int4"
}
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/text.onnx"
}
76 changes: 76 additions & 0 deletions Qwen-Qwen3.5-2B/builtin/webgpu/vision.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{
"input_model": {
"type": "PyTorchModel",
"model_path": "Qwen/Qwen3.5-2B",
"model_loader": "get_vision_model",
"model_script": "user_script.py",
"io_config": "get_vision_io_config",
"dummy_inputs_func": "get_vision_dummy_inputs"
},
"passes": {
"c": {
"type": "OnnxConversion",
"use_dynamo_exporter": true
},
"gs": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "PackedAttentionToLoopMHA"
},
{
"surgeon": "ReciprocalMulToDiv"
},
{
"surgeon": "RenameOutputDims",
"output_idx": 0,
"dim_idx": 0,
"dim_name": "num_logical_patches"
}
]
},
"ort": {
"type": "OrtTransformersOptimization",
"model_type": "vit",
"opt_level": 2,
"only_onnxruntime": true
},
"dedup": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "DeduplicateSubgraphInitializers"
}
]
},
"cast": {
"type": "OnnxPeepholeOptimizer",
"onnxscript_optimize": false,
"onnxoptimizer_optimize": false,
"fuse_reshape_operations": false,
"fix_com_microsoft_opset": true,
"cast_chain_elimination": true
},
"fp16": {
"type": "OnnxFloatToFloat16",
"op_block_list": [
"LayerNormalization",
"Range"
],
"save_as_external_data": true,
"external_data_name": "vision.onnx.data"
},
"cleanup": {
"type": "GraphSurgeries",
"surgeries": [
{
"surgeon": "DeduplicateNodes"
}
],
"save_as_external_data": true,
"external_data_name": "vision.onnx.data"
}
},
"no_artifacts": true,
"output_dir": "webgpu/models/vision.onnx"
}
Loading
Loading