From ef09658bfb0650094ef499e889fd195cde8d61c9 Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Thu, 23 Apr 2026 11:19:30 +0000 Subject: [PATCH 1/7] fix: auto-upgrade model opset to 21 for int16/uint16 QDQ quantization The update_opset_version helper already auto-bumps opset to 19 when float8 quantization is requested on older models. Extend the same pattern to int16/uint16: when the user requests QUInt16 or QInt16 weight quantization and the model's opset is below 21, bump to 21 so that native ONNX QuantizeLinear/DequantizeLinear can be emitted instead of silently falling back to the com.microsoft contrib domain. Fixes #25223 --- .../python/tools/quantization/quant_utils.py | 9 +++++++ .../python/quantization/test_quant_util.py | 27 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 0ce1e1a0d75de..9b07b7bdf8e22 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -978,6 +978,15 @@ def update_opset_version(model: ModelProto, weight_type: QuantType) -> ModelProt ) target_opset_version = 19 + elif opset_version < 21 and weight_quant_type in (onnx.TensorProto.UINT16, onnx.TensorProto.INT16): + logging.warning( + f"The original model opset version is {opset_version}, which does not support 16-bit integer " + "quantization with native ONNX QuantizeLinear/DequantizeLinear. " + "Please update the model to opset >= 21. Automatically update the model to opset 21. " + "Please verify the quantized model." + ) + target_opset_version = 21 + elif opset_version == 10: logging.warning( f"The original model opset version is {opset_version}, which does not support node fusions. " diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py index 468f97c980ad8..12f2afb221005 100644 --- a/onnxruntime/test/python/quantization/test_quant_util.py +++ b/onnxruntime/test/python/quantization/test_quant_util.py @@ -15,11 +15,13 @@ from onnx import TensorProto, helper, numpy_helper from onnxruntime.quantization.quant_utils import ( + QuantType, compute_scale_zp, load_model_with_shape_infer, model_has_infer_metadata, pack_bytes_to_4bit, quantize_data, + update_opset_version, ) @@ -173,6 +175,31 @@ def test_quantize_data_4bit(self): self.assertEqual(numpy.array(actual_quant_val), expected_quant_val) + def test_update_opset_version_16bit(self): + graph = helper.make_graph([], "test_graph", [], []) + + # 16-bit types should auto-bump opset < 21 -> 21 + for weight_type, label in ( + (QuantType.QUInt16, "QUInt16"), + (QuantType.QInt16, "QInt16"), + ): + with self.subTest(weight_type=label, opset=20): + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)]) + result = update_opset_version(model, weight_type) + result_opset = result.opset_import[0].version + self.assertEqual(result_opset, 21) + + # Already at opset 21 - should stay at 21 + for weight_type, label in ( + (QuantType.QUInt16, "QUInt16"), + (QuantType.QInt16, "QInt16"), + ): + with self.subTest(weight_type=label, opset=21): + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 21)]) + result = update_opset_version(model, weight_type) + result_opset = result.opset_import[0].version + self.assertEqual(result_opset, 21) + if __name__ == "__main__": unittest.main() From 52218ce7afee1b9e4f7bd360ce6aaf889042d1bb Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Fri, 1 May 2026 00:54:48 +0000 Subject: [PATCH 2/7] fix(quantization): bump opset for int16 activations too update_opset_version previously only inspected weight_type, so a config like activation_type=QInt16 with weight_type=QInt8 would not trigger the opset>=21 bump and could produce a model with int16 Q/DQ on opset<21. Extend the helper to accept activation_type and bump when either is INT16/UINT16. Update the quantize_static call site and add subtests covering 16-bit-activation-only, 16-bit-weight-only, both-8bit, and backward-compat (single-arg call) cases. --- .../python/tools/quantization/quant_utils.py | 12 +++++++++-- .../python/tools/quantization/quantize.py | 2 +- .../python/quantization/test_quant_util.py | 20 ++++++++++++++++++- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 9b07b7bdf8e22..73c9e322297cd 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -965,10 +965,18 @@ def get_opset_version(model: ModelProto) -> int: return opset_version -def update_opset_version(model: ModelProto, weight_type: QuantType) -> ModelProto: +def update_opset_version( + model: ModelProto, weight_type: QuantType, activation_type: QuantType | None = None +) -> ModelProto: opset_version = get_opset_version(model) target_opset_version = opset_version weight_quant_type = getattr(weight_type, "tensor_type", weight_type) + activation_quant_type = ( + getattr(activation_type, "tensor_type", activation_type) if activation_type is not None else None + ) + + _int16_types = (onnx.TensorProto.UINT16, onnx.TensorProto.INT16) + needs_opset21_for_16bit = weight_quant_type in _int16_types or activation_quant_type in _int16_types if opset_version < 19 and weight_quant_type == onnx.TensorProto.FLOAT8E4M3FN: logging.warning( @@ -978,7 +986,7 @@ def update_opset_version(model: ModelProto, weight_type: QuantType) -> ModelProt ) target_opset_version = 19 - elif opset_version < 21 and weight_quant_type in (onnx.TensorProto.UINT16, onnx.TensorProto.INT16): + elif opset_version < 21 and needs_opset21_for_16bit: logging.warning( f"The original model opset version is {opset_version}, which does not support 16-bit integer " "quantization with native ONNX QuantizeLinear/DequantizeLinear. " diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index b8b239b85e7ad..a3aab06c5a935 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -699,7 +699,7 @@ def inc_dataloader(): nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes]) model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration - updated_model = update_opset_version(model, weight_type) + updated_model = update_opset_version(model, weight_type, activation_type) is_model_updated = updated_model is not model if is_model_updated: model = updated_model diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py index 12f2afb221005..d17a9f8eaf457 100644 --- a/onnxruntime/test/python/quantization/test_quant_util.py +++ b/onnxruntime/test/python/quantization/test_quant_util.py @@ -178,7 +178,7 @@ def test_quantize_data_4bit(self): def test_update_opset_version_16bit(self): graph = helper.make_graph([], "test_graph", [], []) - # 16-bit types should auto-bump opset < 21 -> 21 + # 16-bit weight type alone should auto-bump opset < 21 -> 21 for weight_type, label in ( (QuantType.QUInt16, "QUInt16"), (QuantType.QInt16, "QInt16"), @@ -200,6 +200,24 @@ def test_update_opset_version_16bit(self): result_opset = result.opset_import[0].version self.assertEqual(result_opset, 21) + # 16-bit activation type with 8-bit weight should also bump opset < 21 -> 21 + for activation_type, label in ( + (QuantType.QUInt16, "QUInt16"), + (QuantType.QInt16, "QInt16"), + ): + with self.subTest(activation_type=label, weight_type="QInt8", opset=20): + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)]) + result = update_opset_version(model, QuantType.QInt8, activation_type) + result_opset = result.opset_import[0].version + self.assertEqual(result_opset, 21) + + # Both 8-bit should NOT bump to 21 + with self.subTest(weight_type="QInt8", activation_type="QUInt8", opset=20): + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)]) + result = update_opset_version(model, QuantType.QInt8, QuantType.QUInt8) + result_opset = result.opset_import[0].version + self.assertNotEqual(result_opset, 21) + if __name__ == "__main__": unittest.main() From ef62c23adc3da9c5ba528ff858c54a88d7e9daf7 Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Sun, 3 May 2026 11:51:51 +0000 Subject: [PATCH 3/7] fix(quantization): do not pre-set UseQDQContribOps for int16 types in get_qdq_config get_qdq_config() was auto-setting extra_options["UseQDQContribOps"] = True whenever activation_type or weight_type was INT16/UINT16 and the model opset was < 21. This caused the config-based quantize(..., StaticQuantConfig) path to emit com.microsoft Q/DQ ops even after quantize_static() bumped the model to opset 21, where native ONNX QuantizeLinear/DequantizeLinear supports INT16/UINT16 natively. Narrow the condition so that UseQDQContribOps is only auto-set for 4-bit types (which have no opset bump) and for tensor-override-based types; 16-bit top-level weight/activation types are excluded because the opset-21 bump in quantize_static() already handles them. An explicit user-supplied UseQDQContribOps in extra_options still takes precedence via the existing override merge. Update test_get_qdq_config.py: rename and fix the int16-opset19 subtest to assert the new correct behavior (no contrib-ops flag), and add an end-to-end test that verifies the config path produces an opset-21 model with native-domain Q/DQ nodes. Tighten the existing no-op subtest in test_quant_util.py from assertNotEqual to assertEqual(result_opset, 20) for a stricter regression guard. --- .../python/tools/quantization/quantize.py | 10 +++- .../quantization/test_get_qdq_config.py | 56 +++++++++++++++++-- .../python/quantization/test_quant_util.py | 4 +- 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index a3aab06c5a935..45c8a3005bceb 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -369,13 +369,17 @@ def get_qdq_config( } final_extra_options.update(calib_extra_options) - # ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain - # on Q/DQ operators if using 16-bit or 4-bit quantization. + # ONNX opset < 21 does not support 4-bit quantization natively, so must use 'com.microsoft' domain + # on Q/DQ operators if using 4-bit quantization. 16-bit weight/activation types are excluded here + # because quantize_static() will automatically bump the model opset to 21, where native ONNX + # QuantizeLinear/DequantizeLinear supports INT16/UINT16 without contrib-domain ops. onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx") if onnx_opset.version < 21: opset21_types = q16_types.union(q4_types) overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types()) - if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types: + # Only set UseQDQContribOps for 4-bit types; 16-bit types are handled by the opset bump. + needs_contrib_ops = activation_type in q4_types or weight_type in q4_types or overrides_have_opset21_types + if needs_contrib_ops: final_extra_options["UseQDQContribOps"] = True # Allow user's extra_options to override our final_extra_options. diff --git a/onnxruntime/test/python/quantization/test_get_qdq_config.py b/onnxruntime/test/python/quantization/test_get_qdq_config.py index 4a71b3694822c..317b9c4b153a1 100644 --- a/onnxruntime/test/python/quantization/test_get_qdq_config.py +++ b/onnxruntime/test/python/quantization/test_get_qdq_config.py @@ -271,10 +271,12 @@ def test_external_data(self): self.assertIsNotNone(weight_quantized) self.assertEqual(weight_quantized.data_location, onnx.TensorProto.EXTERNAL) - def test_use_qdq_contrib_ops_for_int16_opset19(self): + def test_no_qdq_contrib_ops_for_int16_opset_lt21(self): """ - Test that get_qdq_config() returns a config that forces 'com.microsoft' Q/DQ ops for - use of int16 in opset < 21. + Test that get_qdq_config() does NOT set UseQDQContribOps for int16 types even when + the model opset is < 21. quantize_static() will bump the opset to 21 automatically, + where native ONNX QuantizeLinear/DequantizeLinear supports INT16/UINT16, so contrib- + domain ops are not needed. """ shape = [1, 8, 8] @@ -297,7 +299,53 @@ def test_use_qdq_contrib_ops_for_int16_opset19(self): ) self.assertEqual(qdq_config.activation_type, QuantType.QUInt16) - self.assertTrue(qdq_config.extra_options["UseQDQContribOps"]) + # UseQDQContribOps must NOT be auto-set for 16-bit types; the opset bump handles them. + self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False)) + + def test_quantize_via_config_int16_opset_lt21_uses_native_qdq(self): + """ + Test that the config-based quantize() path produces a model at opset 21 using native + ONNX QuantizeLinear/DequantizeLinear (not com.microsoft domain) when int16 activation + types are requested on a model whose original opset is < 21. + """ + + shape = [1, 8, 8] + tensor_type = onnx.TensorProto.FLOAT + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type) + weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight") + # Build a model at opset 20 (< 21) with int16 activation type + float_model = self.build_add_model(shape, tensor_type, weight, opset=20) + + input_data_list = [ + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + qdq_config = get_qdq_config( + float_model, + data_reader, + activation_type=QuantType.QUInt16, + weight_type=QuantType.QInt8, + ) + + qdq_model_path = os.path.join(self._tmp_dir_path, "add_int16_opset20_qdq.onnx") + quantize(float_model, qdq_model_path, qdq_config) + + qdq_model = onnx.load_model(qdq_model_path) + + # The quantized model must have been bumped to opset 21. + onnx_opset = next(x for x in qdq_model.opset_import if not x.domain or x.domain == "ai.onnx") + self.assertEqual(onnx_opset.version, 21) + + # All Q/DQ nodes must use the default ONNX domain (not com.microsoft). + for node in qdq_model.graph.node: + if node.op_type in ("QuantizeLinear", "DequantizeLinear"): + self.assertEqual( + node.domain, + "", + f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'", + ) def test_use_qdq_contrib_ops_for_int4_opset19(self): """ diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py index d17a9f8eaf457..16645c3b8a5d7 100644 --- a/onnxruntime/test/python/quantization/test_quant_util.py +++ b/onnxruntime/test/python/quantization/test_quant_util.py @@ -211,12 +211,12 @@ def test_update_opset_version_16bit(self): result_opset = result.opset_import[0].version self.assertEqual(result_opset, 21) - # Both 8-bit should NOT bump to 21 + # Both 8-bit should NOT bump to 21; opset stays at 20 with self.subTest(weight_type="QInt8", activation_type="QUInt8", opset=20): model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)]) result = update_opset_version(model, QuantType.QInt8, QuantType.QUInt8) result_opset = result.opset_import[0].version - self.assertNotEqual(result_opset, 21) + self.assertEqual(result_opset, 20) if __name__ == "__main__": From 1d4160a7409d64595654f55fee13625ec1525e16 Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Mon, 4 May 2026 11:50:22 +0000 Subject: [PATCH 4/7] fix(quantization): scan TensorQuantOverrides for 16-bit and recompute UseQDQContribOps after opset bump - Extend opset-21 bump helper to inspect TensorQuantOverrides (including per-tensor convert.quant_type) for QInt16/QUInt16, so models with default 8-bit base types but 16-bit overrides also get the native opset-21 path. - Generalize the opset-bump warning text so it is accurate for both QDQ static and quantize_dynamic flows. - Recompute UseQDQContribOps after the opset bump so 16-bit/4-bit overrides no longer latch the model to com.microsoft Q/DQ post-bump. - Add regression tests for opset<21 + 16-bit overrides and mixed 16-bit/4-bit overrides via TensorQuantOverrides. --- .../python/tools/quantization/quant_utils.py | 25 +++++- .../python/tools/quantization/quantize.py | 27 ++++-- .../quantization/test_get_qdq_config.py | 88 +++++++++++++++++++ 3 files changed, 130 insertions(+), 10 deletions(-) diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 73c9e322297cd..3bc39ed0890b8 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -966,7 +966,10 @@ def get_opset_version(model: ModelProto) -> int: def update_opset_version( - model: ModelProto, weight_type: QuantType, activation_type: QuantType | None = None + model: ModelProto, + weight_type: QuantType, + activation_type: QuantType | None = None, + tensor_quant_overrides: dict | None = None, ) -> ModelProto: opset_version = get_opset_version(model) target_opset_version = opset_version @@ -978,6 +981,24 @@ def update_opset_version( _int16_types = (onnx.TensorProto.UINT16, onnx.TensorProto.INT16) needs_opset21_for_16bit = weight_quant_type in _int16_types or activation_quant_type in _int16_types + # Also check TensorQuantOverrides for any 16-bit types, including per-override convert.quant_type. + if not needs_opset21_for_16bit and tensor_quant_overrides: + _int16_quant_types = {QuantType.QInt16, QuantType.QUInt16} + for overrides_list in tensor_quant_overrides.values(): + for override in overrides_list: + qt = override.get("quant_type") + if qt in _int16_quant_types: + needs_opset21_for_16bit = True + break + convert = override.get("convert") + if convert is not None: + convert_qt = convert.get("quant_type") + if convert_qt in _int16_quant_types: + needs_opset21_for_16bit = True + break + if needs_opset21_for_16bit: + break + if opset_version < 19 and weight_quant_type == onnx.TensorProto.FLOAT8E4M3FN: logging.warning( f"The original model opset version is {opset_version}, which does not support quantization to float 8. " @@ -989,7 +1010,7 @@ def update_opset_version( elif opset_version < 21 and needs_opset21_for_16bit: logging.warning( f"The original model opset version is {opset_version}, which does not support 16-bit integer " - "quantization with native ONNX QuantizeLinear/DequantizeLinear. " + "quantization natively. " "Please update the model to opset >= 21. Automatically update the model to opset 21. " "Please verify the quantized model." ) diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 45c8a3005bceb..010836900feea 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -372,15 +372,21 @@ def get_qdq_config( # ONNX opset < 21 does not support 4-bit quantization natively, so must use 'com.microsoft' domain # on Q/DQ operators if using 4-bit quantization. 16-bit weight/activation types are excluded here # because quantize_static() will automatically bump the model opset to 21, where native ONNX - # QuantizeLinear/DequantizeLinear supports INT16/UINT16 without contrib-domain ops. + # QuantizeLinear/DequantizeLinear supports INT16/UINT16 and INT4/UINT4 without contrib-domain ops. + # 16-bit types in TensorQuantOverrides also trigger the same opset bump, so a mixed 16-bit + 4-bit + # override config will be served at opset 21 where neither type needs contrib ops. onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx") if onnx_opset.version < 21: - opset21_types = q16_types.union(q4_types) - overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types()) - # Only set UseQDQContribOps for 4-bit types; 16-bit types are handled by the opset bump. - needs_contrib_ops = activation_type in q4_types or weight_type in q4_types or overrides_have_opset21_types - if needs_contrib_ops: - final_extra_options["UseQDQContribOps"] = True + override_types = overrides_helper.get_quant_types() + overrides_have_16bit = any(t in q16_types for t in override_types) + # If any 16-bit type is present (top-level or override), quantize_static() will bump the + # model to opset 21, making contrib ops unnecessary for all types. + will_bump_to_opset21 = activation_type in q16_types or weight_type in q16_types or overrides_have_16bit + if not will_bump_to_opset21: + overrides_have_q4_types = any(t in q4_types for t in override_types) + needs_contrib_ops = activation_type in q4_types or weight_type in q4_types or overrides_have_q4_types + if needs_contrib_ops: + final_extra_options["UseQDQContribOps"] = True # Allow user's extra_options to override our final_extra_options. if extra_options: @@ -703,7 +709,12 @@ def inc_dataloader(): nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes]) model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration - updated_model = update_opset_version(model, weight_type, activation_type) + updated_model = update_opset_version( + model, + weight_type, + activation_type, + tensor_quant_overrides=extra_options.get("TensorQuantOverrides"), + ) is_model_updated = updated_model is not model if is_model_updated: model = updated_model diff --git a/onnxruntime/test/python/quantization/test_get_qdq_config.py b/onnxruntime/test/python/quantization/test_get_qdq_config.py index 317b9c4b153a1..9d19e2da53d17 100644 --- a/onnxruntime/test/python/quantization/test_get_qdq_config.py +++ b/onnxruntime/test/python/quantization/test_get_qdq_config.py @@ -377,6 +377,94 @@ def test_use_qdq_contrib_ops_for_int4_opset19(self): self.assertEqual(qdq_config.extra_options["TensorQuantOverrides"]["weight"][0]["quant_type"], QuantType.QInt4) self.assertTrue(qdq_config.extra_options["UseQDQContribOps"]) + def test_overrides_16bit_opset_lt21_bumps_opset_no_contrib_ops(self): + """ + Regression test: when TensorQuantOverrides request a 16-bit type on a model whose opset is + < 21, the quantized model must be bumped to opset 21 and UseQDQContribOps must NOT be set. + """ + + shape = [1, 8, 8] + tensor_type = onnx.TensorProto.FLOAT + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type) + weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight") + # Build a model at opset 18 (< 21) so the opset bump is required. + float_model = self.build_add_model(shape, tensor_type, weight, opset=18) + + input_data_list = [ + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # Override the weight to use QUInt16 via TensorQuantOverrides; top-level types are 8-bit. + qdq_config = get_qdq_config( + float_model, + data_reader, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + tensor_quant_overrides={"weight": [{"quant_type": QuantType.QUInt16}]}, + ) + + # UseQDQContribOps must NOT be set: the 16-bit override triggers an opset bump to 21, + # where native ONNX Q/DQ ops handle all types. + self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False)) + + qdq_model_path = os.path.join(self._tmp_dir_path, "add_override_uint16_opset18_qdq.onnx") + quantize(float_model, qdq_model_path, qdq_config) + + qdq_model = onnx.load_model(qdq_model_path) + + # The quantized model must have been bumped to opset 21. + onnx_opset = next(x for x in qdq_model.opset_import if not x.domain or x.domain == "ai.onnx") + self.assertEqual(onnx_opset.version, 21) + + # All Q/DQ nodes must use the default ONNX domain (not com.microsoft). + for node in qdq_model.graph.node: + if node.op_type in ("QuantizeLinear", "DequantizeLinear"): + self.assertEqual( + node.domain, + "", + f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'", + ) + + def test_overrides_mixed_16bit_4bit_opset_lt21_no_contrib_ops(self): + """ + Regression test: when TensorQuantOverrides contain both a 16-bit type (for one tensor) and + a 4-bit type (for another tensor) on a model whose opset is < 21, UseQDQContribOps must NOT + be set because the 16-bit override triggers an opset bump to 21 where all types are native. + """ + + shape = [1, 8, 8] + tensor_type = onnx.TensorProto.FLOAT + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type) + weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight") + # Build a model at opset 18 (< 21). + float_model = self.build_add_model(shape, tensor_type, weight, opset=18) + + input_data_list = [ + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # Override: weight uses QUInt16 (16-bit, triggers opset bump), input_0 uses QInt4 (4-bit). + # The presence of the 16-bit override means the model is bumped to opset 21, so native + # Q/DQ ops handle everything — UseQDQContribOps must NOT be set. + qdq_config = get_qdq_config( + float_model, + data_reader, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + tensor_quant_overrides={ + "weight": [{"quant_type": QuantType.QUInt16}], + "input_0": [{"quant_type": QuantType.QInt4}], + }, + ) + + # UseQDQContribOps must NOT be set: the 16-bit override triggers an opset bump to 21, + # making native Q/DQ ops sufficient for all types including the 4-bit one. + self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False)) + if __name__ == "__main__": unittest.main() From d1ae1ab1a8cbd3042c5ce17bcd32e56661820f42 Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Tue, 5 May 2026 12:01:05 +0000 Subject: [PATCH 5/7] fix: harden override scan and update tests for opset-21 bump Address review feedback on the int16/uint16 QDQ opset auto-bump: - Wrap the TensorQuantOverrides scan loop in a try/except for (AttributeError, TypeError) so malformed input falls through to the existing TensorQuantOverridesHelper.is_valid() ValueError instead of raising an unrelated AttributeError on .get() calls. - Rename test_16bit_overrides_set_ms_domain to test_16bit_overrides_bump_opset_to_21 and flip its assertions to match the new behavior (opset bumped to 21, native ai.onnx Q/DQ). - Add test_16bit_convert_quant_type_bumps_opset_to_21 covering the convert.quant_type branch with an opset-20 model, ensuring the bump fires for the convert sub-dict path as well as top-level overrides. --- .../python/tools/quantization/quant_utils.py | 28 +++++---- .../test_tensor_quant_overrides_option.py | 62 ++++++++++++++++--- 2 files changed, 71 insertions(+), 19 deletions(-) diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 3bc39ed0890b8..cbf0605564172 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -982,22 +982,26 @@ def update_opset_version( needs_opset21_for_16bit = weight_quant_type in _int16_types or activation_quant_type in _int16_types # Also check TensorQuantOverrides for any 16-bit types, including per-override convert.quant_type. + # Validation of structure is deferred to TensorQuantOverridesHelper.is_valid(); skip bump heuristic on malformed input. if not needs_opset21_for_16bit and tensor_quant_overrides: _int16_quant_types = {QuantType.QInt16, QuantType.QUInt16} - for overrides_list in tensor_quant_overrides.values(): - for override in overrides_list: - qt = override.get("quant_type") - if qt in _int16_quant_types: - needs_opset21_for_16bit = True - break - convert = override.get("convert") - if convert is not None: - convert_qt = convert.get("quant_type") - if convert_qt in _int16_quant_types: + try: + for overrides_list in tensor_quant_overrides.values(): + for override in overrides_list: + qt = override.get("quant_type") + if qt in _int16_quant_types: needs_opset21_for_16bit = True break - if needs_opset21_for_16bit: - break + convert = override.get("convert") + if convert is not None: + convert_qt = convert.get("quant_type") + if convert_qt in _int16_quant_types: + needs_opset21_for_16bit = True + break + if needs_opset21_for_16bit: + break + except (AttributeError, TypeError): + pass if opset_version < 19 and weight_quant_type == onnx.TensorProto.FLOAT8E4M3FN: logging.warning( diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index 520f589187585..6a402cd7ae96a 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -436,11 +436,13 @@ def test_qdq_overrides_per_channel2(self): self.assertEqual(zp, expected_zp) self.assertEqual(scale, np.float32(expected_scale)) - def test_16bit_overrides_set_ms_domain(self): + def test_16bit_overrides_bump_opset_to_21(self): """ - Test that overriding a tensor to 16bit (when default is 8bit) automatically - sets the 'com.microsoft' domain on DQ and Q ops for opset < 21. - Before ONNX 1.16.0, we had to use the 'com.microsoft' domain to be able to use 16-bit quantization. + Test that overriding a tensor to 16-bit (when default is 8-bit) automatically bumps the model + opset to 21 and emits native ai.onnx Q/DQ ops (not 'com.microsoft' domain ops). + + Previously (before the opset-bump heuristic), a sub-opset-21 model with INT16 overrides would + use the 'com.microsoft' domain. Now the model is auto-upgraded so the standard domain is used. """ qdq_model_name = "model_quant_overrides_to_16bit.onnx" inp_zp, _, sig_out_zp, _, _, _, _, _, out_zp, _ = self.perform_qdq_quantization( @@ -459,14 +461,20 @@ def test_16bit_overrides_set_ms_domain(self): self.assertEqual(inp_zp.data_type, onnx.TensorProto.UINT16) self.assertEqual(sig_out_zp.data_type, onnx.TensorProto.UINT16) - # Output should the default uint8 type + # Output should be the default uint8 type self.assertEqual(out_zp.data_type, onnx.TensorProto.UINT8) - # Q/DQ ops should all have the 'com.microsoft' domain + # The model opset should have been auto-bumped to >= 21 qdq_model = onnx.load_model(qdq_model_name) + ai_onnx_opset = next( + opset.version for opset in qdq_model.opset_import if not opset.domain or opset.domain == "ai.onnx" + ) + self.assertGreaterEqual(ai_onnx_opset, 21) + + # Q/DQ ops should be in the default domain (NOT 'com.microsoft') for node in qdq_model.graph.node: if node.op_type in {"QuantizeLinear", "DequantizeLinear"}: - self.assertEqual(node.domain, ms_domain) + self.assertNotEqual(node.domain, ms_domain) def test_16bit_overrides_not_set_ms_domain(self): """ @@ -500,6 +508,46 @@ def test_16bit_overrides_not_set_ms_domain(self): if node.op_type in {"QuantizeLinear", "DequantizeLinear"}: self.assertNotEqual(node.domain, ms_domain) + def test_16bit_convert_quant_type_bumps_opset_to_21(self): + """ + Regression test: a 16-bit type specified via the 'convert.quant_type' field inside + TensorQuantOverrides should also trigger the opset-21 auto-bump, even when the top-level + quant_type for that tensor is 8-bit. + + Verifies that the resulting model has ai.onnx opset >= 21 and that QuantizeLinear / + DequantizeLinear nodes are in the default domain (not 'com.microsoft'). + """ + qdq_model_name = "model_quant_overrides_convert_16bit.onnx" + inp_zp, _, sig_out_zp, _, _, _, _, _, out_zp, _ = self.perform_qdq_quantization( + qdq_model_name, + activation_type=onnx.TensorProto.UINT8, # Default to 8bit activations + extra_options={ + "TensorQuantOverrides": { + # quant_type is 8-bit; the 16-bit is only in the convert sub-dict + "INP": [{"quant_type": QuantType.QUInt8, "convert": {"quant_type": QuantType.QInt16}}], + } + }, + opset=20, + ) + + # INP primary quant type stays uint8 + self.assertEqual(inp_zp.data_type, onnx.TensorProto.UINT8) + + # Output should be the default uint8 type + self.assertEqual(out_zp.data_type, onnx.TensorProto.UINT8) + + # The model opset should have been auto-bumped to >= 21 due to convert.quant_type = QInt16 + qdq_model = onnx.load_model(qdq_model_name) + ai_onnx_opset = next( + opset.version for opset in qdq_model.opset_import if not opset.domain or opset.domain == "ai.onnx" + ) + self.assertGreaterEqual(ai_onnx_opset, 21) + + # Q/DQ ops should be in the default domain (NOT 'com.microsoft') + for node in qdq_model.graph.node: + if node.op_type in {"QuantizeLinear", "DequantizeLinear"}: + self.assertNotEqual(node.domain, ms_domain) + def test_override_validation_nonexisting_tensor(self): """ Test that specifying a non-existing tensor should fail. From 140d63120cb04012e6092b0dd80b4b7a49b236b6 Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Mon, 11 May 2026 11:26:39 +0000 Subject: [PATCH 6/7] Log debug message instead of silent pass on malformed overrides Replaces an empty except block in the 16-bit opset bump heuristic with a logging.debug call so callers can observe when a structurally malformed TensorQuantOverrides causes the heuristic to be skipped. Addresses CodeQL 'Empty except' finding and review feedback on PR #28202. --- onnxruntime/python/tools/quantization/quant_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index cbf0605564172..81a6aaa892280 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -1001,7 +1001,9 @@ def update_opset_version( if needs_opset21_for_16bit: break except (AttributeError, TypeError): - pass + # Malformed overrides; structural validation is deferred to + # TensorQuantOverridesHelper.is_valid(). Skip bump heuristic. + logging.debug("Skipping 16-bit opset bump heuristic for TensorQuantOverrides: structure not as expected.") if opset_version < 19 and weight_quant_type == onnx.TensorProto.FLOAT8E4M3FN: logging.warning( From da1271c8cefcb6b867387193445d45585e1b180a Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Tue, 12 May 2026 11:11:08 +0000 Subject: [PATCH 7/7] fix(quant): guard extra_options None and strengthen 16-bit opset tests Address review feedback on 16-bit QDQ opset bump: - Guard extra_options against None in quantize() call path - Use get_opset_version() helper for clearer test failures - Assert default ai.onnx domain for Q/DQ nodes - Extend get_qdq_config test to invoke quantize end-to-end and verify output opset==21 with default-domain Q/DQ ops --- .../python/tools/quantization/quantize.py | 7 +++-- .../quantization/test_get_qdq_config.py | 27 ++++++++++++++--- .../test_tensor_quant_overrides_option.py | 30 ++++++++++++------- 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 010836900feea..7a887874b6e74 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -22,6 +22,7 @@ QuantFormat, QuantizationMode, QuantType, + get_opset_version, load_model_with_shape_infer, model_has_pre_process_metadata, save_and_reload_model_with_shape_infer, @@ -375,8 +376,8 @@ def get_qdq_config( # QuantizeLinear/DequantizeLinear supports INT16/UINT16 and INT4/UINT4 without contrib-domain ops. # 16-bit types in TensorQuantOverrides also trigger the same opset bump, so a mixed 16-bit + 4-bit # override config will be served at opset 21 where neither type needs contrib ops. - onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx") - if onnx_opset.version < 21: + onnx_opset_version = get_opset_version(model) + if onnx_opset_version < 21: override_types = overrides_helper.get_quant_types() overrides_have_16bit = any(t in q16_types for t in override_types) # If any 16-bit type is present (top-level or override), quantize_static() will bump the @@ -713,7 +714,7 @@ def inc_dataloader(): model, weight_type, activation_type, - tensor_quant_overrides=extra_options.get("TensorQuantOverrides"), + tensor_quant_overrides=(extra_options or {}).get("TensorQuantOverrides"), ) is_model_updated = updated_model is not model if is_model_updated: diff --git a/onnxruntime/test/python/quantization/test_get_qdq_config.py b/onnxruntime/test/python/quantization/test_get_qdq_config.py index 9d19e2da53d17..7a0251ac06c0c 100644 --- a/onnxruntime/test/python/quantization/test_get_qdq_config.py +++ b/onnxruntime/test/python/quantization/test_get_qdq_config.py @@ -15,6 +15,7 @@ from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType, get_qdq_config, quantize +from onnxruntime.quantization.quant_utils import get_opset_version class TestGetQDQConfig(unittest.TestCase): @@ -335,8 +336,8 @@ def test_quantize_via_config_int16_opset_lt21_uses_native_qdq(self): qdq_model = onnx.load_model(qdq_model_path) # The quantized model must have been bumped to opset 21. - onnx_opset = next(x for x in qdq_model.opset_import if not x.domain or x.domain == "ai.onnx") - self.assertEqual(onnx_opset.version, 21) + onnx_opset_version = get_opset_version(qdq_model) + self.assertEqual(onnx_opset_version, 21) # All Q/DQ nodes must use the default ONNX domain (not com.microsoft). for node in qdq_model.graph.node: @@ -415,8 +416,8 @@ def test_overrides_16bit_opset_lt21_bumps_opset_no_contrib_ops(self): qdq_model = onnx.load_model(qdq_model_path) # The quantized model must have been bumped to opset 21. - onnx_opset = next(x for x in qdq_model.opset_import if not x.domain or x.domain == "ai.onnx") - self.assertEqual(onnx_opset.version, 21) + onnx_opset_version = get_opset_version(qdq_model) + self.assertEqual(onnx_opset_version, 21) # All Q/DQ nodes must use the default ONNX domain (not com.microsoft). for node in qdq_model.graph.node: @@ -465,6 +466,24 @@ def test_overrides_mixed_16bit_4bit_opset_lt21_no_contrib_ops(self): # making native Q/DQ ops sufficient for all types including the 4-bit one. self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False)) + qdq_model_path = os.path.join(self._tmp_dir_path, "add_mixed_16bit_4bit_opset18_qdq.onnx") + quantize(float_model, qdq_model_path, qdq_config) + + qdq_model = onnx.load_model(qdq_model_path) + + # The quantized model must have been bumped to opset 21. + onnx_opset_version = get_opset_version(qdq_model) + self.assertGreaterEqual(onnx_opset_version, 21) + + # All Q/DQ nodes must use the default ONNX domain (not com.microsoft). + for node in qdq_model.graph.node: + if node.op_type in ("QuantizeLinear", "DequantizeLinear"): + self.assertEqual( + node.domain, + "", + f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'", + ) + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index 6a402cd7ae96a..27500679b496a 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -15,7 +15,7 @@ from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantType, quantize_static from onnxruntime.quantization.execution_providers.qnn import get_qnn_qdq_config -from onnxruntime.quantization.quant_utils import compute_scale_zp, get_qmin_qmax_for_qType, ms_domain +from onnxruntime.quantization.quant_utils import compute_scale_zp, get_opset_version, get_qmin_qmax_for_qType class DummyDataReader(CalibrationDataReader): @@ -466,15 +466,17 @@ def test_16bit_overrides_bump_opset_to_21(self): # The model opset should have been auto-bumped to >= 21 qdq_model = onnx.load_model(qdq_model_name) - ai_onnx_opset = next( - opset.version for opset in qdq_model.opset_import if not opset.domain or opset.domain == "ai.onnx" - ) + ai_onnx_opset = get_opset_version(qdq_model) self.assertGreaterEqual(ai_onnx_opset, 21) # Q/DQ ops should be in the default domain (NOT 'com.microsoft') for node in qdq_model.graph.node: if node.op_type in {"QuantizeLinear", "DequantizeLinear"}: - self.assertNotEqual(node.domain, ms_domain) + self.assertEqual( + node.domain, + "", + f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'", + ) def test_16bit_overrides_not_set_ms_domain(self): """ @@ -502,11 +504,15 @@ def test_16bit_overrides_not_set_ms_domain(self): # Output should the default uint8 type self.assertEqual(out_zp.data_type, onnx.TensorProto.UINT8) - # Q/DQ ops should all have the 'com.microsoft' domain + # Q/DQ ops should be in the default domain (NOT 'com.microsoft') qdq_model = onnx.load_model(qdq_model_name) for node in qdq_model.graph.node: if node.op_type in {"QuantizeLinear", "DequantizeLinear"}: - self.assertNotEqual(node.domain, ms_domain) + self.assertEqual( + node.domain, + "", + f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'", + ) def test_16bit_convert_quant_type_bumps_opset_to_21(self): """ @@ -538,15 +544,17 @@ def test_16bit_convert_quant_type_bumps_opset_to_21(self): # The model opset should have been auto-bumped to >= 21 due to convert.quant_type = QInt16 qdq_model = onnx.load_model(qdq_model_name) - ai_onnx_opset = next( - opset.version for opset in qdq_model.opset_import if not opset.domain or opset.domain == "ai.onnx" - ) + ai_onnx_opset = get_opset_version(qdq_model) self.assertGreaterEqual(ai_onnx_opset, 21) # Q/DQ ops should be in the default domain (NOT 'com.microsoft') for node in qdq_model.graph.node: if node.op_type in {"QuantizeLinear", "DequantizeLinear"}: - self.assertNotEqual(node.domain, ms_domain) + self.assertEqual( + node.domain, + "", + f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'", + ) def test_override_validation_nonexisting_tensor(self): """