From ef09658bfb0650094ef499e889fd195cde8d61c9 Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Thu, 23 Apr 2026 11:19:30 +0000
Subject: [PATCH 1/7] fix: auto-upgrade model opset to 21 for int16/uint16 QDQ
 quantization

The update_opset_version helper already auto-bumps opset to 19 when
float8 quantization is requested on older models. Extend the same
pattern to int16/uint16: when the user requests QUInt16 or QInt16
weight quantization and the model's opset is below 21, bump to 21 so
that native ONNX QuantizeLinear/DequantizeLinear can be emitted
instead of silently falling back to the com.microsoft contrib domain.

Fixes #25223
---
 .../python/tools/quantization/quant_utils.py  |  9 +++++++
 .../python/quantization/test_quant_util.py    | 27 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py
index 0ce1e1a0d75de..9b07b7bdf8e22 100644
--- a/onnxruntime/python/tools/quantization/quant_utils.py
+++ b/onnxruntime/python/tools/quantization/quant_utils.py
@@ -978,6 +978,15 @@ def update_opset_version(model: ModelProto, weight_type: QuantType) -> ModelProt
         )
         target_opset_version = 19
 
+    elif opset_version < 21 and weight_quant_type in (onnx.TensorProto.UINT16, onnx.TensorProto.INT16):
+        logging.warning(
+            f"The original model opset version is {opset_version}, which does not support 16-bit integer "
+            "quantization with native ONNX QuantizeLinear/DequantizeLinear. "
+            "Please update the model to opset >= 21. Automatically update the model to opset 21. "
+            "Please verify the quantized model."
+        )
+        target_opset_version = 21
+
     elif opset_version == 10:
         logging.warning(
             f"The original model opset version is {opset_version}, which does not support node fusions. "
diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py
index 468f97c980ad8..12f2afb221005 100644
--- a/onnxruntime/test/python/quantization/test_quant_util.py
+++ b/onnxruntime/test/python/quantization/test_quant_util.py
@@ -15,11 +15,13 @@
 from onnx import TensorProto, helper, numpy_helper
 
 from onnxruntime.quantization.quant_utils import (
+    QuantType,
     compute_scale_zp,
     load_model_with_shape_infer,
     model_has_infer_metadata,
     pack_bytes_to_4bit,
     quantize_data,
+    update_opset_version,
 )
 
 
@@ -173,6 +175,31 @@ def test_quantize_data_4bit(self):
 
                     self.assertEqual(numpy.array(actual_quant_val), expected_quant_val)
 
+    def test_update_opset_version_16bit(self):
+        graph = helper.make_graph([], "test_graph", [], [])
+
+        # 16-bit types should auto-bump opset < 21 -> 21
+        for weight_type, label in (
+            (QuantType.QUInt16, "QUInt16"),
+            (QuantType.QInt16, "QInt16"),
+        ):
+            with self.subTest(weight_type=label, opset=20):
+                model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)])
+                result = update_opset_version(model, weight_type)
+                result_opset = result.opset_import[0].version
+                self.assertEqual(result_opset, 21)
+
+        # Already at opset 21 - should stay at 21
+        for weight_type, label in (
+            (QuantType.QUInt16, "QUInt16"),
+            (QuantType.QInt16, "QInt16"),
+        ):
+            with self.subTest(weight_type=label, opset=21):
+                model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 21)])
+                result = update_opset_version(model, weight_type)
+                result_opset = result.opset_import[0].version
+                self.assertEqual(result_opset, 21)
+
 
 if __name__ == "__main__":
     unittest.main()

From 52218ce7afee1b9e4f7bd360ce6aaf889042d1bb Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Fri, 1 May 2026 00:54:48 +0000
Subject: [PATCH 2/7] fix(quantization): bump opset for int16 activations too

update_opset_version previously only inspected weight_type, so a config
like activation_type=QInt16 with weight_type=QInt8 would not trigger the
opset>=21 bump and could produce a model with int16 Q/DQ on opset<21.
Extend the helper to accept activation_type and bump when either is
INT16/UINT16. Update the quantize_static call site and add subtests
covering 16-bit-activation-only, 16-bit-weight-only, both-8bit, and
backward-compat (call without activation_type) cases.
---
 .../python/tools/quantization/quant_utils.py  | 12 +++++++++--
 .../python/tools/quantization/quantize.py     |  2 +-
 .../python/quantization/test_quant_util.py    | 20 ++++++++++++++++++-
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py
index 9b07b7bdf8e22..73c9e322297cd 100644
--- a/onnxruntime/python/tools/quantization/quant_utils.py
+++ b/onnxruntime/python/tools/quantization/quant_utils.py
@@ -965,10 +965,18 @@ def get_opset_version(model: ModelProto) -> int:
     return opset_version
 
 
-def update_opset_version(model: ModelProto, weight_type: QuantType) -> ModelProto:
+def update_opset_version(
+    model: ModelProto, weight_type: QuantType, activation_type: QuantType | None = None
+) -> ModelProto:
     opset_version = get_opset_version(model)
     target_opset_version = opset_version
     weight_quant_type = getattr(weight_type, "tensor_type", weight_type)
+    activation_quant_type = (
+        getattr(activation_type, "tensor_type", activation_type) if activation_type is not None else None
+    )
+
+    _int16_types = (onnx.TensorProto.UINT16, onnx.TensorProto.INT16)
+    needs_opset21_for_16bit = weight_quant_type in _int16_types or activation_quant_type in _int16_types
 
     if opset_version < 19 and weight_quant_type == onnx.TensorProto.FLOAT8E4M3FN:
         logging.warning(
@@ -978,7 +986,7 @@ def update_opset_version(model: ModelProto, weight_type: QuantType) -> ModelProt
         )
         target_opset_version = 19
 
-    elif opset_version < 21 and weight_quant_type in (onnx.TensorProto.UINT16, onnx.TensorProto.INT16):
+    elif opset_version < 21 and needs_opset21_for_16bit:
         logging.warning(
             f"The original model opset version is {opset_version}, which does not support 16-bit integer "
             "quantization with native ONNX QuantizeLinear/DequantizeLinear. "
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index b8b239b85e7ad..a3aab06c5a935 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -699,7 +699,7 @@ def inc_dataloader():
         nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
         model = load_model_with_shape_infer(Path(model_input))  # use smooth quant model for calibration
 
-    updated_model = update_opset_version(model, weight_type)
+    updated_model = update_opset_version(model, weight_type, activation_type)
     is_model_updated = updated_model is not model
     if is_model_updated:
         model = updated_model
diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py
index 12f2afb221005..d17a9f8eaf457 100644
--- a/onnxruntime/test/python/quantization/test_quant_util.py
+++ b/onnxruntime/test/python/quantization/test_quant_util.py
@@ -178,7 +178,7 @@ def test_quantize_data_4bit(self):
     def test_update_opset_version_16bit(self):
         graph = helper.make_graph([], "test_graph", [], [])
 
-        # 16-bit types should auto-bump opset < 21 -> 21
+        # 16-bit weight type alone should auto-bump opset < 21 -> 21
         for weight_type, label in (
             (QuantType.QUInt16, "QUInt16"),
             (QuantType.QInt16, "QInt16"),
@@ -200,6 +200,24 @@ def test_update_opset_version_16bit(self):
                 result_opset = result.opset_import[0].version
                 self.assertEqual(result_opset, 21)
 
+        # 16-bit activation type with 8-bit weight should also bump opset < 21 -> 21
+        for activation_type, label in (
+            (QuantType.QUInt16, "QUInt16"),
+            (QuantType.QInt16, "QInt16"),
+        ):
+            with self.subTest(activation_type=label, weight_type="QInt8", opset=20):
+                model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)])
+                result = update_opset_version(model, QuantType.QInt8, activation_type)
+                result_opset = result.opset_import[0].version
+                self.assertEqual(result_opset, 21)
+
+        # Both 8-bit should NOT bump to 21
+        with self.subTest(weight_type="QInt8", activation_type="QUInt8", opset=20):
+            model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)])
+            result = update_opset_version(model, QuantType.QInt8, QuantType.QUInt8)
+            result_opset = result.opset_import[0].version
+            self.assertNotEqual(result_opset, 21)
+
 
 if __name__ == "__main__":
     unittest.main()

From ef62c23adc3da9c5ba528ff858c54a88d7e9daf7 Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Sun, 3 May 2026 11:51:51 +0000
Subject: [PATCH 3/7] fix(quantization): do not pre-set UseQDQContribOps for
 int16 types in get_qdq_config

get_qdq_config() was auto-setting extra_options["UseQDQContribOps"] = True
whenever activation_type or weight_type was INT16/UINT16 and the model opset
was < 21. This caused the config-based quantize(..., StaticQuantConfig) path
to emit com.microsoft Q/DQ ops even after quantize_static() bumped the model
to opset 21, where native ONNX QuantizeLinear/DequantizeLinear supports
INT16/UINT16 natively.

Narrow the condition so that UseQDQContribOps is only auto-set for top-level
4-bit weight/activation types (which have no opset bump) and for 16-bit or 4-bit
types that appear in TensorQuantOverrides; 16-bit top-level weight/activation
types are excluded because the opset-21 bump in quantize_static() already handles
them. An explicit user-supplied UseQDQContribOps in extra_options still takes
precedence via the existing override merge.

Update test_get_qdq_config.py: rename and fix the int16-opset19 subtest to assert
the new correct behavior (no contrib-ops flag), and add an end-to-end test that
verifies the config path produces an opset-21 model with native-domain Q/DQ nodes.
Tighten the existing no-op subtest in test_quant_util.py from assertNotEqual to
assertEqual(result_opset, 20) for a stricter regression guard.
---
 .../python/tools/quantization/quantize.py     | 10 +++-
 .../quantization/test_get_qdq_config.py       | 56 +++++++++++++++++--
 .../python/quantization/test_quant_util.py    |  4 +-
 3 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index a3aab06c5a935..45c8a3005bceb 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -369,13 +369,17 @@ def get_qdq_config(
         }
         final_extra_options.update(calib_extra_options)
 
-    # ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
-    # on Q/DQ operators if using 16-bit or 4-bit quantization.
+    # ONNX opset < 21 does not support 4-bit quantization natively, so must use 'com.microsoft' domain
+    # on Q/DQ operators if using 4-bit quantization.  16-bit weight/activation types are excluded here
+    # because quantize_static() will automatically bump the model opset to 21, where native ONNX
+    # QuantizeLinear/DequantizeLinear supports INT16/UINT16 without contrib-domain ops.
     onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
     if onnx_opset.version < 21:
         opset21_types = q16_types.union(q4_types)
         overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
-        if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
+        # Only set UseQDQContribOps for 4-bit types; 16-bit types are handled by the opset bump.
+        needs_contrib_ops = activation_type in q4_types or weight_type in q4_types or overrides_have_opset21_types
+        if needs_contrib_ops:
             final_extra_options["UseQDQContribOps"] = True
 
     # Allow user's extra_options to override our final_extra_options.
diff --git a/onnxruntime/test/python/quantization/test_get_qdq_config.py b/onnxruntime/test/python/quantization/test_get_qdq_config.py
index 4a71b3694822c..317b9c4b153a1 100644
--- a/onnxruntime/test/python/quantization/test_get_qdq_config.py
+++ b/onnxruntime/test/python/quantization/test_get_qdq_config.py
@@ -271,10 +271,12 @@ def test_external_data(self):
         self.assertIsNotNone(weight_quantized)
         self.assertEqual(weight_quantized.data_location, onnx.TensorProto.EXTERNAL)
 
-    def test_use_qdq_contrib_ops_for_int16_opset19(self):
+    def test_no_qdq_contrib_ops_for_int16_opset_lt21(self):
         """
-        Test that get_qdq_config() returns a config that forces 'com.microsoft' Q/DQ ops for
-        use of int16 in opset < 21.
+        Test that get_qdq_config() does NOT set UseQDQContribOps for int16 types even when
+        the model opset is < 21.  quantize_static() will bump the opset to 21 automatically,
+        where native ONNX QuantizeLinear/DequantizeLinear supports INT16/UINT16, so contrib-
+        domain ops are not needed.
         """
 
         shape = [1, 8, 8]
@@ -297,7 +299,53 @@ def test_use_qdq_contrib_ops_for_int16_opset19(self):
         )
 
         self.assertEqual(qdq_config.activation_type, QuantType.QUInt16)
-        self.assertTrue(qdq_config.extra_options["UseQDQContribOps"])
+        # UseQDQContribOps must NOT be auto-set for 16-bit types; the opset bump handles them.
+        self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False))
+
+    def test_quantize_via_config_int16_opset_lt21_uses_native_qdq(self):
+        """
+        Test that the config-based quantize() path produces a model at opset 21 using native
+        ONNX QuantizeLinear/DequantizeLinear (not com.microsoft domain) when int16 activation
+        types are requested on a model whose original opset is < 21.
+        """
+
+        shape = [1, 8, 8]
+        tensor_type = onnx.TensorProto.FLOAT
+        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type)
+        weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight")
+        # Build a model at opset 20 (< 21) with int16 activation type
+        float_model = self.build_add_model(shape, tensor_type, weight, opset=20)
+
+        input_data_list = [
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)},
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)},
+        ]
+        data_reader = TestDataFeeds(input_data_list)
+
+        qdq_config = get_qdq_config(
+            float_model,
+            data_reader,
+            activation_type=QuantType.QUInt16,
+            weight_type=QuantType.QInt8,
+        )
+
+        qdq_model_path = os.path.join(self._tmp_dir_path, "add_int16_opset20_qdq.onnx")
+        quantize(float_model, qdq_model_path, qdq_config)
+
+        qdq_model = onnx.load_model(qdq_model_path)
+
+        # The quantized model must have been bumped to opset 21.
+        onnx_opset = next(x for x in qdq_model.opset_import if not x.domain or x.domain == "ai.onnx")
+        self.assertEqual(onnx_opset.version, 21)
+
+        # All Q/DQ nodes must use the default ONNX domain (not com.microsoft).
+        for node in qdq_model.graph.node:
+            if node.op_type in ("QuantizeLinear", "DequantizeLinear"):
+                self.assertEqual(
+                    node.domain,
+                    "",
+                    f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
+                )
 
     def test_use_qdq_contrib_ops_for_int4_opset19(self):
         """
diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py
index d17a9f8eaf457..16645c3b8a5d7 100644
--- a/onnxruntime/test/python/quantization/test_quant_util.py
+++ b/onnxruntime/test/python/quantization/test_quant_util.py
@@ -211,12 +211,12 @@ def test_update_opset_version_16bit(self):
                 result_opset = result.opset_import[0].version
                 self.assertEqual(result_opset, 21)
 
-        # Both 8-bit should NOT bump to 21
+        # Both 8-bit should NOT bump to 21; opset stays at 20
         with self.subTest(weight_type="QInt8", activation_type="QUInt8", opset=20):
             model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)])
             result = update_opset_version(model, QuantType.QInt8, QuantType.QUInt8)
             result_opset = result.opset_import[0].version
-            self.assertNotEqual(result_opset, 21)
+            self.assertEqual(result_opset, 20)
 
 
 if __name__ == "__main__":

From 1d4160a7409d64595654f55fee13625ec1525e16 Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Mon, 4 May 2026 11:50:22 +0000
Subject: [PATCH 4/7] fix(quantization): scan TensorQuantOverrides for 16-bit
 and recompute UseQDQContribOps after opset bump

- Extend opset-21 bump helper to inspect TensorQuantOverrides (including
  per-tensor convert.quant_type) for QInt16/QUInt16, so models with default
  8-bit base types but 16-bit overrides also get the native opset-21 path.
- Generalize the opset-bump warning text so it is accurate for both QDQ
  static and quantize_dynamic flows.
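- Stop auto-setting UseQDQContribOps in get_qdq_config() whenever a 16-bit type
  (top-level or in TensorQuantOverrides) will trigger the opset-21 bump, so
  16-bit/4-bit overrides no longer latch the model to com.microsoft Q/DQ
  post-bump.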
- Add regression tests for opset<21 + 16-bit overrides and mixed 16-bit/4-bit
  overrides via TensorQuantOverrides.
---
 .../python/tools/quantization/quant_utils.py  | 25 +++++-
 .../python/tools/quantization/quantize.py     | 27 ++++--
 .../quantization/test_get_qdq_config.py       | 88 +++++++++++++++++++
 3 files changed, 130 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py
index 73c9e322297cd..3bc39ed0890b8 100644
--- a/onnxruntime/python/tools/quantization/quant_utils.py
+++ b/onnxruntime/python/tools/quantization/quant_utils.py
@@ -966,7 +966,10 @@ def get_opset_version(model: ModelProto) -> int:
 
 
 def update_opset_version(
-    model: ModelProto, weight_type: QuantType, activation_type: QuantType | None = None
+    model: ModelProto,
+    weight_type: QuantType,
+    activation_type: QuantType | None = None,
+    tensor_quant_overrides: dict | None = None,
 ) -> ModelProto:
     opset_version = get_opset_version(model)
     target_opset_version = opset_version
@@ -978,6 +981,24 @@ def update_opset_version(
     _int16_types = (onnx.TensorProto.UINT16, onnx.TensorProto.INT16)
     needs_opset21_for_16bit = weight_quant_type in _int16_types or activation_quant_type in _int16_types
 
+    # Also check TensorQuantOverrides for any 16-bit types, including per-override convert.quant_type.
+    if not needs_opset21_for_16bit and tensor_quant_overrides:
+        _int16_quant_types = {QuantType.QInt16, QuantType.QUInt16}
+        for overrides_list in tensor_quant_overrides.values():
+            for override in overrides_list:
+                qt = override.get("quant_type")
+                if qt in _int16_quant_types:
+                    needs_opset21_for_16bit = True
+                    break
+                convert = override.get("convert")
+                if convert is not None:
+                    convert_qt = convert.get("quant_type")
+                    if convert_qt in _int16_quant_types:
+                        needs_opset21_for_16bit = True
+                        break
+            if needs_opset21_for_16bit:
+                break
+
     if opset_version < 19 and weight_quant_type == onnx.TensorProto.FLOAT8E4M3FN:
         logging.warning(
             f"The original model opset version is {opset_version}, which does not support quantization to float 8. "
@@ -989,7 +1010,7 @@ def update_opset_version(
     elif opset_version < 21 and needs_opset21_for_16bit:
         logging.warning(
             f"The original model opset version is {opset_version}, which does not support 16-bit integer "
-            "quantization with native ONNX QuantizeLinear/DequantizeLinear. "
+            "quantization natively. "
             "Please update the model to opset >= 21. Automatically update the model to opset 21. "
             "Please verify the quantized model."
         )
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index 45c8a3005bceb..010836900feea 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -372,15 +372,21 @@ def get_qdq_config(
     # ONNX opset < 21 does not support 4-bit quantization natively, so must use 'com.microsoft' domain
     # on Q/DQ operators if using 4-bit quantization.  16-bit weight/activation types are excluded here
     # because quantize_static() will automatically bump the model opset to 21, where native ONNX
-    # QuantizeLinear/DequantizeLinear supports INT16/UINT16 without contrib-domain ops.
+    # QuantizeLinear/DequantizeLinear supports INT16/UINT16 and INT4/UINT4 without contrib-domain ops.
+    # 16-bit types in TensorQuantOverrides also trigger the same opset bump, so a mixed 16-bit + 4-bit
+    # override config will be served at opset 21 where neither type needs contrib ops.
     onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
     if onnx_opset.version < 21:
-        opset21_types = q16_types.union(q4_types)
-        overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
-        # Only set UseQDQContribOps for 4-bit types; 16-bit types are handled by the opset bump.
-        needs_contrib_ops = activation_type in q4_types or weight_type in q4_types or overrides_have_opset21_types
-        if needs_contrib_ops:
-            final_extra_options["UseQDQContribOps"] = True
+        override_types = overrides_helper.get_quant_types()
+        overrides_have_16bit = any(t in q16_types for t in override_types)
+        # If any 16-bit type is present (top-level or override), quantize_static() will bump the
+        # model to opset 21, making contrib ops unnecessary for all types.
+        will_bump_to_opset21 = activation_type in q16_types or weight_type in q16_types or overrides_have_16bit
+        if not will_bump_to_opset21:
+            overrides_have_q4_types = any(t in q4_types for t in override_types)
+            needs_contrib_ops = activation_type in q4_types or weight_type in q4_types or overrides_have_q4_types
+            if needs_contrib_ops:
+                final_extra_options["UseQDQContribOps"] = True
 
     # Allow user's extra_options to override our final_extra_options.
     if extra_options:
@@ -703,7 +709,12 @@ def inc_dataloader():
         nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
         model = load_model_with_shape_infer(Path(model_input))  # use smooth quant model for calibration
 
-    updated_model = update_opset_version(model, weight_type, activation_type)
+    updated_model = update_opset_version(
+        model,
+        weight_type,
+        activation_type,
+        tensor_quant_overrides=extra_options.get("TensorQuantOverrides"),
+    )
     is_model_updated = updated_model is not model
     if is_model_updated:
         model = updated_model
diff --git a/onnxruntime/test/python/quantization/test_get_qdq_config.py b/onnxruntime/test/python/quantization/test_get_qdq_config.py
index 317b9c4b153a1..9d19e2da53d17 100644
--- a/onnxruntime/test/python/quantization/test_get_qdq_config.py
+++ b/onnxruntime/test/python/quantization/test_get_qdq_config.py
@@ -377,6 +377,94 @@ def test_use_qdq_contrib_ops_for_int4_opset19(self):
         self.assertEqual(qdq_config.extra_options["TensorQuantOverrides"]["weight"][0]["quant_type"], QuantType.QInt4)
         self.assertTrue(qdq_config.extra_options["UseQDQContribOps"])
 
+    def test_overrides_16bit_opset_lt21_bumps_opset_no_contrib_ops(self):
+        """
+        Regression test: when TensorQuantOverrides request a 16-bit type on a model whose opset is
+        < 21, the quantized model must be bumped to opset 21 and UseQDQContribOps must NOT be set.
+        """
+
+        shape = [1, 8, 8]
+        tensor_type = onnx.TensorProto.FLOAT
+        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type)
+        weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight")
+        # Build a model at opset 18 (< 21) so the opset bump is required.
+        float_model = self.build_add_model(shape, tensor_type, weight, opset=18)
+
+        input_data_list = [
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)},
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)},
+        ]
+        data_reader = TestDataFeeds(input_data_list)
+
+        # Override the weight to use QUInt16 via TensorQuantOverrides; top-level types are 8-bit.
+        qdq_config = get_qdq_config(
+            float_model,
+            data_reader,
+            activation_type=QuantType.QUInt8,
+            weight_type=QuantType.QInt8,
+            tensor_quant_overrides={"weight": [{"quant_type": QuantType.QUInt16}]},
+        )
+
+        # UseQDQContribOps must NOT be set: the 16-bit override triggers an opset bump to 21,
+        # where native ONNX Q/DQ ops handle all types.
+        self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False))
+
+        qdq_model_path = os.path.join(self._tmp_dir_path, "add_override_uint16_opset18_qdq.onnx")
+        quantize(float_model, qdq_model_path, qdq_config)
+
+        qdq_model = onnx.load_model(qdq_model_path)
+
+        # The quantized model must have been bumped to opset 21.
+        onnx_opset = next(x for x in qdq_model.opset_import if not x.domain or x.domain == "ai.onnx")
+        self.assertEqual(onnx_opset.version, 21)
+
+        # All Q/DQ nodes must use the default ONNX domain (not com.microsoft).
+        for node in qdq_model.graph.node:
+            if node.op_type in ("QuantizeLinear", "DequantizeLinear"):
+                self.assertEqual(
+                    node.domain,
+                    "",
+                    f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
+                )
+
+    def test_overrides_mixed_16bit_4bit_opset_lt21_no_contrib_ops(self):
+        """
+        Regression test: when TensorQuantOverrides contain both a 16-bit type (for one tensor) and
+        a 4-bit type (for another tensor) on a model whose opset is < 21, UseQDQContribOps must NOT
+        be set because the 16-bit override triggers an opset bump to 21 where all types are native.
+        """
+
+        shape = [1, 8, 8]
+        tensor_type = onnx.TensorProto.FLOAT
+        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type)
+        weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight")
+        # Build a model at opset 18 (< 21).
+        float_model = self.build_add_model(shape, tensor_type, weight, opset=18)
+
+        input_data_list = [
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)},
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)},
+        ]
+        data_reader = TestDataFeeds(input_data_list)
+
+        # Override: weight uses QUInt16 (16-bit, triggers opset bump), input_0 uses QInt4 (4-bit).
+        # The presence of the 16-bit override means the model is bumped to opset 21, so native
+        # Q/DQ ops handle everything — UseQDQContribOps must NOT be set.
+        qdq_config = get_qdq_config(
+            float_model,
+            data_reader,
+            activation_type=QuantType.QUInt8,
+            weight_type=QuantType.QInt8,
+            tensor_quant_overrides={
+                "weight": [{"quant_type": QuantType.QUInt16}],
+                "input_0": [{"quant_type": QuantType.QInt4}],
+            },
+        )
+
+        # UseQDQContribOps must NOT be set: the 16-bit override triggers an opset bump to 21,
+        # making native Q/DQ ops sufficient for all types including the 4-bit one.
+        self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False))
+
 
 if __name__ == "__main__":
     unittest.main()

From d1ae1ab1a8cbd3042c5ce17bcd32e56661820f42 Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Tue, 5 May 2026 12:01:05 +0000
Subject: [PATCH 5/7] fix: harden override scan and update tests for opset-21
 bump

Address review feedback on the int16/uint16 QDQ opset auto-bump:

- Wrap the TensorQuantOverrides scan loop in a try/except for
  (AttributeError, TypeError) so malformed input falls through to the
  existing TensorQuantOverridesHelper.is_valid() ValueError instead of
  raising an unrelated AttributeError on .get() calls.
- Rename test_16bit_overrides_set_ms_domain to
  test_16bit_overrides_bump_opset_to_21 and flip its assertions to
  match the new behavior (opset bumped to 21, native ai.onnx Q/DQ).
- Add test_16bit_convert_quant_type_bumps_opset_to_21 covering the
  convert.quant_type branch with an opset-20 model, ensuring the bump
  fires for the convert sub-dict path as well as top-level overrides.
---
 .../python/tools/quantization/quant_utils.py  | 28 +++++----
 .../test_tensor_quant_overrides_option.py     | 62 ++++++++++++++++---
 2 files changed, 71 insertions(+), 19 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py
index 3bc39ed0890b8..cbf0605564172 100644
--- a/onnxruntime/python/tools/quantization/quant_utils.py
+++ b/onnxruntime/python/tools/quantization/quant_utils.py
@@ -982,22 +982,26 @@ def update_opset_version(
     needs_opset21_for_16bit = weight_quant_type in _int16_types or activation_quant_type in _int16_types
 
     # Also check TensorQuantOverrides for any 16-bit types, including per-override convert.quant_type.
+    # Validation of structure is deferred to TensorQuantOverridesHelper.is_valid(); skip bump heuristic on malformed input.
     if not needs_opset21_for_16bit and tensor_quant_overrides:
         _int16_quant_types = {QuantType.QInt16, QuantType.QUInt16}
-        for overrides_list in tensor_quant_overrides.values():
-            for override in overrides_list:
-                qt = override.get("quant_type")
-                if qt in _int16_quant_types:
-                    needs_opset21_for_16bit = True
-                    break
-                convert = override.get("convert")
-                if convert is not None:
-                    convert_qt = convert.get("quant_type")
-                    if convert_qt in _int16_quant_types:
+        try:
+            for overrides_list in tensor_quant_overrides.values():
+                for override in overrides_list:
+                    qt = override.get("quant_type")
+                    if qt in _int16_quant_types:
                         needs_opset21_for_16bit = True
                         break
-            if needs_opset21_for_16bit:
-                break
+                    convert = override.get("convert")
+                    if convert is not None:
+                        convert_qt = convert.get("quant_type")
+                        if convert_qt in _int16_quant_types:
+                            needs_opset21_for_16bit = True
+                            break
+                if needs_opset21_for_16bit:
+                    break
+        except (AttributeError, TypeError):
+            pass
 
     if opset_version < 19 and weight_quant_type == onnx.TensorProto.FLOAT8E4M3FN:
         logging.warning(
diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py
index 520f589187585..6a402cd7ae96a 100644
--- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py
+++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py
@@ -436,11 +436,13 @@ def test_qdq_overrides_per_channel2(self):
                     self.assertEqual(zp, expected_zp)
                     self.assertEqual(scale, np.float32(expected_scale))
 
-    def test_16bit_overrides_set_ms_domain(self):
+    def test_16bit_overrides_bump_opset_to_21(self):
         """
-        Test that overriding a tensor to 16bit (when default is 8bit) automatically
-        sets the 'com.microsoft' domain on DQ and Q ops for opset < 21.
-        Before ONNX 1.16.0, we had to use the 'com.microsoft' domain to be able to use 16-bit quantization.
+        Test that overriding a tensor to 16-bit (when default is 8-bit) automatically bumps the model
+        opset to 21 and emits native ai.onnx Q/DQ ops (not 'com.microsoft' domain ops).
+
+        Previously (before the opset-bump heuristic), a sub-opset-21 model with INT16 overrides would
+        use the 'com.microsoft' domain.  Now the model is auto-upgraded so the standard domain is used.
         """
         qdq_model_name = "model_quant_overrides_to_16bit.onnx"
         inp_zp, _, sig_out_zp, _, _, _, _, _, out_zp, _ = self.perform_qdq_quantization(
@@ -459,14 +461,20 @@ def test_16bit_overrides_set_ms_domain(self):
         self.assertEqual(inp_zp.data_type, onnx.TensorProto.UINT16)
         self.assertEqual(sig_out_zp.data_type, onnx.TensorProto.UINT16)
 
-        # Output should the default uint8 type
+        # Output should be the default uint8 type
         self.assertEqual(out_zp.data_type, onnx.TensorProto.UINT8)
 
-        # Q/DQ ops should all have the 'com.microsoft' domain
+        # The model opset should have been auto-bumped to >= 21
         qdq_model = onnx.load_model(qdq_model_name)
+        ai_onnx_opset = next(
+            opset.version for opset in qdq_model.opset_import if not opset.domain or opset.domain == "ai.onnx"
+        )
+        self.assertGreaterEqual(ai_onnx_opset, 21)
+
+        # Q/DQ ops should be in the default domain (NOT 'com.microsoft')
         for node in qdq_model.graph.node:
             if node.op_type in {"QuantizeLinear", "DequantizeLinear"}:
-                self.assertEqual(node.domain, ms_domain)
+                self.assertNotEqual(node.domain, ms_domain)
 
     def test_16bit_overrides_not_set_ms_domain(self):
         """
@@ -500,6 +508,46 @@ def test_16bit_overrides_not_set_ms_domain(self):
             if node.op_type in {"QuantizeLinear", "DequantizeLinear"}:
                 self.assertNotEqual(node.domain, ms_domain)
 
+    def test_16bit_convert_quant_type_bumps_opset_to_21(self):
+        """
+        Regression test: a 16-bit type specified via the 'convert.quant_type' field inside
+        TensorQuantOverrides should also trigger the opset-21 auto-bump, even when the top-level
+        quant_type for that tensor is 8-bit.
+
+        Verifies that the resulting model has ai.onnx opset >= 21 and that QuantizeLinear /
+        DequantizeLinear nodes are in the default domain (not 'com.microsoft').
+        """
+        qdq_model_name = "model_quant_overrides_convert_16bit.onnx"
+        inp_zp, _, sig_out_zp, _, _, _, _, _, out_zp, _ = self.perform_qdq_quantization(
+            qdq_model_name,
+            activation_type=onnx.TensorProto.UINT8,  # Default to 8bit activations
+            extra_options={
+                "TensorQuantOverrides": {
+                    # quant_type is 8-bit; the 16-bit is only in the convert sub-dict
+                    "INP": [{"quant_type": QuantType.QUInt8, "convert": {"quant_type": QuantType.QInt16}}],
+                }
+            },
+            opset=20,
+        )
+
+        # INP primary quant type stays uint8
+        self.assertEqual(inp_zp.data_type, onnx.TensorProto.UINT8)
+
+        # Output should be the default uint8 type
+        self.assertEqual(out_zp.data_type, onnx.TensorProto.UINT8)
+
+        # The model opset should have been auto-bumped to >= 21 due to convert.quant_type = QInt16
+        qdq_model = onnx.load_model(qdq_model_name)
+        ai_onnx_opset = next(
+            opset.version for opset in qdq_model.opset_import if not opset.domain or opset.domain == "ai.onnx"
+        )
+        self.assertGreaterEqual(ai_onnx_opset, 21)
+
+        # Q/DQ ops should be in the default domain (NOT 'com.microsoft')
+        for node in qdq_model.graph.node:
+            if node.op_type in {"QuantizeLinear", "DequantizeLinear"}:
+                self.assertNotEqual(node.domain, ms_domain)
+
     def test_override_validation_nonexisting_tensor(self):
         """
         Test that specifying a non-existing tensor should fail.

From 140d63120cb04012e6092b0dd80b4b7a49b236b6 Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Mon, 11 May 2026 11:26:39 +0000
Subject: [PATCH 6/7] Log debug message instead of silent pass on malformed
 overrides

Replaces an empty except block in the 16-bit opset bump heuristic with
a logging.debug call so callers can observe when a structurally
malformed TensorQuantOverrides causes the heuristic to be skipped.
Addresses CodeQL 'Empty except' finding and review feedback on PR
#28202.
---
 onnxruntime/python/tools/quantization/quant_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py
index cbf0605564172..81a6aaa892280 100644
--- a/onnxruntime/python/tools/quantization/quant_utils.py
+++ b/onnxruntime/python/tools/quantization/quant_utils.py
@@ -1001,7 +1001,9 @@ def update_opset_version(
                 if needs_opset21_for_16bit:
                     break
         except (AttributeError, TypeError):
-            pass
+            # Malformed overrides; structural validation is deferred to
+            # TensorQuantOverridesHelper.is_valid(). Skip bump heuristic.
+            logging.debug("Skipping 16-bit opset bump heuristic for TensorQuantOverrides: structure not as expected.")
 
     if opset_version < 19 and weight_quant_type == onnx.TensorProto.FLOAT8E4M3FN:
         logging.warning(

From da1271c8cefcb6b867387193445d45585e1b180a Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Tue, 12 May 2026 11:11:08 +0000
Subject: [PATCH 7/7] fix(quant): guard extra_options None and strengthen
 16-bit opset tests

Address review feedback on 16-bit QDQ opset bump:
- Guard extra_options against None in quantize() call path
- Use get_opset_version() helper for clearer test failures
- Assert default ai.onnx domain for Q/DQ nodes
- Extend get_qdq_config test to invoke quantize end-to-end
  and verify output opset >= 21 with default-domain Q/DQ ops
---
 .../python/tools/quantization/quantize.py     |  7 +++--
 .../quantization/test_get_qdq_config.py       | 27 ++++++++++++++---
 .../test_tensor_quant_overrides_option.py     | 30 ++++++++++++-------
 3 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index 010836900feea..7a887874b6e74 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -22,6 +22,7 @@
     QuantFormat,
     QuantizationMode,
     QuantType,
+    get_opset_version,
     load_model_with_shape_infer,
     model_has_pre_process_metadata,
     save_and_reload_model_with_shape_infer,
@@ -375,8 +376,8 @@ def get_qdq_config(
     # QuantizeLinear/DequantizeLinear supports INT16/UINT16 and INT4/UINT4 without contrib-domain ops.
     # 16-bit types in TensorQuantOverrides also trigger the same opset bump, so a mixed 16-bit + 4-bit
     # override config will be served at opset 21 where neither type needs contrib ops.
-    onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
-    if onnx_opset.version < 21:
+    onnx_opset_version = get_opset_version(model)
+    if onnx_opset_version < 21:
         override_types = overrides_helper.get_quant_types()
         overrides_have_16bit = any(t in q16_types for t in override_types)
         # If any 16-bit type is present (top-level or override), quantize_static() will bump the
@@ -713,7 +714,7 @@ def inc_dataloader():
         model,
         weight_type,
         activation_type,
-        tensor_quant_overrides=extra_options.get("TensorQuantOverrides"),
+        tensor_quant_overrides=(extra_options or {}).get("TensorQuantOverrides"),
     )
     is_model_updated = updated_model is not model
     if is_model_updated:
diff --git a/onnxruntime/test/python/quantization/test_get_qdq_config.py b/onnxruntime/test/python/quantization/test_get_qdq_config.py
index 9d19e2da53d17..7a0251ac06c0c 100644
--- a/onnxruntime/test/python/quantization/test_get_qdq_config.py
+++ b/onnxruntime/test/python/quantization/test_get_qdq_config.py
@@ -15,6 +15,7 @@
 from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
 
 from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType, get_qdq_config, quantize
+from onnxruntime.quantization.quant_utils import get_opset_version
 
 
 class TestGetQDQConfig(unittest.TestCase):
@@ -335,8 +336,8 @@ def test_quantize_via_config_int16_opset_lt21_uses_native_qdq(self):
         qdq_model = onnx.load_model(qdq_model_path)
 
         # The quantized model must have been bumped to opset 21.
-        onnx_opset = next(x for x in qdq_model.opset_import if not x.domain or x.domain == "ai.onnx")
-        self.assertEqual(onnx_opset.version, 21)
+        onnx_opset_version = get_opset_version(qdq_model)
+        self.assertEqual(onnx_opset_version, 21)
 
         # All Q/DQ nodes must use the default ONNX domain (not com.microsoft).
         for node in qdq_model.graph.node:
@@ -415,8 +416,8 @@ def test_overrides_16bit_opset_lt21_bumps_opset_no_contrib_ops(self):
         qdq_model = onnx.load_model(qdq_model_path)
 
         # The quantized model must have been bumped to opset 21.
-        onnx_opset = next(x for x in qdq_model.opset_import if not x.domain or x.domain == "ai.onnx")
-        self.assertEqual(onnx_opset.version, 21)
+        onnx_opset_version = get_opset_version(qdq_model)
+        self.assertEqual(onnx_opset_version, 21)
 
         # All Q/DQ nodes must use the default ONNX domain (not com.microsoft).
         for node in qdq_model.graph.node:
@@ -465,6 +466,24 @@ def test_overrides_mixed_16bit_4bit_opset_lt21_no_contrib_ops(self):
         # making native Q/DQ ops sufficient for all types including the 4-bit one.
         self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False))
 
+        qdq_model_path = os.path.join(self._tmp_dir_path, "add_mixed_16bit_4bit_opset18_qdq.onnx")
+        quantize(float_model, qdq_model_path, qdq_config)
+
+        qdq_model = onnx.load_model(qdq_model_path)
+
+        # The quantized model must have been bumped to opset >= 21.
+        onnx_opset_version = get_opset_version(qdq_model)
+        self.assertGreaterEqual(onnx_opset_version, 21)
+
+        # All Q/DQ nodes must use the default ONNX domain (not com.microsoft).
+        for node in qdq_model.graph.node:
+            if node.op_type in ("QuantizeLinear", "DequantizeLinear"):
+                self.assertEqual(
+                    node.domain,
+                    "",
+                    f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
+                )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py
index 6a402cd7ae96a..27500679b496a 100644
--- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py
+++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py
@@ -15,7 +15,7 @@
 
 from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantType, quantize_static
 from onnxruntime.quantization.execution_providers.qnn import get_qnn_qdq_config
-from onnxruntime.quantization.quant_utils import compute_scale_zp, get_qmin_qmax_for_qType, ms_domain
+from onnxruntime.quantization.quant_utils import compute_scale_zp, get_opset_version, get_qmin_qmax_for_qType
 
 
 class DummyDataReader(CalibrationDataReader):
@@ -466,15 +466,17 @@ def test_16bit_overrides_bump_opset_to_21(self):
 
         # The model opset should have been auto-bumped to >= 21
         qdq_model = onnx.load_model(qdq_model_name)
-        ai_onnx_opset = next(
-            opset.version for opset in qdq_model.opset_import if not opset.domain or opset.domain == "ai.onnx"
-        )
+        ai_onnx_opset = get_opset_version(qdq_model)
         self.assertGreaterEqual(ai_onnx_opset, 21)
 
         # Q/DQ ops should be in the default domain (NOT 'com.microsoft')
         for node in qdq_model.graph.node:
             if node.op_type in {"QuantizeLinear", "DequantizeLinear"}:
-                self.assertNotEqual(node.domain, ms_domain)
+                self.assertEqual(
+                    node.domain,
+                    "",
+                    f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
+                )
 
     def test_16bit_overrides_not_set_ms_domain(self):
         """
@@ -502,11 +504,15 @@ def test_16bit_overrides_not_set_ms_domain(self):
         # Output should the default uint8 type
         self.assertEqual(out_zp.data_type, onnx.TensorProto.UINT8)
 
-        # Q/DQ ops should all have the 'com.microsoft' domain
+        # Q/DQ ops should be in the default domain (NOT 'com.microsoft')
         qdq_model = onnx.load_model(qdq_model_name)
         for node in qdq_model.graph.node:
             if node.op_type in {"QuantizeLinear", "DequantizeLinear"}:
-                self.assertNotEqual(node.domain, ms_domain)
+                self.assertEqual(
+                    node.domain,
+                    "",
+                    f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
+                )
 
     def test_16bit_convert_quant_type_bumps_opset_to_21(self):
         """
@@ -538,15 +544,17 @@ def test_16bit_convert_quant_type_bumps_opset_to_21(self):
 
         # The model opset should have been auto-bumped to >= 21 due to convert.quant_type = QInt16
         qdq_model = onnx.load_model(qdq_model_name)
-        ai_onnx_opset = next(
-            opset.version for opset in qdq_model.opset_import if not opset.domain or opset.domain == "ai.onnx"
-        )
+        ai_onnx_opset = get_opset_version(qdq_model)
         self.assertGreaterEqual(ai_onnx_opset, 21)
 
         # Q/DQ ops should be in the default domain (NOT 'com.microsoft')
         for node in qdq_model.graph.node:
             if node.op_type in {"QuantizeLinear", "DequantizeLinear"}:
-                self.assertNotEqual(node.domain, ms_domain)
+                self.assertEqual(
+                    node.domain,
+                    "",
+                    f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
+                )
 
     def test_override_validation_nonexisting_tensor(self):
         """