microsoft · tianleiwu · May 14, 2026 · Apr 23, 2026 · May 1, 2026 · May 3, 2026
diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py
@@ -965,10 +965,45 @@ def get_opset_version(model: ModelProto) -> int:
     return opset_version
 
 
-def update_opset_version(model: ModelProto, weight_type: QuantType) -> ModelProto:
+def update_opset_version(
+    model: ModelProto,
+    weight_type: QuantType,
+    activation_type: QuantType | None = None,
+    tensor_quant_overrides: dict | None = None,
+) -> ModelProto:
     opset_version = get_opset_version(model)
     target_opset_version = opset_version
     weight_quant_type = getattr(weight_type, "tensor_type", weight_type)
+    activation_quant_type = (
+        getattr(activation_type, "tensor_type", activation_type) if activation_type is not None else None
+    )
+
+    _int16_types = (onnx.TensorProto.UINT16, onnx.TensorProto.INT16)
+    needs_opset21_for_16bit = weight_quant_type in _int16_types or activation_quant_type in _int16_types
+
+    # Also check TensorQuantOverrides for any 16-bit types, including per-override convert.quant_type.
+    # Validation of structure is deferred to TensorQuantOverridesHelper.is_valid(); skip bump heuristic on malformed input.
+    if not needs_opset21_for_16bit and tensor_quant_overrides:
+        _int16_quant_types = {QuantType.QInt16, QuantType.QUInt16}
+        try:
+            for overrides_list in tensor_quant_overrides.values():
+                for override in overrides_list:
+                    qt = override.get("quant_type")
+                    if qt in _int16_quant_types:
+                        needs_opset21_for_16bit = True
+                        break
+                    convert = override.get("convert")
+                    if convert is not None:
+                        convert_qt = convert.get("quant_type")
+                        if convert_qt in _int16_quant_types:
+                            needs_opset21_for_16bit = True
+                            break
+                if needs_opset21_for_16bit:
+                    break
+        except (AttributeError, TypeError):
+            # Malformed overrides; structural validation is deferred to
+            # TensorQuantOverridesHelper.is_valid(). Skip bump heuristic.
+            logging.debug("Skipping 16-bit opset bump heuristic for TensorQuantOverrides: structure not as expected.")
 
     if opset_version < 19 and weight_quant_type == onnx.TensorProto.FLOAT8E4M3FN:
         logging.warning(
@@ -978,6 +1013,15 @@ def update_opset_version(model: ModelProto, weight_type: QuantType) -> ModelProt
         )
         target_opset_version = 19
 
+    elif opset_version < 21 and needs_opset21_for_16bit:
+        logging.warning(
+            f"The original model opset version is {opset_version}, which does not support 16-bit integer "
+            "quantization natively. "
+            "Please update the model to opset >= 21. Automatically update the model to opset 21. "
+            "Please verify the quantized model."
+        )
+        target_opset_version = 21
+
     elif opset_version == 10:
         logging.warning(
             f"The original model opset version is {opset_version}, which does not support node fusions. "

diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
@@ -22,6 +22,7 @@
     QuantFormat,
     QuantizationMode,
     QuantType,
+    get_opset_version,
     load_model_with_shape_infer,
     model_has_pre_process_metadata,
     save_and_reload_model_with_shape_infer,
@@ -369,14 +370,24 @@ def get_qdq_config(
         }
         final_extra_options.update(calib_extra_options)
 
-    # ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
-    # on Q/DQ operators if using 16-bit or 4-bit quantization.
-    onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
-    if onnx_opset.version < 21:
-        opset21_types = q16_types.union(q4_types)
-        overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
-        if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
-            final_extra_options["UseQDQContribOps"] = True
+    # ONNX opset < 21 does not support 4-bit quantization natively, so must use 'com.microsoft' domain
+    # on Q/DQ operators if using 4-bit quantization.  16-bit weight/activation types are excluded here
+    # because quantize_static() will automatically bump the model opset to 21, where native ONNX
+    # QuantizeLinear/DequantizeLinear supports INT16/UINT16 and INT4/UINT4 without contrib-domain ops.
+    # 16-bit types in TensorQuantOverrides also trigger the same opset bump, so a mixed 16-bit + 4-bit
+    # override config will be served at opset 21 where neither type needs contrib ops.
+    onnx_opset_version = get_opset_version(model)
+    if onnx_opset_version < 21:
+        override_types = overrides_helper.get_quant_types()
+        overrides_have_16bit = any(t in q16_types for t in override_types)
+        # If any 16-bit type is present (top-level or override), quantize_static() will bump the
+        # model to opset 21, making contrib ops unnecessary for all types.
+        will_bump_to_opset21 = activation_type in q16_types or weight_type in q16_types or overrides_have_16bit
+        if not will_bump_to_opset21:
+            overrides_have_q4_types = any(t in q4_types for t in override_types)
+            needs_contrib_ops = activation_type in q4_types or weight_type in q4_types or overrides_have_q4_types
+            if needs_contrib_ops:
+                final_extra_options["UseQDQContribOps"] = True
 
     # Allow user's extra_options to override our final_extra_options.
     if extra_options:
@@ -699,7 +710,12 @@ def inc_dataloader():
         nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
         model = load_model_with_shape_infer(Path(model_input))  # use smooth quant model for calibration
 
-    updated_model = update_opset_version(model, weight_type)
+    updated_model = update_opset_version(
+        model,
+        weight_type,
+        activation_type,
+        tensor_quant_overrides=(extra_options or {}).get("TensorQuantOverrides"),
+    )
     is_model_updated = updated_model is not model
     if is_model_updated:
         model = updated_model

diff --git a/onnxruntime/test/python/quantization/test_get_qdq_config.py b/onnxruntime/test/python/quantization/test_get_qdq_config.py
@@ -15,6 +15,7 @@
 from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
 
 from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType, get_qdq_config, quantize
+from onnxruntime.quantization.quant_utils import get_opset_version
 
 
 class TestGetQDQConfig(unittest.TestCase):
@@ -271,10 +272,12 @@ def test_external_data(self):
         self.assertIsNotNone(weight_quantized)
         self.assertEqual(weight_quantized.data_location, onnx.TensorProto.EXTERNAL)
 
-    def test_use_qdq_contrib_ops_for_int16_opset19(self):
+    def test_no_qdq_contrib_ops_for_int16_opset_lt21(self):
         """
-        Test that get_qdq_config() returns a config that forces 'com.microsoft' Q/DQ ops for
-        use of int16 in opset < 21.
+        Test that get_qdq_config() does NOT set UseQDQContribOps for int16 types even when
+        the model opset is < 21.  quantize_static() will bump the opset to 21 automatically,
+        where native ONNX QuantizeLinear/DequantizeLinear supports INT16/UINT16, so
+        contrib-domain ops are not needed.
         """
 
         shape = [1, 8, 8]
@@ -297,7 +300,53 @@ def test_use_qdq_contrib_ops_for_int16_opset19(self):
         )
 
         self.assertEqual(qdq_config.activation_type, QuantType.QUInt16)
-        self.assertTrue(qdq_config.extra_options["UseQDQContribOps"])
+        # UseQDQContribOps must NOT be auto-set for 16-bit types; the opset bump handles them.
+        self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False))
+
+    def test_quantize_via_config_int16_opset_lt21_uses_native_qdq(self):
+        """
+        Test that the config-based quantize() path produces a model at opset 21 using native
+        ONNX QuantizeLinear/DequantizeLinear (not com.microsoft domain) when int16 activation
+        types are requested on a model whose original opset is < 21.
+        """
+
+        shape = [1, 8, 8]
+        tensor_type = onnx.TensorProto.FLOAT
+        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type)
+        weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight")
+        # Build a float model at opset 20 (< 21); the 16-bit activation type is requested below.
+        float_model = self.build_add_model(shape, tensor_type, weight, opset=20)
+
+        input_data_list = [
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)},
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)},
+        ]
+        data_reader = TestDataFeeds(input_data_list)
+
+        qdq_config = get_qdq_config(
+            float_model,
+            data_reader,
+            activation_type=QuantType.QUInt16,
+            weight_type=QuantType.QInt8,
+        )
+
+        qdq_model_path = os.path.join(self._tmp_dir_path, "add_int16_opset20_qdq.onnx")
+        quantize(float_model, qdq_model_path, qdq_config)
+
+        qdq_model = onnx.load_model(qdq_model_path)
+
+        # The quantized model must have been bumped to opset 21.
+        onnx_opset_version = get_opset_version(qdq_model)
+        self.assertEqual(onnx_opset_version, 21)
+
+        # All Q/DQ nodes must use the default ONNX domain (not com.microsoft).
+        for node in qdq_model.graph.node:
+            if node.op_type in ("QuantizeLinear", "DequantizeLinear"):
+                self.assertEqual(
+                    node.domain,
+                    "",
+                    f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
+                )
 
     def test_use_qdq_contrib_ops_for_int4_opset19(self):
         """
@@ -329,6 +378,112 @@ def test_use_qdq_contrib_ops_for_int4_opset19(self):
         self.assertEqual(qdq_config.extra_options["TensorQuantOverrides"]["weight"][0]["quant_type"], QuantType.QInt4)
         self.assertTrue(qdq_config.extra_options["UseQDQContribOps"])
 
+    def test_overrides_16bit_opset_lt21_bumps_opset_no_contrib_ops(self):
+        """
+        Regression test: when TensorQuantOverrides request a 16-bit type on a model whose opset is
+        < 21, the quantized model must be bumped to opset 21 and UseQDQContribOps must NOT be set.
+        """
+
+        shape = [1, 8, 8]
+        tensor_type = onnx.TensorProto.FLOAT
+        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type)
+        weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight")
+        # Build a model at opset 18 (< 21) so the opset bump is required.
+        float_model = self.build_add_model(shape, tensor_type, weight, opset=18)
+
+        input_data_list = [
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)},
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)},
+        ]
+        data_reader = TestDataFeeds(input_data_list)
+
+        # Override the weight to use QUInt16 via TensorQuantOverrides; top-level types are 8-bit.
+        qdq_config = get_qdq_config(
+            float_model,
+            data_reader,
+            activation_type=QuantType.QUInt8,
+            weight_type=QuantType.QInt8,
+            tensor_quant_overrides={"weight": [{"quant_type": QuantType.QUInt16}]},
+        )
+
+        # UseQDQContribOps must NOT be set: the 16-bit override triggers an opset bump to 21,
+        # where native ONNX Q/DQ ops handle all types.
+        self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False))
+
+        qdq_model_path = os.path.join(self._tmp_dir_path, "add_override_uint16_opset18_qdq.onnx")
+        quantize(float_model, qdq_model_path, qdq_config)
+
+        qdq_model = onnx.load_model(qdq_model_path)
+
+        # The quantized model must have been bumped to opset 21.
+        onnx_opset_version = get_opset_version(qdq_model)
+        self.assertEqual(onnx_opset_version, 21)
+
+        # All Q/DQ nodes must use the default ONNX domain (not com.microsoft).
+        for node in qdq_model.graph.node:
+            if node.op_type in ("QuantizeLinear", "DequantizeLinear"):
+                self.assertEqual(
+                    node.domain,
+                    "",
+                    f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
+                )
+
+    def test_overrides_mixed_16bit_4bit_opset_lt21_no_contrib_ops(self):
+        """
+        Regression test: when TensorQuantOverrides contain both a 16-bit type (for one tensor) and
+        a 4-bit type (for another tensor) on a model whose opset is < 21, UseQDQContribOps must NOT
+        be set because the 16-bit override triggers an opset bump to 21 where all types are native.
+        """
+
+        shape = [1, 8, 8]
+        tensor_type = onnx.TensorProto.FLOAT
+        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type)
+        weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight")
+        # Build a model at opset 18 (< 21).
+        float_model = self.build_add_model(shape, tensor_type, weight, opset=18)
+
+        input_data_list = [
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)},
+            {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)},
+        ]
+        data_reader = TestDataFeeds(input_data_list)
+
+        # Override: weight uses QUInt16 (16-bit, triggers opset bump), input_0 uses QInt4 (4-bit).
+        # The presence of the 16-bit override means the model is bumped to opset 21, so native
+        # Q/DQ ops handle everything — UseQDQContribOps must NOT be set.
+        qdq_config = get_qdq_config(
+            float_model,
+            data_reader,
+            activation_type=QuantType.QUInt8,
+            weight_type=QuantType.QInt8,
+            tensor_quant_overrides={
+                "weight": [{"quant_type": QuantType.QUInt16}],
+                "input_0": [{"quant_type": QuantType.QInt4}],
+            },
+        )
+
+        # UseQDQContribOps must NOT be set: the 16-bit override triggers an opset bump to 21,
+        # making native Q/DQ ops sufficient for all types including the 4-bit one.
+        self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False))
+
+        qdq_model_path = os.path.join(self._tmp_dir_path, "add_mixed_16bit_4bit_opset18_qdq.onnx")
+        quantize(float_model, qdq_model_path, qdq_config)
+
+        qdq_model = onnx.load_model(qdq_model_path)
+
+        # The quantized model must have been bumped to opset 21 or higher.
+        onnx_opset_version = get_opset_version(qdq_model)
+        self.assertGreaterEqual(onnx_opset_version, 21)
+
+        # All Q/DQ nodes must use the default ONNX domain (not com.microsoft).
+        for node in qdq_model.graph.node:
+            if node.op_type in ("QuantizeLinear", "DequantizeLinear"):
+                self.assertEqual(
+                    node.domain,
+                    "",
+                    f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
+                )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py
@@ -15,11 +15,13 @@
 from onnx import TensorProto, helper, numpy_helper
 
 from onnxruntime.quantization.quant_utils import (
+    QuantType,
     compute_scale_zp,
     load_model_with_shape_infer,
     model_has_infer_metadata,
     pack_bytes_to_4bit,
     quantize_data,
+    update_opset_version,
 )
 
 
@@ -173,6 +175,49 @@ def test_quantize_data_4bit(self):
 
                     self.assertEqual(numpy.array(actual_quant_val), expected_quant_val)
 
+    def test_update_opset_version_16bit(self):
+        graph = helper.make_graph([], "test_graph", [], [])
+
+        # 16-bit weight type alone should auto-bump opset < 21 -> 21
+        for weight_type, label in (
+            (QuantType.QUInt16, "QUInt16"),
+            (QuantType.QInt16, "QInt16"),
+        ):
+            with self.subTest(weight_type=label, opset=20):
+                model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)])
+                result = update_opset_version(model, weight_type)
+                result_opset = result.opset_import[0].version
+                self.assertEqual(result_opset, 21)
+
+        # Already at opset 21 - should stay at 21
+        for weight_type, label in (
+            (QuantType.QUInt16, "QUInt16"),
+            (QuantType.QInt16, "QInt16"),
+        ):
+            with self.subTest(weight_type=label, opset=21):
+                model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 21)])
+                result = update_opset_version(model, weight_type)
+                result_opset = result.opset_import[0].version
+                self.assertEqual(result_opset, 21)
+
+        # 16-bit activation type with 8-bit weight should also bump opset < 21 -> 21
+        for activation_type, label in (
+            (QuantType.QUInt16, "QUInt16"),
+            (QuantType.QInt16, "QInt16"),
+        ):
+            with self.subTest(activation_type=label, weight_type="QInt8", opset=20):
+                model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)])
+                result = update_opset_version(model, QuantType.QInt8, activation_type)
+                result_opset = result.opset_import[0].version
+                self.assertEqual(result_opset, 21)
+
+        # Both 8-bit should NOT bump to 21; opset stays at 20
+        with self.subTest(weight_type="QInt8", activation_type="QUInt8", opset=20):
+            model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)])
+            result = update_opset_version(model, QuantType.QInt8, QuantType.QUInt8)
+            result_opset = result.opset_import[0].version
+            self.assertEqual(result_opset, 20)
+
 
 if __name__ == "__main__":
     unittest.main()