Skip to content
46 changes: 45 additions & 1 deletion onnxruntime/python/tools/quantization/quant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -965,10 +965,45 @@ def get_opset_version(model: ModelProto) -> int:
return opset_version


def update_opset_version(model: ModelProto, weight_type: QuantType) -> ModelProto:
def update_opset_version(
model: ModelProto,
weight_type: QuantType,
activation_type: QuantType | None = None,
tensor_quant_overrides: dict | None = None,
) -> ModelProto:
opset_version = get_opset_version(model)
target_opset_version = opset_version
weight_quant_type = getattr(weight_type, "tensor_type", weight_type)
activation_quant_type = (
getattr(activation_type, "tensor_type", activation_type) if activation_type is not None else None
)

_int16_types = (onnx.TensorProto.UINT16, onnx.TensorProto.INT16)
needs_opset21_for_16bit = weight_quant_type in _int16_types or activation_quant_type in _int16_types
Comment thread
tianleiwu marked this conversation as resolved.
Comment thread
tianleiwu marked this conversation as resolved.

# Also check TensorQuantOverrides for any 16-bit types, including per-override convert.quant_type.
# Validation of structure is deferred to TensorQuantOverridesHelper.is_valid(); skip bump heuristic on malformed input.
if not needs_opset21_for_16bit and tensor_quant_overrides:
_int16_quant_types = {QuantType.QInt16, QuantType.QUInt16}
try:
for overrides_list in tensor_quant_overrides.values():
for override in overrides_list:
qt = override.get("quant_type")
if qt in _int16_quant_types:
needs_opset21_for_16bit = True
break
convert = override.get("convert")
if convert is not None:
convert_qt = convert.get("quant_type")
if convert_qt in _int16_quant_types:
needs_opset21_for_16bit = True
break
if needs_opset21_for_16bit:
break
except (AttributeError, TypeError):
Comment thread
github-advanced-security[bot] marked this conversation as resolved.
Fixed
Comment thread
tianleiwu marked this conversation as resolved.
# Malformed overrides; structural validation is deferred to
# TensorQuantOverridesHelper.is_valid(). Skip bump heuristic.
logging.debug("Skipping 16-bit opset bump heuristic for TensorQuantOverrides: structure not as expected.")

if opset_version < 19 and weight_quant_type == onnx.TensorProto.FLOAT8E4M3FN:
logging.warning(
Expand All @@ -978,6 +1013,15 @@ def update_opset_version(model: ModelProto, weight_type: QuantType) -> ModelProt
)
target_opset_version = 19

elif opset_version < 21 and needs_opset21_for_16bit:
logging.warning(
f"The original model opset version is {opset_version}, which does not support 16-bit integer "
"quantization natively. "
"Please update the model to opset >= 21. Automatically update the model to opset 21. "
"Please verify the quantized model."
Comment thread
tianleiwu marked this conversation as resolved.
)
target_opset_version = 21
Comment thread
tianleiwu marked this conversation as resolved.

elif opset_version == 10:
logging.warning(
f"The original model opset version is {opset_version}, which does not support node fusions. "
Expand Down
34 changes: 25 additions & 9 deletions onnxruntime/python/tools/quantization/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
QuantFormat,
QuantizationMode,
QuantType,
get_opset_version,
load_model_with_shape_infer,
model_has_pre_process_metadata,
save_and_reload_model_with_shape_infer,
Expand Down Expand Up @@ -369,14 +370,24 @@ def get_qdq_config(
}
final_extra_options.update(calib_extra_options)

# ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
# on Q/DQ operators if using 16-bit or 4-bit quantization.
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
if onnx_opset.version < 21:
opset21_types = q16_types.union(q4_types)
overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
final_extra_options["UseQDQContribOps"] = True
# ONNX opset < 21 does not support 4-bit quantization natively, so must use 'com.microsoft' domain
# on Q/DQ operators if using 4-bit quantization. 16-bit weight/activation types are excluded here
# because quantize_static() will automatically bump the model opset to 21, where native ONNX
# QuantizeLinear/DequantizeLinear supports INT16/UINT16 and INT4/UINT4 without contrib-domain ops.
# 16-bit types in TensorQuantOverrides also trigger the same opset bump, so a mixed 16-bit + 4-bit
# override config will be served at opset 21 where neither type needs contrib ops.
onnx_opset_version = get_opset_version(model)
if onnx_opset_version < 21:
override_types = overrides_helper.get_quant_types()
overrides_have_16bit = any(t in q16_types for t in override_types)
# If any 16-bit type is present (top-level or override), quantize_static() will bump the
# model to opset 21, making contrib ops unnecessary for all types.
will_bump_to_opset21 = activation_type in q16_types or weight_type in q16_types or overrides_have_16bit
if not will_bump_to_opset21:
overrides_have_q4_types = any(t in q4_types for t in override_types)
needs_contrib_ops = activation_type in q4_types or weight_type in q4_types or overrides_have_q4_types
if needs_contrib_ops:
final_extra_options["UseQDQContribOps"] = True

# Allow user's extra_options to override our final_extra_options.
if extra_options:
Expand Down Expand Up @@ -699,7 +710,12 @@ def inc_dataloader():
nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration

updated_model = update_opset_version(model, weight_type)
updated_model = update_opset_version(
model,
weight_type,
activation_type,
tensor_quant_overrides=(extra_options or {}).get("TensorQuantOverrides"),
)
Comment thread
tianleiwu marked this conversation as resolved.
is_model_updated = updated_model is not model
if is_model_updated:
model = updated_model
Expand Down
163 changes: 159 additions & 4 deletions onnxruntime/test/python/quantization/test_get_qdq_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count

from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType, get_qdq_config, quantize
from onnxruntime.quantization.quant_utils import get_opset_version


class TestGetQDQConfig(unittest.TestCase):
Expand Down Expand Up @@ -271,10 +272,12 @@ def test_external_data(self):
self.assertIsNotNone(weight_quantized)
self.assertEqual(weight_quantized.data_location, onnx.TensorProto.EXTERNAL)

def test_use_qdq_contrib_ops_for_int16_opset19(self):
def test_no_qdq_contrib_ops_for_int16_opset_lt21(self):
"""
Test that get_qdq_config() returns a config that forces 'com.microsoft' Q/DQ ops for
use of int16 in opset < 21.
Test that get_qdq_config() does NOT set UseQDQContribOps for int16 types even when
the model opset is < 21. quantize_static() will bump the opset to 21 automatically,
where native ONNX QuantizeLinear/DequantizeLinear supports INT16/UINT16, so contrib-
domain ops are not needed.
"""

shape = [1, 8, 8]
Expand All @@ -297,7 +300,53 @@ def test_use_qdq_contrib_ops_for_int16_opset19(self):
)

self.assertEqual(qdq_config.activation_type, QuantType.QUInt16)
self.assertTrue(qdq_config.extra_options["UseQDQContribOps"])
# UseQDQContribOps must NOT be auto-set for 16-bit types; the opset bump handles them.
self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False))

def test_quantize_via_config_int16_opset_lt21_uses_native_qdq(self):
    """
    Run the config-based quantize() path with a uint16 activation type on an opset-20 model
    and check the saved result: it must report opset 21, and every QuantizeLinear /
    DequantizeLinear node must live in the default ONNX domain instead of 'com.microsoft'.
    """

    dims = [1, 8, 8]
    elem_type = onnx.TensorProto.FLOAT
    dtype = onnx.helper.tensor_dtype_to_np_dtype(elem_type)
    weight_init = onnx.numpy_helper.from_array(np.ones(dims, dtype=dtype), "weight")
    # The source model is deliberately created below opset 21.
    source_model = self.build_add_model(dims, elem_type, weight_init, opset=20)

    # Two calibration samples: an all -2 tensor followed by an all +2 tensor.
    feeds = [{"input_0": np.ones(dims, dtype=dtype) * np.array(fill, dtype=dtype)} for fill in (-2, 2)]
    reader = TestDataFeeds(feeds)

    config = get_qdq_config(
        source_model,
        reader,
        activation_type=QuantType.QUInt16,
        weight_type=QuantType.QInt8,
    )

    output_path = os.path.join(self._tmp_dir_path, "add_int16_opset20_qdq.onnx")
    quantize(source_model, output_path, config)
    quantized = onnx.load_model(output_path)

    # The written model is expected to be at exactly opset 21.
    self.assertEqual(get_opset_version(quantized), 21)

    # No Q/DQ node may carry a non-default (contrib) domain.
    for node in quantized.graph.node:
        if node.op_type not in ("QuantizeLinear", "DequantizeLinear"):
            continue
        self.assertEqual(
            node.domain,
            "",
            f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
        )

def test_use_qdq_contrib_ops_for_int4_opset19(self):
"""
Expand Down Expand Up @@ -329,6 +378,112 @@ def test_use_qdq_contrib_ops_for_int4_opset19(self):
self.assertEqual(qdq_config.extra_options["TensorQuantOverrides"]["weight"][0]["quant_type"], QuantType.QInt4)
self.assertTrue(qdq_config.extra_options["UseQDQContribOps"])

def test_overrides_16bit_opset_lt21_bumps_opset_no_contrib_ops(self):
    """
    Regression test for a 16-bit type requested only through TensorQuantOverrides on an
    opset-18 model: get_qdq_config() must leave UseQDQContribOps unset, the quantized model
    must come out at opset 21, and all Q/DQ nodes must use the default ONNX domain.
    """

    dims = [1, 8, 8]
    elem_type = onnx.TensorProto.FLOAT
    dtype = onnx.helper.tensor_dtype_to_np_dtype(elem_type)
    weight_init = onnx.numpy_helper.from_array(np.ones(dims, dtype=dtype), "weight")
    # Start from opset 18 so that reaching opset 21 requires an upgrade.
    source_model = self.build_add_model(dims, elem_type, weight_init, opset=18)

    # Two calibration samples: an all -2 tensor followed by an all +2 tensor.
    feeds = [{"input_0": np.ones(dims, dtype=dtype) * np.array(fill, dtype=dtype)} for fill in (-2, 2)]
    reader = TestDataFeeds(feeds)

    # Top-level types stay 8-bit; only the "weight" override asks for QUInt16.
    config = get_qdq_config(
        source_model,
        reader,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QInt8,
        tensor_quant_overrides={"weight": [{"quant_type": QuantType.QUInt16}]},
    )

    # A missing key counts the same as an explicit False here.
    contrib_ops_enabled = config.extra_options.get("UseQDQContribOps", False)
    self.assertFalse(contrib_ops_enabled)

    output_path = os.path.join(self._tmp_dir_path, "add_override_uint16_opset18_qdq.onnx")
    quantize(source_model, output_path, config)
    quantized = onnx.load_model(output_path)

    # The written model is expected to be at exactly opset 21.
    self.assertEqual(get_opset_version(quantized), 21)

    # No Q/DQ node may carry a non-default (contrib) domain.
    for node in quantized.graph.node:
        if node.op_type not in ("QuantizeLinear", "DequantizeLinear"):
            continue
        self.assertEqual(
            node.domain,
            "",
            f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
        )

def test_overrides_mixed_16bit_4bit_opset_lt21_no_contrib_ops(self):
    """
    Regression test: when TensorQuantOverrides contain both a 16-bit type (for one tensor) and
    a 4-bit type (for another tensor) on a model whose opset is < 21, UseQDQContribOps must NOT
    be set, the quantized model must be at opset 21, and all Q/DQ nodes must use the default
    ONNX domain.
    """

    shape = [1, 8, 8]
    tensor_type = onnx.TensorProto.FLOAT
    np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type)
    weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight")
    # Build a model at opset 18 (< 21).
    float_model = self.build_add_model(shape, tensor_type, weight, opset=18)

    input_data_list = [
        {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)},
        {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)},
    ]
    data_reader = TestDataFeeds(input_data_list)

    # Override: weight uses QUInt16 (16-bit), input_0 uses QInt4 (4-bit). Top-level types are 8-bit.
    qdq_config = get_qdq_config(
        float_model,
        data_reader,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QInt8,
        tensor_quant_overrides={
            "weight": [{"quant_type": QuantType.QUInt16}],
            "input_0": [{"quant_type": QuantType.QInt4}],
        },
    )

    # UseQDQContribOps must NOT be set even though a 4-bit override is present, because the
    # 16-bit override is expected to move the model to opset 21.
    self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False))

    qdq_model_path = os.path.join(self._tmp_dir_path, "add_mixed_16bit_4bit_opset18_qdq.onnx")
    quantize(float_model, qdq_model_path, qdq_config)

    qdq_model = onnx.load_model(qdq_model_path)

    # The quantized model must have been bumped to exactly opset 21. Use a strict equality
    # check (as the sibling 16-bit tests do) so an unintended jump past 21 is also caught.
    onnx_opset_version = get_opset_version(qdq_model)
    self.assertEqual(onnx_opset_version, 21)

    # All Q/DQ nodes must use the default ONNX domain (not com.microsoft).
    for node in qdq_model.graph.node:
        if node.op_type in ("QuantizeLinear", "DequantizeLinear"):
            self.assertEqual(
                node.domain,
                "",
                f"Expected native ONNX domain for {node.op_type} but got '{node.domain}'",
            )


if __name__ == "__main__":
unittest.main()
45 changes: 45 additions & 0 deletions onnxruntime/test/python/quantization/test_quant_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
from onnx import TensorProto, helper, numpy_helper

from onnxruntime.quantization.quant_utils import (
QuantType,
compute_scale_zp,
load_model_with_shape_infer,
model_has_infer_metadata,
pack_bytes_to_4bit,
quantize_data,
update_opset_version,
)


Expand Down Expand Up @@ -173,6 +175,49 @@ def test_quantize_data_4bit(self):

self.assertEqual(numpy.array(actual_quant_val), expected_quant_val)

def test_update_opset_version_16bit(self):
    """
    Table-driven check of update_opset_version() for 16-bit quantization types.

    Covered cases, in order: a 16-bit weight type on opset 20 (expects 21), a 16-bit weight
    type on opset 21 (expects 21), a 16-bit activation type with an 8-bit weight on opset 20
    (expects 21), and 8-bit weight + activation on opset 20 (expects 20, i.e. unchanged).
    """
    empty_graph = helper.make_graph([], "test_graph", [], [])
    sixteen_bit = ((QuantType.QUInt16, "QUInt16"), (QuantType.QInt16, "QInt16"))

    # Each case: (subTest parameters, positional args after the model, starting opset, expected opset).
    cases = []
    for start_opset in (20, 21):
        # Only the weight type is passed; the remaining parameters keep their defaults.
        cases.extend(
            ({"weight_type": name, "opset": start_opset}, (qtype,), start_opset, 21) for qtype, name in sixteen_bit
        )
    cases.extend(
        ({"activation_type": name, "weight_type": "QInt8", "opset": 20}, (QuantType.QInt8, qtype), 20, 21)
        for qtype, name in sixteen_bit
    )
    cases.append(
        (
            {"weight_type": "QInt8", "activation_type": "QUInt8", "opset": 20},
            (QuantType.QInt8, QuantType.QUInt8),
            20,
            20,
        )
    )

    for params, quant_args, start_opset, expected_opset in cases:
        with self.subTest(**params):
            model = helper.make_model(empty_graph, opset_imports=[helper.make_opsetid("", start_opset)])
            updated = update_opset_version(model, *quant_args)
            # The model is built with a single opset import, so index 0 is the default domain.
            self.assertEqual(updated.opset_import[0].version, expected_opset)


if __name__ == "__main__":
unittest.main()
Loading
Loading