From cb43b7c75fbcfaf197f1acad54c62145c6029974 Mon Sep 17 00:00:00 2001 From: Max Buckley Date: Wed, 20 May 2026 16:18:17 +0200 Subject: [PATCH 1/5] [CoreML EP] Support bool Cast in ML Program Two changes to the ML Program Cast builder: 1. Accept BOOL as a source and target dtype in HasSupportedInputsImpl. The ML Program `cast` op already handles bool, and AddToModelBuilderImpl already maps `to == BOOL`; only the input/output type gate omitted it. This lets int64<->bool<->float casts (transformer attention-mask graphs) stay on CoreML. 2. Move the "no preceding node" check after the ML Program early-return. It was legacy gating for the NeuralNetwork ArgMax-only path (which dereferences InputEdgesBegin()); on the ML Program path a Cast fed directly by a graph input is fine, and rejecting it forced needless CPU fallback. Tests (coreml_basic_test.cc): - CastBoolRoundTrip_MLProgram: an int64->bool->float cast chain runs fully on CoreML and matches the CPU reference. The bool tensor is internal (a CoreML partition cannot have bool I/O) and the first Cast is graph-input fed. - CastNonArgMaxNeuralNetworkNotSupported: the same chain falls back to CPU on the NeuralNetwork format, guarding the IsOpSupportedImpl reordering. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../coreml/builders/impl/cast_op_builder.cc | 18 +++-- .../providers/coreml/coreml_basic_test.cc | 69 +++++++++++++++++++ 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc index e0665f5c2a5ec..890d2b12db917 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc @@ -77,15 +77,19 @@ Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model bool CastOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { + if (input_params.create_mlprogram) { + // The ML Program 'cast' op stands alone, so a Cast fed directly by a graph + // input (no preceding node) is fine here. + return true; + } + + // The NeuralNetwork path only supports a Cast that consumes an ArgMax, so it + // needs a preceding node to inspect (InputEdgesBegin() must be dereferenceable). 
if (node.GetInputEdgesCount() == 0) { LOGS(logger, VERBOSE) << "Cast has no preceding nodes."; return false; } - if (input_params.create_mlprogram) { - return true; - } - const auto& prec_node = node.InputEdgesBegin()->GetNode(); /*Cast node is only aimed for supporting argmax and we are only handling the case where an argmax @@ -135,11 +139,13 @@ bool CastOpBuilder::HasSupportedInputsImpl(const Node& node, [[maybe_unused]] co if ((input_type == ONNX_NAMESPACE::TensorProto_DataType_INT32 || input_type == ONNX_NAMESPACE::TensorProto_DataType_INT64 || input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT || - input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) && + input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 || + input_type == ONNX_NAMESPACE::TensorProto_DataType_BOOL) && (output_type == ONNX_NAMESPACE::TensorProto_DataType_INT32 || output_type == ONNX_NAMESPACE::TensorProto_DataType_INT64 || output_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT || - output_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16)) { + output_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 || + output_type == ONNX_NAMESPACE::TensorProto_DataType_BOOL)) { return true; } else { LOGS(logger, VERBOSE) << "[" << node.OpType() diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index b6e1545d6f319..01e82cf7fdf30 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -1911,6 +1911,75 @@ TEST(CoreMLExecutionProviderTest, Split11SingleOutputNotSupported) { TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::None); } +namespace { +// int64 -> Cast(bool) -> Cast(float) round-trip. 
The bool tensor stays +// internal to the CoreML partition (a partition cannot have bool I/O), and +// the first Cast is fed directly by a graph input -- so this exercises both +// the new bool dtype support and acceptance of a Cast with no preceding node. +std::string MakeCastBoolModelData() { + onnxruntime::Model model("cast_bool_test", false, DefaultLoggingManager().DefaultLogger()); + auto& graph = model.MainGraph(); + + auto make_type = [](int32_t elem_type) { + ONNX_NAMESPACE::TypeProto t; + t.mutable_tensor_type()->set_elem_type(elem_type); + for (int64_t d : {1, 4}) t.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(d); + return t; + }; + const auto int64_type = make_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + const auto bool_type = make_type(ONNX_NAMESPACE::TensorProto_DataType_BOOL); + const auto float_type = make_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + auto& x = graph.GetOrCreateNodeArg("X", &int64_type); + auto& b = graph.GetOrCreateNodeArg("B", &bool_type); + auto& y = graph.GetOrCreateNodeArg("Y", &float_type); + + auto& to_bool = graph.AddNode("cast_to_bool", "Cast", "int64 -> bool", {&x}, {&b}); + to_bool.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_BOOL)); + auto& to_float = graph.AddNode("cast_to_float", "Cast", "bool -> float", {&b}, {&y}); + to_float.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); + + ORT_THROW_IF_ERROR(graph.Resolve()); + std::string model_data; + model.ToProto().SerializeToString(&model_data); + return model_data; +} +} // namespace + +// ML Program Cast supports bool as both a source and a target dtype. 
+TEST(CoreMLExecutionProviderTest, CastBoolRoundTrip_MLProgram) { + const std::string model_data = MakeCastBoolModelData(); + gsl::span model_span{reinterpret_cast(model_data.data()), + model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 4}; + std::vector values = {0, 5, 0, -3}; // -> bool {F,T,F,T} -> float {0,1,0,1} + OrtValue x_val; + CreateMLValue(CPUAllocator::DefaultInstance(), dims, values, &x_val); + NameMLValMap feeds; + feeds.insert(std::make_pair("X", x_val)); + + EPVerificationParams params{}; + params.ep_node_assignment = ExpectedEPNodeAssignment::All; + RunAndVerifyOutputsWithEP(model_span, CurrentTestName(), + MakeCoreMLExecutionProvider("MLProgram"), feeds, params); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + +// On the NeuralNetwork format the Cast builder only supports a Cast that +// consumes an ArgMax, so these graph-input / Cast-fed Casts must fall back to +// CPU. Guards the IsOpSupportedImpl reordering that moved the preceding-node +// check into the NeuralNetwork branch. +TEST(CoreMLExecutionProviderTest, CastNonArgMaxNeuralNetworkNotSupported) { + const std::string model_data = MakeCastBoolModelData(); + gsl::span model_span{reinterpret_cast(model_data.data()), + model_data.size()}; + TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::None); +} + #endif // !(ORT_MINIMAL_BUILD) } // namespace test } // namespace onnxruntime From a1240877ff3dd010c9a8b4dbadf6cafbb857f8fc Mon Sep 17 00:00:00 2001 From: Max Buckley Date: Thu, 21 May 2026 09:34:33 +0200 Subject: [PATCH 2/5] [CoreML EP] Drop the standalone bool-Cast round-trip test CastBoolRoundTrip_MLProgram exercised int64 -> Cast(bool) -> Cast(float). 
CoreML's compiler fuses the two back-to-back `cast` ops and drops the bool clamp (cast(cast(x,bool),fp32) collapses to cast(x,fp32)), so the round-trip produces the raw input value instead of 0/1 -- the test can't be numerically verified standalone. The bool-Cast support itself is correct: it is exercised end to end by the dependent PRs, where a non-Cast op sits between the int<->bool casts so no fusion occurs -- Cast->And->Cast (Where/And PR) and Cast->GatherND->Cast (GatherND PR), both numerically verified against the CPU EP. CastNonArgMaxNeuralNetworkNotSupported (the NeuralNetwork-format negative test) is kept; it guards the IsOpSupportedImpl reordering. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../providers/coreml/coreml_basic_test.cc | 33 ++++--------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index 01e82cf7fdf30..35c33edb7ece6 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -1912,10 +1912,12 @@ TEST(CoreMLExecutionProviderTest, Split11SingleOutputNotSupported) { } namespace { -// int64 -> Cast(bool) -> Cast(float) round-trip. The bool tensor stays -// internal to the CoreML partition (a partition cannot have bool I/O), and -// the first Cast is fed directly by a graph input -- so this exercises both -// the new bool dtype support and acceptance of a Cast with no preceding node. +// int64 -> Cast(bool) -> Cast(float); the first Cast is fed directly by a +// graph input (no preceding node). Used by the NeuralNetwork negative test +// below. Positive bool-Cast coverage lives in the dependent Where/And and +// GatherND PRs, where a non-Cast op sits between the int<->bool casts -- a +// standalone bool round-trip can't be numerically verified here because +// CoreML fuses back-to-back cast ops (dropping the bool clamp). 
std::string MakeCastBoolModelData() { onnxruntime::Model model("cast_bool_test", false, DefaultLoggingManager().DefaultLogger()); auto& graph = model.MainGraph(); @@ -1946,29 +1948,6 @@ std::string MakeCastBoolModelData() { } } // namespace -// ML Program Cast supports bool as both a source and a target dtype. -TEST(CoreMLExecutionProviderTest, CastBoolRoundTrip_MLProgram) { - const std::string model_data = MakeCastBoolModelData(); - gsl::span model_span{reinterpret_cast(model_data.data()), - model_data.size()}; - -#if defined(__APPLE__) - std::vector dims = {1, 4}; - std::vector values = {0, 5, 0, -3}; // -> bool {F,T,F,T} -> float {0,1,0,1} - OrtValue x_val; - CreateMLValue(CPUAllocator::DefaultInstance(), dims, values, &x_val); - NameMLValMap feeds; - feeds.insert(std::make_pair("X", x_val)); - - EPVerificationParams params{}; - params.ep_node_assignment = ExpectedEPNodeAssignment::All; - RunAndVerifyOutputsWithEP(model_span, CurrentTestName(), - MakeCoreMLExecutionProvider("MLProgram"), feeds, params); -#else - TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); -#endif -} - // On the NeuralNetwork format the Cast builder only supports a Cast that // consumes an ArgMax, so these graph-input / Cast-fed Casts must fall back to // CPU. Guards the IsOpSupportedImpl reordering that moved the preceding-node From 202825b36fd3ff657f2cd9a69d1302aa570f7f91 Mon Sep 17 00:00:00 2001 From: Max Buckley Date: Wed, 27 May 2026 15:11:50 +0100 Subject: [PATCH 3/5] Add CastBoolMLProgramPartition load-time test yuslepukhin asked on PR #28595 for a positive ML-Program-side test that confirms the partitioner claims the bool Cast nodes, even though we can't numerically verify the round-trip (CoreML fuses back-to-back cast ops and drops the bool clamp, so a value-checking test would silently pass even when the bool dtype is ignored). 
Adds CastBoolMLProgramPartition, a sibling to CastNonArgMaxNeuralNetworkNotSupported: same MakeCastBoolModelData() graph, but with TestModelLoad + MakeCoreMLExecutionProvider("MLProgram") and ExpectedEPNodeAssignment::All. Together the two tests guard: - HasSupportedInputsImpl now accepts bool (positive), - the "no preceding node" rejection now only applies to NeuralNetwork (negative). Positive numerical coverage continues to live in the dependent #28597 (Where/And) and #28598 (GatherND) PRs, where a non-Cast op sits between the int<->bool casts. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../test/providers/coreml/coreml_basic_test.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index eca4ece912495..1d8576aa0f7df 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -2408,6 +2408,20 @@ TEST(CoreMLExecutionProviderTest, CastNonArgMaxNeuralNetworkNotSupported) { TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::None); } +// Load-time partition check on the ML Program path: confirms the EP claims +// both bool Casts (the relaxed "no preceding node" branch + the bool dtype +// gate added in HasSupportedInputsImpl). Numerical verification isn't +// possible here because CoreML fuses back-to-back cast ops and drops the +// bool clamp; the positive numerical coverage lives in the dependent +// Where/And (#28597) and GatherND (#28598) PRs, where a non-Cast op sits +// between the int<->bool casts. 
+TEST(CoreMLExecutionProviderTest, CastBoolMLProgramPartition) { + const std::string model_data = MakeCastBoolModelData(); + gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), + model_data.size()}; + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +} + TEST(CoreMLExecutionProviderTest, GatherScalarIndicesAxis1) { // ai.onnx:Gather with rank-0 (scalar) 'indices'. ONNX output rank = // data_rank + indices_rank - 1 = 2. The CoreML builder internally promotes From 27383f7a6bfc75931a15703c768f329d6ab7818d Mon Sep 17 00:00:00 2001 From: Max Buckley Date: Fri, 29 May 2026 21:25:16 +0100 Subject: [PATCH 4/5] Fix CastBoolMLProgramPartition: append non-trivial op so partition survives The test built an int64->Cast(bool)->Cast(float) graph and asserted ExpectedEPNodeAssignment::All on the ML Program path, but Cast is marked IsTrivial and GetCapability drops any partition made up entirely of trivial ops. The all-Cast partition was therefore dropped (0 nodes on CoreML), failing the assertion on the arm64 CoreML runner. Append a non-trivial Sqrt to the graph (via a new append_nontrivial flag on MakeCastBoolModelData) so the partition is retained, letting the test assert that both bool Casts are actually claimed by the EP. The NeuralNetwork negative test keeps using the pure all-Cast graph (default flag) and is unchanged.
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../providers/coreml/coreml_basic_test.cc | 40 +++++++++++++------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index 1d8576aa0f7df..7db419592bf8f 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -2361,13 +2361,20 @@ TEST(CoreMLExecutionProviderTest, Split11SingleOutputNotSupported) { } namespace { -// int64 -> Cast(bool) -> Cast(float); the first Cast is fed directly by a -// graph input (no preceding node). Used by the NeuralNetwork negative test -// below. Positive bool-Cast coverage lives in the dependent Where/And and -// GatherND PRs, where a non-Cast op sits between the int<->bool casts -- a -// standalone bool round-trip can't be numerically verified here because -// CoreML fuses back-to-back cast ops (dropping the bool clamp). -std::string MakeCastBoolModelData() { +// int64 -> Cast(bool) -> Cast(float) [-> Sqrt]; the first Cast is fed directly +// by a graph input (no preceding node). +// +// With append_nontrivial=false this is the all-Cast graph used by the +// NeuralNetwork negative test below. With append_nontrivial=true a non-trivial +// Sqrt is appended so the ML Program partition survives the all-trivial drop in +// CoreMLExecutionProvider::GetCapability (Cast is marked IsTrivial, and a +// partition made up only of trivial ops is dropped because it can't amortise +// the CPU<->CoreML marshalling cost). That lets the partition test below assert +// the bool Casts are actually claimed. A standalone bool round-trip still can't +// be verified numerically here because CoreML fuses back-to-back cast ops +// (dropping the bool clamp); positive numerical coverage lives in the dependent +// Where/And (#28597) and GatherND (#28598) PRs. 
+std::string MakeCastBoolModelData(bool append_nontrivial = false) { onnxruntime::Model model("cast_bool_test", false, DefaultLoggingManager().DefaultLogger()); auto& graph = model.MainGraph(); @@ -2390,6 +2397,11 @@ std::string MakeCastBoolModelData() { auto& to_float = graph.AddNode("cast_to_float", "Cast", "bool -> float", {&b}, {&y}); to_float.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); + if (append_nontrivial) { + auto& z = graph.GetOrCreateNodeArg("Z", &float_type); + graph.AddNode("sqrt", "Sqrt", "float -> float", {&y}, {&z}); + } + ORT_THROW_IF_ERROR(graph.Resolve()); std::string model_data; model.ToProto().SerializeToString(&model_data); @@ -2410,13 +2422,15 @@ TEST(CoreMLExecutionProviderTest, CastNonArgMaxNeuralNetworkNotSupported) { // Load-time partition check on the ML Program path: confirms the EP claims // both bool Casts (the relaxed "no preceding node" branch + the bool dtype -// gate added in HasSupportedInputsImpl). Numerical verification isn't -// possible here because CoreML fuses back-to-back cast ops and drops the -// bool clamp; the positive numerical coverage lives in the dependent -// Where/And (#28597) and GatherND (#28598) PRs, where a non-Cast op sits -// between the int<->bool casts. +// gate added in HasSupportedInputsImpl). A non-trivial Sqrt is appended so the +// partition isn't dropped as all-trivial (see MakeCastBoolModelData); all three +// nodes -- both Casts and the Sqrt -- must land on CoreML. Numerical +// verification isn't possible here because CoreML fuses back-to-back cast ops +// and drops the bool clamp; the positive numerical coverage lives in the +// dependent Where/And (#28597) and GatherND (#28598) PRs, where a non-Cast op +// sits between the int<->bool casts. 
TEST(CoreMLExecutionProviderTest, CastBoolMLProgramPartition) { - const std::string model_data = MakeCastBoolModelData(); + const std::string model_data = MakeCastBoolModelData(/*append_nontrivial=*/true); gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); From 0d556b3f362eaff50cb1fe097c26ba4fe44df7ca Mon Sep 17 00:00:00 2001 From: Max Buckley Date: Sat, 30 May 2026 11:10:09 +0100 Subject: [PATCH 5/5] [CoreML EP] Trim cast-bool test comments Drop the cross-PR references and internal-implementation context from the MakeCastBoolModelData / CastBoolMLProgramPartition comments; keep just the self-contained explanation of the append_nontrivial flag and the partition assertion. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../providers/coreml/coreml_basic_test.cc | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index b0fb66129349f..77f43b60dd6f8 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -2364,16 +2364,11 @@ namespace { // int64 -> Cast(bool) -> Cast(float) [-> Sqrt]; the first Cast is fed directly // by a graph input (no preceding node). // -// With append_nontrivial=false this is the all-Cast graph used by the -// NeuralNetwork negative test below. With append_nontrivial=true a non-trivial -// Sqrt is appended so the ML Program partition survives the all-trivial drop in -// CoreMLExecutionProvider::GetCapability (Cast is marked IsTrivial, and a -// partition made up only of trivial ops is dropped because it can't amortise -// the CPU<->CoreML marshalling cost). That lets the partition test below assert -// the bool Casts are actually claimed. 
A standalone bool round-trip still can't -// be verified numerically here because CoreML fuses back-to-back cast ops -// (dropping the bool clamp); positive numerical coverage lives in the dependent -// Where/And (#28597) and GatherND (#28598) PRs. +// append_nontrivial=false gives the all-Cast graph used by the NeuralNetwork +// negative test below. append_nontrivial=true appends a Sqrt: a CoreML partition +// made up only of trivial ops (Cast is marked trivial) is dropped, so the extra +// non-trivial op keeps the partition and lets the test below assert the bool +// Casts are claimed. std::string MakeCastBoolModelData(bool append_nontrivial = false) { onnxruntime::Model model("cast_bool_test", false, DefaultLoggingManager().DefaultLogger()); auto& graph = model.MainGraph(); @@ -2444,15 +2439,10 @@ TEST(CoreMLExecutionProviderTest, CastNonArgMaxNeuralNetworkNotSupported) { TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::None); } -// Load-time partition check on the ML Program path: confirms the EP claims -// both bool Casts (the relaxed "no preceding node" branch + the bool dtype -// gate added in HasSupportedInputsImpl). A non-trivial Sqrt is appended so the -// partition isn't dropped as all-trivial (see MakeCastBoolModelData); all three -// nodes -- both Casts and the Sqrt -- must land on CoreML. Numerical -// verification isn't possible here because CoreML fuses back-to-back cast ops -// and drops the bool clamp; the positive numerical coverage lives in the -// dependent Where/And (#28597) and GatherND (#28598) PRs, where a non-Cast op -// sits between the int<->bool casts. +// Load-time partition check on the ML Program path: confirms the EP claims both +// bool Casts. A non-trivial Sqrt is appended so the partition isn't dropped as +// all-trivial (see MakeCastBoolModelData); all three nodes -- both Casts and the +// Sqrt -- must land on CoreML. 
TEST(CoreMLExecutionProviderTest, CastBoolMLProgramPartition) { const std::string model_data = MakeCastBoolModelData(/*append_nontrivial=*/true); gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()),