diff --git a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc index e0665f5c2a5ec..61b18859ba685 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc @@ -22,6 +22,12 @@ class CastOpBuilder : public BaseOpBuilder { public: bool SupportsMLProgram() const override { return true; } + + // Cast is shape-only data movement from CoreML's perspective: per-element + // dtype conversion that the marshalling overhead dominates for small + // tensors. CoreML claims it but a partition consisting only of Casts + // doesn't earn its own marshalling cost. + bool IsTrivial(const Node& /*node*/) const override { return true; } }; Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, diff --git a/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc index f0adb70587bcf..79589cd578ad1 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc @@ -17,6 +17,8 @@ class FlattenOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool IsTrivial(const Node& /*node*/) const override { return true; } }; Status FlattenOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, diff --git a/onnxruntime/core/providers/coreml/builders/impl/identity_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/identity_op_builder.cc new file mode 100644 index 0000000000000..57b15404d464b --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/identity_op_builder.cc @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" + +namespace onnxruntime { +namespace coreml { + +class IdentityOpBuilder : public BaseOpBuilder { + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } + + bool IsTrivial(const Node& /*node*/) const override { return true; } +}; + +Status IdentityOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& /*logger*/) const { + const auto& input_defs = node.InputDefs(); + const auto& output_def = *node.OutputDefs()[0]; + + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + auto op = model_builder.CreateOperation(node, "identity"); + AddOperationInput(*op, "x", input_defs[0]->Name()); + AddOperationOutput(*op, output_def); + model_builder.AddOperation(std::move(op)); + } else { + // NeuralNetwork: emulate via activation LINEAR(alpha=1, beta=0). 
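+    // (ActivationLinear computes f(x) = alpha * x + beta elementwise, so
+    //  alpha = 1, beta = 0 simply passes the input through unchanged.)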
+ auto layer = model_builder.CreateNNLayer(node); + auto* linear = layer->mutable_activation()->mutable_linear(); + linear->set_alpha(1.0f); + linear->set_beta(0.0f); + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_output()->Add() = output_def.Name(); + model_builder.AddLayer(std::move(layer)); + } + return Status::OK(); +} + +void CreateIdentityOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc index e3781ed7d388b..a926db94d1dc0 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc @@ -27,6 +27,8 @@ class ReshapeOpBuilder : public BaseOpBuilder { int GetMinSupportedOpSet(const Node& /* node */) const override { return 5; } bool SupportsMLProgram() const override { return true; } + + bool IsTrivial(const Node& /*node*/) const override { return true; } }; void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc index 92f0f2bb5fc3d..913639b30b409 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc @@ -29,6 +29,9 @@ class SqueezeOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; bool SupportsMLProgram() const override { return true; } + + // SqueezeOpBuilder handles both Squeeze and Unsqueeze; both are shape-only. + bool IsTrivial(const Node& /*node*/) const override { return true; } }; namespace { diff --git a/onnxruntime/core/providers/coreml/builders/impl/tile_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/tile_op_builder.cc new file mode 100644 index 0000000000000..40e69dd20cb82 --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/tile_op_builder.cc @@ -0,0 +1,156 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/optimizer/initializer.h" +#include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" +#include "core/providers/coreml/shape_utils.h" +#include "core/providers/shared/utils/utils.h" + +namespace onnxruntime { +namespace coreml { + +class TileOpBuilder : public BaseOpBuilder { + void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; + + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } + + bool IsTrivial(const Node& /*node*/) const override { return true; } +}; + +void TileOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { + // If 'repeats' is a constant initializer we bake it into the MIL constant + // and don't need the original to land in the model. If it's a runtime + // tensor the dynamic-shape MIL path consumes it directly. + if (model_builder.GetConstantInitializer(node.InputDefs()[1]->Name())) { + model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); + } +} + +Status TileOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& /*logger*/) const { + const auto& input_defs = node.InputDefs(); + const auto& output_def = *node.OutputDefs()[0]; + const auto* repeats_init = model_builder.GetConstantInitializer(input_defs[1]->Name()); + + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + auto op = model_builder.CreateOperation(node, "tile"); + AddOperationInput(*op, "x", input_defs[0]->Name()); + if (repeats_init) { + Initializer unpacked(model_builder.GetGraphViewer().GetGraph(), *repeats_init); + auto repeats = unpacked.DataAsSpan(); + AddOperationInput(*op, "reps", model_builder.AddConstant(op->type(), "reps", repeats)); + } else { + // Runtime 'reps' (e.g. emitted by a Loop). Pass the tensor through. + AddOperationInput(*op, "reps", input_defs[1]->Name()); + } + AddOperationOutput(*op, output_def); + model_builder.AddOperation(std::move(op)); + } else { + if (!repeats_init) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "TileOpBuilder NeuralNetwork path requires constant 'repeats'"); + } + Initializer unpacked(model_builder.GetGraphViewer().GetGraph(), *repeats_init); + auto repeats = unpacked.DataAsSpan(); + auto layer = model_builder.CreateNNLayer(node); + auto* tile_params = layer->mutable_tile(); + for (int64_t r : repeats) { + tile_params->add_reps(r); + } + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_output()->Add() = output_def.Name(); + model_builder.AddLayer(std::move(layer)); + } + return Status::OK(); +} + +bool TileOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + // Tile is shape-only data movement, so it can carry any element type CoreML + // can represent. 
ONNX Tile is commonly used in graph post-processing on + // INT32 grid-index tensors (e.g. YOLO anchor expansion), which the default + // base check (float-only) would reject. + int32_t input_type; + if (!GetType(*node.InputDefs()[0], input_type, logger)) { + return false; + } + switch (input_type) { + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: + case ONNX_NAMESPACE::TensorProto_DataType_INT32: + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + case ONNX_NAMESPACE::TensorProto_DataType_BOOL: + return true; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: + if (input_params.create_mlprogram && input_params.coreml_version >= 6) { + return true; + } + [[fallthrough]]; + default: + LOGS(logger, VERBOSE) << "[Tile] input type " << input_type << " is not supported"; + return false; + } +} + +bool TileOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + + // The NeuralNetwork emitter only supports constant 'repeats'; the MLProgram + // path also accepts a runtime 'reps' tensor. + const auto& repeats_name = input_defs[1]->Name(); + const auto* repeats_tensor = input_params.graph_viewer.GetConstantInitializer(repeats_name); + if (!input_params.create_mlprogram && !repeats_tensor) { + LOGS(logger, VERBOSE) << "Tile NeuralNetwork path requires 'repeats' to be a constant initializer"; + return false; + } + + std::vector input_shape; + if (!GetShape(*input_defs[0], input_shape, logger)) { + return false; + } + + if (input_shape.size() > 5) { + LOGS(logger, VERBOSE) << "Tile does not support input rank greater than 5. Input rank: " << input_shape.size(); + return false; + } + + if (repeats_tensor) { + Initializer unpacked(input_params.graph_viewer.GetGraph(), *repeats_tensor); + auto repeats = unpacked.DataAsSpan(); + if (repeats.size() != input_shape.size()) { + LOGS(logger, VERBOSE) << "Tile 'repeats' length (" << repeats.size() + << ") must match input rank (" << input_shape.size() << ")"; + return false; + } + for (int64_t r : repeats) { + if (r < 1) { + LOGS(logger, VERBOSE) << "Tile 'repeats' values must be positive; got " << r; + return false; + } + } + } + + return true; +} + +void CreateTileOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc index 5bb7e4c11967a..51cc1a443a4c0 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc @@ -17,6 +17,8 @@ class TransposeOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override; bool SupportsMLProgram() const override { return true; } + + bool IsTrivial(const Node& /*node*/) const override { return true; } }; Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc index 09ce25fd29778..0ae2a18e94e29 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc +++ 
b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc @@ -18,6 +18,11 @@ class UnaryOpBuilder : public BaseOpBuilder { bool SupportsMLProgram() const override { return true; } bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + // Of the unary ops this builder handles, only Ceil is cheap enough to count + // as trivial. Erf/Round/Exp/Reciprocal/Sqrt are all transcendental or + // multi-cycle ops and earn their own marshalling cost. + bool IsTrivial(const Node& node) const override { return node.OpType() == "Ceil"; } }; Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, @@ -39,6 +44,8 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const coreml_op_type = "round"; } else if (op_type == "Exp") { coreml_op_type = "exp"; + } else if (op_type == "Ceil") { + coreml_op_type = "ceil"; } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "UnaryOpBuilder::AddToModelBuilderImpl, unexpected op: ", op_type); @@ -82,7 +89,8 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const bool UnaryOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& /*logger*/) const { if (!input_params.create_mlprogram) { - if (node.OpType() == "Erf" || node.OpType() == "Round" || node.OpType() == "Exp") { + if (node.OpType() == "Erf" || node.OpType() == "Round" || node.OpType() == "Exp" || + node.OpType() == "Ceil") { return false; } } diff --git a/onnxruntime/core/providers/coreml/builders/op_builder.h b/onnxruntime/core/providers/coreml/builders/op_builder.h index 0bb7f280c33e6..3e8a854bcdec7 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder.h @@ -44,6 +44,14 @@ class IOpBuilder { // Does the builder implementation support creating an ML Program? virtual bool SupportsMLProgram() const = 0; + + // Is this op cheap enough that a CoreML partition consisting only of nodes + // like it isn't worth the marshalling cost? Used by the trivial-only + // partition heuristic in CoreMLExecutionProvider::GetCapability. Defaults + // to false; trivial-op builders override to true. Some builders dispatch + // multiple op types (e.g. UnaryOpBuilder), so the answer can depend on + // node.OpType(). 
+ virtual bool IsTrivial(const Node& /*node*/) const { return false; } }; } // namespace coreml diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index 6f465774a3c3c..fd0e19dbd055a 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -38,6 +38,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateUnaryOpBuilder("Round", op_registrations); CreateUnaryOpBuilder("Sqrt", op_registrations); CreateUnaryOpBuilder("Exp", op_registrations); + CreateUnaryOpBuilder("Ceil", op_registrations); // Binary elementwise ops CreateBinaryOpBuilder("Add", op_registrations); @@ -77,6 +78,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateGatherOpBuilder("Gather", op_registrations); CreateGemmOpBuilder("Gemm", op_registrations); CreateGridSampleOpBuilder("GridSample", op_registrations); + CreateIdentityOpBuilder("Identity", op_registrations); CreateLRNOpBuilder("LRN", op_registrations); CreateGemmOpBuilder("MatMul", op_registrations); CreatePadOpBuilder("Pad", op_registrations); @@ -87,6 +89,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateSplitOpBuilder("Split", op_registrations); CreateSoftmaxOpBuilder("Softmax", op_registrations); CreateSqueezeOpBuilder("Squeeze", op_registrations); + CreateTileOpBuilder("Tile", op_registrations); CreateTransposeOpBuilder("Transpose", op_registrations); CreateSqueezeOpBuilder("Unsqueeze", op_registrations); diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h index f6304848274de..d399a4f91576e 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h @@ -31,6 +31,7 @@ void CreateFlattenOpBuilder(const std::string& op_type, OpBuilderRegistrations& void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGemmOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGridSampleOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateIdentityOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateLRNOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreatePoolOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); @@ -42,6 +43,7 @@ void CreateSliceOpBuilder(const std::string& op_type, OpBuilderRegistrations& op void CreateSoftmaxOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateSplitOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateSqueezeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateTileOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateUnaryOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateQuickGeluOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); diff --git 
a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index cc7beed6bb298..9dd3dfaf6c75a 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -11,6 +11,7 @@ #include "core/framework/tensorprotoutils.h" #include "core/graph/graph_viewer.h" #include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/partitioning_utils.h" #include "core/session/onnxruntime_cxx_api.h" @@ -88,9 +89,32 @@ CoreMLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_vie return MakeString(user_provided_key, "_", COREML, "_", model_hash, "_", metadef_id); }; - result = utils::CreateSupportedPartitions(graph_viewer, supported_nodes, {}, + // Drop CoreML partitions that consist entirely of trivial shape / cheap-elementwise ops. + // These ops can each be claimed individually but the CPU↔CoreML round-trip cost + // (~50-100us marshalling) outweighs the saving when the partition has no compute-heavy + // op to amortise it over. Per-op CoreML dispatch cost is ~10-14us on M3 Max even for + // trivial ops (Identity/Ceil/Tile etc.), and CPU runs them in <1us each. + // + // The "trivial" marker lives on each op builder's IOpBuilder::IsTrivial(node) + // override rather than as a hardcoded set here, so adding a new trivial op + // builder doesn't risk drifting from a list maintained at the EP level. + const auto& op_builders = coreml::GetOpBuilders(); + const auto is_node_trivial = [&](const Node* node) -> bool { + auto it = op_builders.find(node->OpType()); + return it != op_builders.end() && it->second->IsTrivial(*node); + }; + const auto is_node_supported = [&](const Node& node) -> bool { + return supported_nodes.find(&node) != supported_nodes.end(); + }; + const auto on_group_closed = [&](const std::vector& group) -> bool { + // Keep the partition only if at least one node is non-trivial. + return std::any_of(group.begin(), group.end(), + [&](const Node* node) { return !is_node_trivial(node); }); + }; + + result = utils::CreateSupportedPartitions(graph_viewer, is_node_supported, on_group_closed, gen_metadef_name, COREML, kCoreMLExecutionProvider, - nullptr, + /*node_unit_map*/ nullptr, /*drop_constant_initializers*/ true); const auto num_of_partitions = result.size(); diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index b6e1545d6f319..bdfa3eeb8657b 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -12,6 +12,7 @@ #include "core/graph/constants.h" #include "core/graph/graph.h" #include "core/graph/graph_viewer.h" +#include "core/optimizer/graph_transformer_level.h" #include "core/providers/coreml/coreml_provider_factory_creator.h" #include "core/providers/coreml/coreml_provider_factory.h" #include "core/session/inference_session.h" @@ -1164,6 +1165,442 @@ TEST(CoreMLExecutionProviderTest, QuickGeluTestFp16) { #endif } +// Build a model: input -> Conv -> -> output. The Conv anchors +// the partition so the trivial-partition heuristic keeps it; the chained ops +// land inside a single CoreML partition rather than fragmenting it. 
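+// Rough arithmetic behind the heuristic (averaging the figures quoted in the
+// GetCapability comment above; an illustration, not a measured benchmark): a
+// chain of N trivial nodes costs roughly N * 1us on CPU versus ~75us of
+// marshalling plus N * ~12us of per-op dispatch on CoreML. The CoreML side is
+// slower for every N, so a trivial-only partition is dropped outright rather
+// than kept above some size threshold; one compute-heavy node (e.g. the Conv
+// below) flips the balance and the whole chain stays together on CoreML.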
+namespace { +ONNX_NAMESPACE::ModelProto MakeConvWithTrivialChainModel( + const std::string& trivial_op, + bool tile_with_repeats /*for Tile only*/) { + ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_domain(""); + opset->set_version(13); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("conv_chain_test"); + + auto add_value = [&](auto* proto, const char* name, const std::vector& shape) { + proto->set_name(name); + auto* tt = proto->mutable_type()->mutable_tensor_type(); + tt->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : shape) tt->mutable_shape()->add_dim()->set_dim_value(d); + }; + add_value(graph_proto->add_input(), "X", {1, 2, 4, 4}); + add_value(graph_proto->add_output(), "Y", {1, 3, 3, 3}); + + // Conv weight initialiser + auto* w = graph_proto->add_initializer(); + w->set_name("W"); + w->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : {3, 2, 2, 2}) w->add_dims(d); + for (int i = 0; i < 24; ++i) w->add_float_data(0.05f * i - 0.4f); + + auto* conv = graph_proto->add_node(); + conv->set_op_type("Conv"); + conv->add_input("X"); + conv->add_input("W"); + conv->add_output("conv_out"); + auto* pads = conv->add_attribute(); + pads->set_name("pads"); + pads->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS); + for (int64_t v : {0, 0, 0, 0}) pads->add_ints(v); + + if (trivial_op == "Tile") { + auto* reps_init = graph_proto->add_initializer(); + reps_init->set_name("reps"); + reps_init->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + reps_init->add_dims(4); + for (int64_t v : {1, 1, 1, 1}) reps_init->add_int64_data(v); + auto* node = graph_proto->add_node(); + node->set_op_type("Tile"); + node->add_input("conv_out"); + node->add_input("reps"); + node->add_output("Y"); + (void)tile_with_repeats; + } else { + auto* node = graph_proto->add_node(); + node->set_op_type(trivial_op); + node->add_input("conv_out"); + node->add_output("Y"); + } + return model_proto; +} + +void RunConvChainTest(const std::string& trivial_op, std::string_view log_id) { + auto model_proto = MakeConvWithTrivialChainModel(trivial_op, false); + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 2, 4, 4}; + std::vector x_data(32); + for (size_t i = 0; i < x_data.size(); ++i) x_data[i] = static_cast(i) * 0.1f - 1.5f; + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, x_data, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + RunAndVerifyOutputsWithEP(model_span, std::string(log_id), + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::All}); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} +} // namespace + +TEST(CoreMLExecutionProviderTest, IdentityWithConvAnchor) { + // Conv → Identity → output. Conv anchors the partition; Identity must be + // claimed (the trivial-partition heuristic keeps it because Conv is present). + RunConvChainTest("Identity", "IdentityWithConvAnchor_MLProgram"); +} + +TEST(CoreMLExecutionProviderTest, CeilWithConvAnchor) { + // Conv → Ceil → output. Same rationale; Ceil is also a unary MIL op. 
+ RunConvChainTest("Ceil", "CeilWithConvAnchor_MLProgram"); +} + +TEST(CoreMLExecutionProviderTest, TileWithConvAnchor) { + // Conv → Tile(reps=[1,1,1,1]) → output. Validates the Tile builder claims + // the node alongside the Conv anchor. + RunConvChainTest("Tile", "TileWithConvAnchor_MLProgram"); +} + +// Helper for trivial-only chain tests. Builds a model with input X[dims] and +// output Y[dims], populates the graph body via `populate_chain`, and asserts +// the CoreML EP claims none of it. Graph optimisations are pinned to Default +// so passes like IdentityElimination / CastElimination do not pre-empt the +// trivial-partition heuristic in CoreMLExecutionProvider::GetCapability. +namespace { +void RunTrivialOnlyChainTest( + std::string_view log_id, + const std::vector& dims, + const std::vector& x_data, + const std::function& populate_chain) { + ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_domain(""); + opset->set_version(13); + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("trivial_only"); + + auto add_value = [&](auto* proto, const char* name, const std::vector& shape) { + proto->set_name(name); + auto* tt = proto->mutable_type()->mutable_tensor_type(); + tt->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : shape) tt->mutable_shape()->add_dim()->set_dim_value(d); + }; + add_value(graph_proto->add_input(), "X", dims); + add_value(graph_proto->add_output(), "Y", dims); + + populate_chain(graph_proto); + + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, x_data, &ml_value_x); + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + auto disable_optimizations = [](SessionOptions& so) { + so.graph_optimization_level = TransformerLevel::Default; + }; + + RunAndVerifyOutputsWithEP(model_span, std::string(log_id), + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::None}, + disable_optimizations); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::None); +#endif +} +} // namespace + +TEST(CoreMLExecutionProviderTest, TrivialOnlyChainIsNotClaimedByCoreML) { + // 3 chained Identity nodes with no compute-heavy anchor → heuristic drops the + // partition so CPU runs it. Round-trip cost would exceed the saving otherwise. 
+ RunTrivialOnlyChainTest( + "TrivialOnlyChainIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto* n1 = graph->add_node(); + n1->set_op_type("Identity"); + n1->add_input("X"); + n1->add_output("a"); + auto* n2 = graph->add_node(); + n2->set_op_type("Identity"); + n2->add_input("a"); + n2->add_output("b"); + auto* n3 = graph->add_node(); + n3->set_op_type("Identity"); + n3->add_input("b"); + n3->add_output("Y"); + }); +} + +TEST(CoreMLExecutionProviderTest, ReshapeOnlyChainIsNotClaimedByCoreML) { + RunTrivialOnlyChainTest( + "ReshapeOnlyChainIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto add_shape_init = [&](const char* name, const std::vector& shape) { + auto* init = graph->add_initializer(); + init->set_name(name); + init->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + init->add_dims(static_cast(shape.size())); + for (int64_t v : shape) init->add_int64_data(v); + }; + add_shape_init("shape_a", {2, 4}); + add_shape_init("shape_b", {1, 8}); + + auto* n1 = graph->add_node(); + n1->set_op_type("Reshape"); + n1->add_input("X"); + n1->add_input("shape_a"); + n1->add_output("a"); + auto* n2 = graph->add_node(); + n2->set_op_type("Reshape"); + n2->add_input("a"); + n2->add_input("shape_b"); + n2->add_output("Y"); + }); +} + +TEST(CoreMLExecutionProviderTest, TransposeOnlyChainIsNotClaimedByCoreML) { + RunTrivialOnlyChainTest( + "TransposeOnlyChainIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto add_transpose = [&](const char* name, const char* in, const char* out, + const std::vector& perm) { + auto* node = graph->add_node(); + node->set_name(name); + node->set_op_type("Transpose"); + node->add_input(in); + node->add_output(out); + auto* attr = node->add_attribute(); + attr->set_name("perm"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS); + for (int64_t v : perm) attr->add_ints(v); + }; + // Two Transposes that compose back to the identity perm. + add_transpose("t0", "X", "a", {1, 0}); + add_transpose("t1", "a", "Y", {1, 0}); + }); +} + +TEST(CoreMLExecutionProviderTest, TileOnlyIsNotClaimedByCoreML) { + // Single Tile with reps=[1,1] — pure data movement, no compute anchor. + RunTrivialOnlyChainTest( + "TileOnlyIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto* reps = graph->add_initializer(); + reps->set_name("reps"); + reps->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + reps->add_dims(2); + reps->add_int64_data(1); + reps->add_int64_data(1); + auto* n = graph->add_node(); + n->set_op_type("Tile"); + n->add_input("X"); + n->add_input("reps"); + n->add_output("Y"); + }); +} + +TEST(CoreMLExecutionProviderTest, CeilOnlyIsNotClaimedByCoreML) { + // Single Ceil — supported by the new Unary builder but trivial; heuristic drops it. + RunTrivialOnlyChainTest( + "CeilOnlyIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.1f, 0.6f, 1.4f, 1.9f, -0.6f, -1.4f, 2.5f, 3.1f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto* n = graph->add_node(); + n->set_op_type("Ceil"); + n->add_input("X"); + n->add_output("Y"); + }); +} + +TEST(CoreMLExecutionProviderTest, MixedTrivialChainIsNotClaimedByCoreML) { + // Identity → Cast(float→float) → Reshape → Transpose. 
Different trivial ops in + // sequence; with no compute-heavy anchor the heuristic drops the whole partition. + RunTrivialOnlyChainTest( + "MixedTrivialChainIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto* shape_init = graph->add_initializer(); + shape_init->set_name("reshape_shape"); + shape_init->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + shape_init->add_dims(2); + shape_init->add_int64_data(8); + shape_init->add_int64_data(1); + + auto* identity = graph->add_node(); + identity->set_op_type("Identity"); + identity->add_input("X"); + identity->add_output("a"); + + auto* cast = graph->add_node(); + cast->set_op_type("Cast"); + cast->add_input("a"); + cast->add_output("b"); + auto* to_attr = cast->add_attribute(); + to_attr->set_name("to"); + to_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + to_attr->set_i(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + auto* reshape = graph->add_node(); + reshape->set_op_type("Reshape"); + reshape->add_input("b"); + reshape->add_input("reshape_shape"); + reshape->add_output("c"); + + auto* transpose = graph->add_node(); + transpose->set_op_type("Transpose"); + transpose->add_input("c"); + transpose->add_output("Y"); + auto* perm_attr = transpose->add_attribute(); + perm_attr->set_name("perm"); + perm_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS); + perm_attr->add_ints(1); + perm_attr->add_ints(0); + }); +} + +TEST(CoreMLExecutionProviderTest, ConvTrivialChainConvKeepsAllOnCoreML) { + // Sandwich test: Conv → Identity → Cast → Reshape → Conv. The two Convs + // make the partition non-trivial, so the heuristic keeps the trivial ops in + // the same partition rather than splitting them off to CPU. Verifies the + // "stay on GPU for GPU chains" half of the heuristic. 
+ ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_domain(""); + opset->set_version(13); + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("conv_trivial_conv_sandwich"); + + auto add_value = [&](auto* proto, const char* name, const std::vector& shape) { + proto->set_name(name); + auto* tt = proto->mutable_type()->mutable_tensor_type(); + tt->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : shape) tt->mutable_shape()->add_dim()->set_dim_value(d); + }; + add_value(graph_proto->add_input(), "X", {1, 2, 4, 4}); + add_value(graph_proto->add_output(), "Y", {1, 2, 3, 3}); + + // Conv1: weight [3, 2, 2, 2], output [1, 3, 3, 3] + auto* w1 = graph_proto->add_initializer(); + w1->set_name("W1"); + w1->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : {3, 2, 2, 2}) w1->add_dims(d); + for (int i = 0; i < 24; ++i) w1->add_float_data(0.05f * i - 0.4f); + + // Conv2: weight [2, 3, 1, 1], output [1, 2, 3, 3] + auto* w2 = graph_proto->add_initializer(); + w2->set_name("W2"); + w2->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : {2, 3, 1, 1}) w2->add_dims(d); + for (int i = 0; i < 6; ++i) w2->add_float_data(0.1f * i - 0.25f); + + // Reshape shape initializer (no-op reshape: [1,3,3,3] → [1,3,3,3]) + auto* reshape_shape = graph_proto->add_initializer(); + reshape_shape->set_name("reshape_shape"); + reshape_shape->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + reshape_shape->add_dims(4); + for (int64_t v : {1, 3, 3, 3}) reshape_shape->add_int64_data(v); + + auto add_pads_attr = [](ONNX_NAMESPACE::NodeProto* node) { + auto* pads = node->add_attribute(); + pads->set_name("pads"); + pads->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS); + for (int64_t v : {0, 0, 0, 0}) pads->add_ints(v); + }; + + auto* conv1 = graph_proto->add_node(); + conv1->set_op_type("Conv"); + conv1->add_input("X"); + conv1->add_input("W1"); + conv1->add_output("conv1_out"); + add_pads_attr(conv1); + + auto* identity = graph_proto->add_node(); + identity->set_op_type("Identity"); + identity->add_input("conv1_out"); + identity->add_output("ident_out"); + + auto* cast = graph_proto->add_node(); + cast->set_op_type("Cast"); + cast->add_input("ident_out"); + cast->add_output("cast_out"); + auto* to_attr = cast->add_attribute(); + to_attr->set_name("to"); + to_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + to_attr->set_i(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + auto* reshape = graph_proto->add_node(); + reshape->set_op_type("Reshape"); + reshape->add_input("cast_out"); + reshape->add_input("reshape_shape"); + reshape->add_output("reshape_out"); + + auto* conv2 = graph_proto->add_node(); + conv2->set_op_type("Conv"); + conv2->add_input("reshape_out"); + conv2->add_input("W2"); + conv2->add_output("Y"); + add_pads_attr(conv2); + + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 2, 4, 4}; + std::vector x_data(32); + for (size_t i = 0; i < x_data.size(); ++i) x_data[i] = static_cast(i) * 0.1f - 1.5f; + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, x_data, &ml_value_x); + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + // 
Disable optimisations so the trivial ops survive into partitioning and we + // actually verify the heuristic (otherwise IdentityElimination / similar + // passes could remove them before CoreML's GetCapability runs). + auto disable_optimizations = [](SessionOptions& so) { + so.graph_optimization_level = TransformerLevel::Default; + }; + + RunAndVerifyOutputsWithEP(model_span, "ConvTrivialChainConvKeepsAllOnCoreML_MLProgram", + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::All}, + disable_optimizations); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + namespace { // Build a single-node com.microsoft:FusedConv model for the tests below. // Input X is {1, 2, 4, 4}, weight W is {3, 2, 2, 2} (constant initializer, set diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 395813844906a..106280d258ecb 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -12,6 +12,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Conv|Only 1D/2D Conv is supported.
Bias if provided must be constant.|
|ai.onnx:ConvTranspose|Weight and bias must be constant.<br/>padding_type of SAME_UPPER/SAME_LOWER is not supported.<br/>kernel_shape must have default values.<br/>output_shape is not supported.<br/>output_padding must have default values.|
|ai.onnx:DepthToSpace|If 'mode' is 'CRD' the input must have a fixed shape.|
+|ai.onnx:Ceil||
|ai.onnx:Div||
|ai.onnx:Elu||
|ai.onnx:Erf||
@@ -23,6 +24,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
|ai.onnx:GridSample|4D input.<br/>'mode' of 'linear' or 'zeros'.<br/>(mode==linear && padding_mode==reflection && align_corners==0) is not supported.|
|ai.onnx:GroupNormalization||
|ai.onnx:HardSigmoid||
+|ai.onnx:Identity||
|ai.onnx:InstanceNormalization||
|ai.onnx:LayerNormalization||
|ai.onnx:LeakyRelu||
@@ -50,6 +52,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
|ai.onnx:Sqrt||
|ai.onnx:Squeeze||
|ai.onnx:Tanh||
+|ai.onnx:Tile|`repeats` may be a constant initializer or a runtime tensor (MLProgram only). Input rank up to 5.|
|ai.onnx:Transpose||
|ai.onnx:Unsqueeze||
|com.microsoft:QuickGelu|Produced by ORT's `QuickGeluFusion` optimizer pass. Decomposed into `mul` / `sigmoid` / `mul`.|
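For reference, a minimal usage sketch of the ML Program path these operators target (assumes an ORT build whose CoreML EP accepts the string provider options shown here, `ModelFormat` / `MLComputeUnits`; `model.onnx` is a placeholder path — see the CoreML EP documentation for the authoritative option list):

    #include <string>
    #include <unordered_map>
    #include <onnxruntime_cxx_api.h>

    int main() {
      Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "coreml_mlprogram"};
      Ort::SessionOptions so;
      // "MLProgram" selects the MIL builders listed above; trivial-only
      // partitions (Identity/Cast/Reshape chains with no compute anchor)
      // are left to the CPU EP by GetCapability.
      std::unordered_map<std::string, std::string> coreml_options{
          {"ModelFormat", "MLProgram"},
          {"MLComputeUnits", "ALL"},
      };
      so.AppendExecutionProvider("CoreML", coreml_options);
      Ort::Session session{env, "model.onnx", so};  // placeholder model path
      return 0;
    }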