diff --git a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc index e0665f5c2a5ec..61b18859ba685 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc @@ -22,6 +22,12 @@ class CastOpBuilder : public BaseOpBuilder { public: bool SupportsMLProgram() const override { return true; } + + // Cast is shape-only data movement from CoreML's perspective: per-element + // dtype conversion that the marshalling overhead dominates for small + // tensors. CoreML claims it but a partition consisting only of Casts + // doesn't earn its own marshalling cost. + bool IsTrivial(const Node& /*node*/) const override { return true; } }; Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, diff --git a/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc index f0adb70587bcf..79589cd578ad1 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc @@ -17,6 +17,8 @@ class FlattenOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool IsTrivial(const Node& /*node*/) const override { return true; } }; Status FlattenOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, diff --git a/onnxruntime/core/providers/coreml/builders/impl/identity_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/identity_op_builder.cc new file mode 100644 index 0000000000000..57b15404d464b --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/identity_op_builder.cc @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" + +namespace onnxruntime { +namespace coreml { + +class IdentityOpBuilder : public BaseOpBuilder { + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } + + bool IsTrivial(const Node& /*node*/) const override { return true; } +}; + +Status IdentityOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& /*logger*/) const { + const auto& input_defs = node.InputDefs(); + const auto& output_def = *node.OutputDefs()[0]; + + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + auto op = model_builder.CreateOperation(node, "identity"); + AddOperationInput(*op, "x", input_defs[0]->Name()); + AddOperationOutput(*op, output_def); + model_builder.AddOperation(std::move(op)); + } else { + // NeuralNetwork: emulate via activation LINEAR(alpha=1, beta=0). 
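+    // (ActivationLinear computes f(x) = alpha * x + beta elementwise, so
+    //  alpha = 1, beta = 0 simply passes the input through unchanged.)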
+ auto layer = model_builder.CreateNNLayer(node); + auto* linear = layer->mutable_activation()->mutable_linear(); + linear->set_alpha(1.0f); + linear->set_beta(0.0f); + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_output()->Add() = output_def.Name(); + model_builder.AddLayer(std::move(layer)); + } + return Status::OK(); +} + +void CreateIdentityOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc index e3781ed7d388b..a926db94d1dc0 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc @@ -27,6 +27,8 @@ class ReshapeOpBuilder : public BaseOpBuilder { int GetMinSupportedOpSet(const Node& /* node */) const override { return 5; } bool SupportsMLProgram() const override { return true; } + + bool IsTrivial(const Node& /*node*/) const override { return true; } }; void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc index 92f0f2bb5fc3d..913639b30b409 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc @@ -29,6 +29,9 @@ class SqueezeOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; bool SupportsMLProgram() const override { return true; } + + // SqueezeOpBuilder handles both Squeeze and Unsqueeze; both are shape-only. + bool IsTrivial(const Node& /*node*/) const override { return true; } }; namespace { diff --git a/onnxruntime/core/providers/coreml/builders/impl/tile_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/tile_op_builder.cc new file mode 100644 index 0000000000000..40e69dd20cb82 --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/tile_op_builder.cc @@ -0,0 +1,156 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/optimizer/initializer.h" +#include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" +#include "core/providers/coreml/shape_utils.h" +#include "core/providers/shared/utils/utils.h" + +namespace onnxruntime { +namespace coreml { + +class TileOpBuilder : public BaseOpBuilder { + void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; + + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } + + bool IsTrivial(const Node& /*node*/) const override { return true; } +}; + +void TileOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { + // If 'repeats' is a constant initializer we bake it into the MIL constant + // and don't need the original to land in the model. If it's a runtime + // tensor the dynamic-shape MIL path consumes it directly. + if (model_builder.GetConstantInitializer(node.InputDefs()[1]->Name())) { + model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); + } +} + +Status TileOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& /*logger*/) const { + const auto& input_defs = node.InputDefs(); + const auto& output_def = *node.OutputDefs()[0]; + const auto* repeats_init = model_builder.GetConstantInitializer(input_defs[1]->Name()); + + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + auto op = model_builder.CreateOperation(node, "tile"); + AddOperationInput(*op, "x", input_defs[0]->Name()); + if (repeats_init) { + Initializer unpacked(model_builder.GetGraphViewer().GetGraph(), *repeats_init); + auto repeats = unpacked.DataAsSpan(); + AddOperationInput(*op, "reps", model_builder.AddConstant(op->type(), "reps", repeats)); + } else { + // Runtime 'reps' (e.g. emitted by a Loop). Pass the tensor through. + AddOperationInput(*op, "reps", input_defs[1]->Name()); + } + AddOperationOutput(*op, output_def); + model_builder.AddOperation(std::move(op)); + } else { + if (!repeats_init) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "TileOpBuilder NeuralNetwork path requires constant 'repeats'"); + } + Initializer unpacked(model_builder.GetGraphViewer().GetGraph(), *repeats_init); + auto repeats = unpacked.DataAsSpan(); + auto layer = model_builder.CreateNNLayer(node); + auto* tile_params = layer->mutable_tile(); + for (int64_t r : repeats) { + tile_params->add_reps(r); + } + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_output()->Add() = output_def.Name(); + model_builder.AddLayer(std::move(layer)); + } + return Status::OK(); +} + +bool TileOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + // Tile is shape-only data movement, so it can carry any element type CoreML + // can represent. 
ONNX Tile is commonly used in graph post-processing on + // INT32 grid-index tensors (e.g. YOLO anchor expansion), which the default + // base check (float-only) would reject. + int32_t input_type; + if (!GetType(*node.InputDefs()[0], input_type, logger)) { + return false; + } + switch (input_type) { + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: + case ONNX_NAMESPACE::TensorProto_DataType_INT32: + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + case ONNX_NAMESPACE::TensorProto_DataType_BOOL: + return true; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: + if (input_params.create_mlprogram && input_params.coreml_version >= 6) { + return true; + } + [[fallthrough]]; + default: + LOGS(logger, VERBOSE) << "[Tile] input type " << input_type << " is not supported"; + return false; + } +} + +bool TileOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + + // The NeuralNetwork emitter only supports constant 'repeats'; the MLProgram + // path also accepts a runtime 'reps' tensor. + const auto& repeats_name = input_defs[1]->Name(); + const auto* repeats_tensor = input_params.graph_viewer.GetConstantInitializer(repeats_name); + if (!input_params.create_mlprogram && !repeats_tensor) { + LOGS(logger, VERBOSE) << "Tile NeuralNetwork path requires 'repeats' to be a constant initializer"; + return false; + } + + std::vector input_shape; + if (!GetShape(*input_defs[0], input_shape, logger)) { + return false; + } + + if (input_shape.size() > 5) { + LOGS(logger, VERBOSE) << "Tile does not support input rank greater than 5. Input rank: " << input_shape.size(); + return false; + } + + if (repeats_tensor) { + Initializer unpacked(input_params.graph_viewer.GetGraph(), *repeats_tensor); + auto repeats = unpacked.DataAsSpan(); + if (repeats.size() != input_shape.size()) { + LOGS(logger, VERBOSE) << "Tile 'repeats' length (" << repeats.size() + << ") must match input rank (" << input_shape.size() << ")"; + return false; + } + for (int64_t r : repeats) { + if (r < 1) { + LOGS(logger, VERBOSE) << "Tile 'repeats' values must be positive; got " << r; + return false; + } + } + } + + return true; +} + +void CreateTileOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc index 5bb7e4c11967a..51cc1a443a4c0 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc @@ -17,6 +17,8 @@ class TransposeOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override; bool SupportsMLProgram() const override { return true; } + + bool IsTrivial(const Node& /*node*/) const override { return true; } }; Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc index 09ce25fd29778..0ae2a18e94e29 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc +++ 
b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc @@ -18,6 +18,11 @@ class UnaryOpBuilder : public BaseOpBuilder { bool SupportsMLProgram() const override { return true; } bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + // Of the unary ops this builder handles, only Ceil is cheap enough to count + // as trivial. Erf/Round/Exp/Reciprocal/Sqrt are all transcendental or + // multi-cycle ops and earn their own marshalling cost. + bool IsTrivial(const Node& node) const override { return node.OpType() == "Ceil"; } }; Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, @@ -39,6 +44,8 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const coreml_op_type = "round"; } else if (op_type == "Exp") { coreml_op_type = "exp"; + } else if (op_type == "Ceil") { + coreml_op_type = "ceil"; } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "UnaryOpBuilder::AddToModelBuilderImpl, unexpected op: ", op_type); @@ -82,7 +89,8 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const bool UnaryOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& /*logger*/) const { if (!input_params.create_mlprogram) { - if (node.OpType() == "Erf" || node.OpType() == "Round" || node.OpType() == "Exp") { + if (node.OpType() == "Erf" || node.OpType() == "Round" || node.OpType() == "Exp" || + node.OpType() == "Ceil") { return false; } } diff --git a/onnxruntime/core/providers/coreml/builders/op_builder.h b/onnxruntime/core/providers/coreml/builders/op_builder.h index 0bb7f280c33e6..3e8a854bcdec7 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder.h @@ -44,6 +44,14 @@ class IOpBuilder { // Does the builder implementation support creating an ML Program? virtual bool SupportsMLProgram() const = 0; + + // Is this op cheap enough that a CoreML partition consisting only of nodes + // like it isn't worth the marshalling cost? Used by the trivial-only + // partition heuristic in CoreMLExecutionProvider::GetCapability. Defaults + // to false; trivial-op builders override to true. Some builders dispatch + // multiple op types (e.g. UnaryOpBuilder), so the answer can depend on + // node.OpType(). 
+ virtual bool IsTrivial(const Node& /*node*/) const { return false; } }; } // namespace coreml diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index 6f465774a3c3c..fd0e19dbd055a 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -38,6 +38,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateUnaryOpBuilder("Round", op_registrations); CreateUnaryOpBuilder("Sqrt", op_registrations); CreateUnaryOpBuilder("Exp", op_registrations); + CreateUnaryOpBuilder("Ceil", op_registrations); // Binary elementwise ops CreateBinaryOpBuilder("Add", op_registrations); @@ -77,6 +78,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateGatherOpBuilder("Gather", op_registrations); CreateGemmOpBuilder("Gemm", op_registrations); CreateGridSampleOpBuilder("GridSample", op_registrations); + CreateIdentityOpBuilder("Identity", op_registrations); CreateLRNOpBuilder("LRN", op_registrations); CreateGemmOpBuilder("MatMul", op_registrations); CreatePadOpBuilder("Pad", op_registrations); @@ -87,6 +89,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateSplitOpBuilder("Split", op_registrations); CreateSoftmaxOpBuilder("Softmax", op_registrations); CreateSqueezeOpBuilder("Squeeze", op_registrations); + CreateTileOpBuilder("Tile", op_registrations); CreateTransposeOpBuilder("Transpose", op_registrations); CreateSqueezeOpBuilder("Unsqueeze", op_registrations); diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h index f6304848274de..d399a4f91576e 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h @@ -31,6 +31,7 @@ void CreateFlattenOpBuilder(const std::string& op_type, OpBuilderRegistrations& void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGemmOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGridSampleOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateIdentityOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateLRNOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreatePoolOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); @@ -42,6 +43,7 @@ void CreateSliceOpBuilder(const std::string& op_type, OpBuilderRegistrations& op void CreateSoftmaxOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateSplitOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateSqueezeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateTileOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateUnaryOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateQuickGeluOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); diff --git 
a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index cc7beed6bb298..9dd3dfaf6c75a 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -11,6 +11,7 @@ #include "core/framework/tensorprotoutils.h" #include "core/graph/graph_viewer.h" #include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/partitioning_utils.h" #include "core/session/onnxruntime_cxx_api.h" @@ -88,9 +89,32 @@ CoreMLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_vie return MakeString(user_provided_key, "_", COREML, "_", model_hash, "_", metadef_id); }; - result = utils::CreateSupportedPartitions(graph_viewer, supported_nodes, {}, + // Drop CoreML partitions that consist entirely of trivial shape / cheap-elementwise ops. + // These ops can each be claimed individually but the CPU↔CoreML round-trip cost + // (~50-100us marshalling) outweighs the saving when the partition has no compute-heavy + // op to amortise it over. Per-op CoreML dispatch cost is ~10-14us on M3 Max even for + // trivial ops (Identity/Ceil/Tile etc.), and CPU runs them in <1us each. + // + // The "trivial" marker lives on each op builder's IOpBuilder::IsTrivial(node) + // override rather than as a hardcoded set here, so adding a new trivial op + // builder doesn't risk drifting from a list maintained at the EP level. + const auto& op_builders = coreml::GetOpBuilders(); + const auto is_node_trivial = [&](const Node* node) -> bool { + auto it = op_builders.find(node->OpType()); + return it != op_builders.end() && it->second->IsTrivial(*node); + }; + const auto is_node_supported = [&](const Node& node) -> bool { + return supported_nodes.find(&node) != supported_nodes.end(); + }; + const auto on_group_closed = [&](const std::vector& group) -> bool { + // Keep the partition only if at least one node is non-trivial. + return std::any_of(group.begin(), group.end(), + [&](const Node* node) { return !is_node_trivial(node); }); + }; + + result = utils::CreateSupportedPartitions(graph_viewer, is_node_supported, on_group_closed, gen_metadef_name, COREML, kCoreMLExecutionProvider, - nullptr, + /*node_unit_map*/ nullptr, /*drop_constant_initializers*/ true); const auto num_of_partitions = result.size(); diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index b6e1545d6f319..bdfa3eeb8657b 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -12,6 +12,7 @@ #include "core/graph/constants.h" #include "core/graph/graph.h" #include "core/graph/graph_viewer.h" +#include "core/optimizer/graph_transformer_level.h" #include "core/providers/coreml/coreml_provider_factory_creator.h" #include "core/providers/coreml/coreml_provider_factory.h" #include "core/session/inference_session.h" @@ -1164,6 +1165,442 @@ TEST(CoreMLExecutionProviderTest, QuickGeluTestFp16) { #endif } +// Build a model: input -> Conv -> -> output. The Conv anchors +// the partition so the trivial-partition heuristic keeps it; the chained ops +// land inside a single CoreML partition rather than fragmenting it. 
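+// Rough arithmetic behind the heuristic (averaging the figures quoted in the
+// GetCapability comment above; an illustration, not a measured benchmark): a
+// chain of N trivial nodes costs roughly N * 1us on CPU versus ~75us of
+// marshalling plus N * ~12us of per-op dispatch on CoreML. The CoreML side is
+// slower for every N, so a trivial-only partition is dropped outright rather
+// than kept above some size threshold; one compute-heavy node (e.g. the Conv
+// below) flips the balance and the whole chain stays together on CoreML.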
+namespace { +ONNX_NAMESPACE::ModelProto MakeConvWithTrivialChainModel( + const std::string& trivial_op, + bool tile_with_repeats /*for Tile only*/) { + ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_domain(""); + opset->set_version(13); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("conv_chain_test"); + + auto add_value = [&](auto* proto, const char* name, const std::vector& shape) { + proto->set_name(name); + auto* tt = proto->mutable_type()->mutable_tensor_type(); + tt->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : shape) tt->mutable_shape()->add_dim()->set_dim_value(d); + }; + add_value(graph_proto->add_input(), "X", {1, 2, 4, 4}); + add_value(graph_proto->add_output(), "Y", {1, 3, 3, 3}); + + // Conv weight initialiser + auto* w = graph_proto->add_initializer(); + w->set_name("W"); + w->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : {3, 2, 2, 2}) w->add_dims(d); + for (int i = 0; i < 24; ++i) w->add_float_data(0.05f * i - 0.4f); + + auto* conv = graph_proto->add_node(); + conv->set_op_type("Conv"); + conv->add_input("X"); + conv->add_input("W"); + conv->add_output("conv_out"); + auto* pads = conv->add_attribute(); + pads->set_name("pads"); + pads->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS); + for (int64_t v : {0, 0, 0, 0}) pads->add_ints(v); + + if (trivial_op == "Tile") { + auto* reps_init = graph_proto->add_initializer(); + reps_init->set_name("reps"); + reps_init->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + reps_init->add_dims(4); + for (int64_t v : {1, 1, 1, 1}) reps_init->add_int64_data(v); + auto* node = graph_proto->add_node(); + node->set_op_type("Tile"); + node->add_input("conv_out"); + node->add_input("reps"); + node->add_output("Y"); + (void)tile_with_repeats; + } else { + auto* node = graph_proto->add_node(); + node->set_op_type(trivial_op); + node->add_input("conv_out"); + node->add_output("Y"); + } + return model_proto; +} + +void RunConvChainTest(const std::string& trivial_op, std::string_view log_id) { + auto model_proto = MakeConvWithTrivialChainModel(trivial_op, false); + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 2, 4, 4}; + std::vector x_data(32); + for (size_t i = 0; i < x_data.size(); ++i) x_data[i] = static_cast(i) * 0.1f - 1.5f; + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, x_data, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + RunAndVerifyOutputsWithEP(model_span, std::string(log_id), + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::All}); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} +} // namespace + +TEST(CoreMLExecutionProviderTest, IdentityWithConvAnchor) { + // Conv → Identity → output. Conv anchors the partition; Identity must be + // claimed (the trivial-partition heuristic keeps it because Conv is present). + RunConvChainTest("Identity", "IdentityWithConvAnchor_MLProgram"); +} + +TEST(CoreMLExecutionProviderTest, CeilWithConvAnchor) { + // Conv → Ceil → output. Same rationale; Ceil is also a unary MIL op. 
+ RunConvChainTest("Ceil", "CeilWithConvAnchor_MLProgram"); +} + +TEST(CoreMLExecutionProviderTest, TileWithConvAnchor) { + // Conv → Tile(reps=[1,1,1,1]) → output. Validates the Tile builder claims + // the node alongside the Conv anchor. + RunConvChainTest("Tile", "TileWithConvAnchor_MLProgram"); +} + +// Helper for trivial-only chain tests. Builds a model with input X[dims] and +// output Y[dims], populates the graph body via `populate_chain`, and asserts +// the CoreML EP claims none of it. Graph optimisations are pinned to Default +// so passes like IdentityElimination / CastElimination do not pre-empt the +// trivial-partition heuristic in CoreMLExecutionProvider::GetCapability. +namespace { +void RunTrivialOnlyChainTest( + std::string_view log_id, + const std::vector& dims, + const std::vector& x_data, + const std::function& populate_chain) { + ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_domain(""); + opset->set_version(13); + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("trivial_only"); + + auto add_value = [&](auto* proto, const char* name, const std::vector& shape) { + proto->set_name(name); + auto* tt = proto->mutable_type()->mutable_tensor_type(); + tt->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : shape) tt->mutable_shape()->add_dim()->set_dim_value(d); + }; + add_value(graph_proto->add_input(), "X", dims); + add_value(graph_proto->add_output(), "Y", dims); + + populate_chain(graph_proto); + + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, x_data, &ml_value_x); + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + auto disable_optimizations = [](SessionOptions& so) { + so.graph_optimization_level = TransformerLevel::Default; + }; + + RunAndVerifyOutputsWithEP(model_span, std::string(log_id), + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::None}, + disable_optimizations); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::None); +#endif +} +} // namespace + +TEST(CoreMLExecutionProviderTest, TrivialOnlyChainIsNotClaimedByCoreML) { + // 3 chained Identity nodes with no compute-heavy anchor → heuristic drops the + // partition so CPU runs it. Round-trip cost would exceed the saving otherwise. 
+ RunTrivialOnlyChainTest( + "TrivialOnlyChainIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto* n1 = graph->add_node(); + n1->set_op_type("Identity"); + n1->add_input("X"); + n1->add_output("a"); + auto* n2 = graph->add_node(); + n2->set_op_type("Identity"); + n2->add_input("a"); + n2->add_output("b"); + auto* n3 = graph->add_node(); + n3->set_op_type("Identity"); + n3->add_input("b"); + n3->add_output("Y"); + }); +} + +TEST(CoreMLExecutionProviderTest, ReshapeOnlyChainIsNotClaimedByCoreML) { + RunTrivialOnlyChainTest( + "ReshapeOnlyChainIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto add_shape_init = [&](const char* name, const std::vector& shape) { + auto* init = graph->add_initializer(); + init->set_name(name); + init->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + init->add_dims(static_cast(shape.size())); + for (int64_t v : shape) init->add_int64_data(v); + }; + add_shape_init("shape_a", {2, 4}); + add_shape_init("shape_b", {1, 8}); + + auto* n1 = graph->add_node(); + n1->set_op_type("Reshape"); + n1->add_input("X"); + n1->add_input("shape_a"); + n1->add_output("a"); + auto* n2 = graph->add_node(); + n2->set_op_type("Reshape"); + n2->add_input("a"); + n2->add_input("shape_b"); + n2->add_output("Y"); + }); +} + +TEST(CoreMLExecutionProviderTest, TransposeOnlyChainIsNotClaimedByCoreML) { + RunTrivialOnlyChainTest( + "TransposeOnlyChainIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto add_transpose = [&](const char* name, const char* in, const char* out, + const std::vector& perm) { + auto* node = graph->add_node(); + node->set_name(name); + node->set_op_type("Transpose"); + node->add_input(in); + node->add_output(out); + auto* attr = node->add_attribute(); + attr->set_name("perm"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS); + for (int64_t v : perm) attr->add_ints(v); + }; + // Two Transposes that compose back to the identity perm. + add_transpose("t0", "X", "a", {1, 0}); + add_transpose("t1", "a", "Y", {1, 0}); + }); +} + +TEST(CoreMLExecutionProviderTest, TileOnlyIsNotClaimedByCoreML) { + // Single Tile with reps=[1,1] — pure data movement, no compute anchor. + RunTrivialOnlyChainTest( + "TileOnlyIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto* reps = graph->add_initializer(); + reps->set_name("reps"); + reps->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + reps->add_dims(2); + reps->add_int64_data(1); + reps->add_int64_data(1); + auto* n = graph->add_node(); + n->set_op_type("Tile"); + n->add_input("X"); + n->add_input("reps"); + n->add_output("Y"); + }); +} + +TEST(CoreMLExecutionProviderTest, CeilOnlyIsNotClaimedByCoreML) { + // Single Ceil — supported by the new Unary builder but trivial; heuristic drops it. + RunTrivialOnlyChainTest( + "CeilOnlyIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.1f, 0.6f, 1.4f, 1.9f, -0.6f, -1.4f, 2.5f, 3.1f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto* n = graph->add_node(); + n->set_op_type("Ceil"); + n->add_input("X"); + n->add_output("Y"); + }); +} + +TEST(CoreMLExecutionProviderTest, MixedTrivialChainIsNotClaimedByCoreML) { + // Identity → Cast(float→float) → Reshape → Transpose. 
Different trivial ops in + // sequence; with no compute-heavy anchor the heuristic drops the whole partition. + RunTrivialOnlyChainTest( + "MixedTrivialChainIsNotClaimedByCoreML_MLProgram", + {1, 8}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + [](ONNX_NAMESPACE::GraphProto* graph) { + auto* shape_init = graph->add_initializer(); + shape_init->set_name("reshape_shape"); + shape_init->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + shape_init->add_dims(2); + shape_init->add_int64_data(8); + shape_init->add_int64_data(1); + + auto* identity = graph->add_node(); + identity->set_op_type("Identity"); + identity->add_input("X"); + identity->add_output("a"); + + auto* cast = graph->add_node(); + cast->set_op_type("Cast"); + cast->add_input("a"); + cast->add_output("b"); + auto* to_attr = cast->add_attribute(); + to_attr->set_name("to"); + to_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + to_attr->set_i(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + auto* reshape = graph->add_node(); + reshape->set_op_type("Reshape"); + reshape->add_input("b"); + reshape->add_input("reshape_shape"); + reshape->add_output("c"); + + auto* transpose = graph->add_node(); + transpose->set_op_type("Transpose"); + transpose->add_input("c"); + transpose->add_output("Y"); + auto* perm_attr = transpose->add_attribute(); + perm_attr->set_name("perm"); + perm_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS); + perm_attr->add_ints(1); + perm_attr->add_ints(0); + }); +} + +TEST(CoreMLExecutionProviderTest, ConvTrivialChainConvKeepsAllOnCoreML) { + // Sandwich test: Conv → Identity → Cast → Reshape → Conv. The two Convs + // make the partition non-trivial, so the heuristic keeps the trivial ops in + // the same partition rather than splitting them off to CPU. Verifies the + // "stay on GPU for GPU chains" half of the heuristic. 
+ ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_domain(""); + opset->set_version(13); + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("conv_trivial_conv_sandwich"); + + auto add_value = [&](auto* proto, const char* name, const std::vector& shape) { + proto->set_name(name); + auto* tt = proto->mutable_type()->mutable_tensor_type(); + tt->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : shape) tt->mutable_shape()->add_dim()->set_dim_value(d); + }; + add_value(graph_proto->add_input(), "X", {1, 2, 4, 4}); + add_value(graph_proto->add_output(), "Y", {1, 2, 3, 3}); + + // Conv1: weight [3, 2, 2, 2], output [1, 3, 3, 3] + auto* w1 = graph_proto->add_initializer(); + w1->set_name("W1"); + w1->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : {3, 2, 2, 2}) w1->add_dims(d); + for (int i = 0; i < 24; ++i) w1->add_float_data(0.05f * i - 0.4f); + + // Conv2: weight [2, 3, 1, 1], output [1, 2, 3, 3] + auto* w2 = graph_proto->add_initializer(); + w2->set_name("W2"); + w2->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + for (int64_t d : {2, 3, 1, 1}) w2->add_dims(d); + for (int i = 0; i < 6; ++i) w2->add_float_data(0.1f * i - 0.25f); + + // Reshape shape initializer (no-op reshape: [1,3,3,3] → [1,3,3,3]) + auto* reshape_shape = graph_proto->add_initializer(); + reshape_shape->set_name("reshape_shape"); + reshape_shape->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + reshape_shape->add_dims(4); + for (int64_t v : {1, 3, 3, 3}) reshape_shape->add_int64_data(v); + + auto add_pads_attr = [](ONNX_NAMESPACE::NodeProto* node) { + auto* pads = node->add_attribute(); + pads->set_name("pads"); + pads->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS); + for (int64_t v : {0, 0, 0, 0}) pads->add_ints(v); + }; + + auto* conv1 = graph_proto->add_node(); + conv1->set_op_type("Conv"); + conv1->add_input("X"); + conv1->add_input("W1"); + conv1->add_output("conv1_out"); + add_pads_attr(conv1); + + auto* identity = graph_proto->add_node(); + identity->set_op_type("Identity"); + identity->add_input("conv1_out"); + identity->add_output("ident_out"); + + auto* cast = graph_proto->add_node(); + cast->set_op_type("Cast"); + cast->add_input("ident_out"); + cast->add_output("cast_out"); + auto* to_attr = cast->add_attribute(); + to_attr->set_name("to"); + to_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + to_attr->set_i(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + auto* reshape = graph_proto->add_node(); + reshape->set_op_type("Reshape"); + reshape->add_input("cast_out"); + reshape->add_input("reshape_shape"); + reshape->add_output("reshape_out"); + + auto* conv2 = graph_proto->add_node(); + conv2->set_op_type("Conv"); + conv2->add_input("reshape_out"); + conv2->add_input("W2"); + conv2->add_output("Y"); + add_pads_attr(conv2); + + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 2, 4, 4}; + std::vector x_data(32); + for (size_t i = 0; i < x_data.size(); ++i) x_data[i] = static_cast(i) * 0.1f - 1.5f; + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, x_data, &ml_value_x); + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + // 
Disable optimisations so the trivial ops survive into partitioning and we + // actually verify the heuristic (otherwise IdentityElimination / similar + // passes could remove them before CoreML's GetCapability runs). + auto disable_optimizations = [](SessionOptions& so) { + so.graph_optimization_level = TransformerLevel::Default; + }; + + RunAndVerifyOutputsWithEP(model_span, "ConvTrivialChainConvKeepsAllOnCoreML_MLProgram", + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::All}, + disable_optimizations); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + namespace { // Build a single-node com.microsoft:FusedConv model for the tests below. // Input X is {1, 2, 4, 4}, weight W is {3, 2, 2, 2} (constant initializer, set diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 395813844906a..106280d258ecb 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -12,6 +12,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Conv|Only 1D/2D Conv is supported.
Bias if provided must be constant.|
|ai.onnx:ConvTranspose|Weight and bias must be constant.<br/>padding_type of SAME_UPPER/SAME_LOWER is not supported.<br/>kernel_shape must have default values.<br/>output_shape is not supported.<br/>output_padding must have default values.|
|ai.onnx:DepthToSpace|If 'mode' is 'CRD' the input must have a fixed shape.|
+|ai.onnx:Ceil||
|ai.onnx:Div||
|ai.onnx:Elu||
|ai.onnx:Erf||
@@ -23,6 +24,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
|ai.onnx:GridSample|4D input.<br/>'mode' of 'linear' or 'zeros'.<br/>(mode==linear && padding_mode==reflection && align_corners==0) is not supported.|
|ai.onnx:GroupNormalization||
|ai.onnx:HardSigmoid||
+|ai.onnx:Identity||
|ai.onnx:InstanceNormalization||
|ai.onnx:LayerNormalization||
|ai.onnx:LeakyRelu||
@@ -50,6 +52,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
|ai.onnx:Sqrt||
|ai.onnx:Squeeze||
|ai.onnx:Tanh||
+|ai.onnx:Tile|`repeats` may be a constant initializer or a runtime tensor (MLProgram only). Input rank up to 5.|
|ai.onnx:Transpose||
|ai.onnx:Unsqueeze||
|com.microsoft:QuickGelu|Produced by ORT's `QuickGeluFusion` optimizer pass. Decomposed into `mul` / `sigmoid` / `mul`.|
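For reference, a minimal usage sketch of the ML Program path these operators target (assumes an ORT build whose CoreML EP accepts the string provider options shown here, `ModelFormat` / `MLComputeUnits`; `model.onnx` is a placeholder path — see the CoreML EP documentation for the authoritative option list):

    #include <string>
    #include <unordered_map>
    #include <onnxruntime_cxx_api.h>

    int main() {
      Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "coreml_mlprogram"};
      Ort::SessionOptions so;
      // "MLProgram" selects the MIL builders listed above; trivial-only
      // partitions (Identity/Cast/Reshape chains with no compute anchor)
      // are left to the CPU EP by GetCapability.
      std::unordered_map<std::string, std::string> coreml_options{
          {"ModelFormat", "MLProgram"},
          {"MLComputeUnits", "ALL"},
      };
      so.AppendExecutionProvider("CoreML", coreml_options);
      Ort::Session session{env, "model.onnx", so};  // placeholder model path
      return 0;
    }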