diff --git a/onnxruntime/core/providers/coreml/builders/impl/gather_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gather_op_builder.cc
index 8b58f5dc6c927..54551b35649b7 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/gather_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/gather_op_builder.cc
@@ -30,27 +30,121 @@ int64_t GetAxisAttribute(const Node& node) {
 }
 
 }  // namespace
 
 Status GatherOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                              const logging::Logger& /*logger*/) const {
+                                              const logging::Logger& logger) const {
+  const auto axis = GetAxisAttribute(node);
+  const auto& data_def = *node.InputDefs()[0];
+  const auto& indices_def = *node.InputDefs()[1];
+  const auto& output_def = *node.OutputDefs()[0];
+
+  std::vector<int64_t> data_shape, indices_shape;
+  ORT_RETURN_IF_NOT(GetShape(data_def, data_shape, logger), "Failed to get 'data' shape");
+  ORT_RETURN_IF_NOT(GetShape(indices_def, indices_shape, logger), "Failed to get 'indices' shape");
+
+  // ONNX Gather: out_shape = data_shape[:axis] + indices_shape + data_shape[axis+1:]
+  // CoreML's gather requires rank-1+ indices, so for scalar indices we promote
+  // them to [1], gather, and then squeeze the resulting axis to restore the
+  // original output rank. The positive axis after wrapping is needed for the
+  // squeeze axis below regardless of path.
+  const bool scalar_indices = indices_shape.empty();
+  const int64_t pos_axis = HandleNegativeAxis(axis, data_shape.size());
+
   if (model_builder.CreateMLProgram()) {
     using CoreML::Specification::MILSpec::Operation;
-    std::unique_ptr<Operation> op = model_builder.CreateOperation(node, "gather");
-
-    const auto axis = GetAxisAttribute(node);
+    // IsOpSupportedImpl gates indices to INT32 or INT64, so we can pass the
+    // dtype straight through to the reshape's intermediate output.
+    int32_t indices_dtype{};
+    ORT_RETURN_IF_NOT(GetType(indices_def, indices_dtype, logger),
+                      "Failed to get 'indices' dtype");
+    const int32_t output_dtype = static_cast<int32_t>(output_def.TypeAsProto()->tensor_type().elem_type());
+
+    std::string indices_name = indices_def.Name();
+
+    if (scalar_indices) {
+      // [] -> [1] via reshape. We use reshape rather than expand_dims because
+      // CoreML internally pads scalars; expand_dims on the padded tensor can
+      // push the apparent rank past the rank-5 limit on high-rank `data`.
+      auto reshape = model_builder.CreateOperation(node, "reshape", "indices");
+      AddOperationInput(*reshape, "x", indices_def.Name());
+      const std::vector<int64_t> indices_1d_shape = {1};
+      AddOperationInput(*reshape, "shape",
+                        model_builder.AddConstant(reshape->type(), "shape", indices_1d_shape));
+
+      indices_name = model_builder.GetUniqueName(node, "indices_1d");
+      AddIntermediateOperationOutput(*reshape, indices_name, indices_dtype, indices_1d_shape);
+      model_builder.AddOperation(std::move(reshape));
+    }
+
+    std::unique_ptr<Operation> gather = model_builder.CreateOperation(node, "gather");
     // coreml docs claims validate_indices is optional but in practice it is required
     const auto validate_indices = false;
-    AddOperationInput(*op, "x", node.InputDefs()[0]->Name());                                   // data
-    AddOperationInput(*op, "indices", node.InputDefs()[1]->Name());                             // indices
-    AddOperationInput(*op, "axis", model_builder.AddScalarConstant(op->type(), "axis", axis));  // axis attr
-    AddOperationInput(*op, "validate_indices", model_builder.AddScalarConstant(op->type(), "validate_indices", validate_indices));
-    AddOperationOutput(*op, *node.OutputDefs()[0]);  // output
-    model_builder.AddOperation(std::move(op));
+    AddOperationInput(*gather, "x", data_def.Name());
+    AddOperationInput(*gather, "indices", indices_name);
+    AddOperationInput(*gather, "axis", model_builder.AddScalarConstant(gather->type(), "axis", axis));
+    AddOperationInput(*gather, "validate_indices",
+                      model_builder.AddScalarConstant(gather->type(), "validate_indices", validate_indices));
+
+    if (!scalar_indices) {
+      AddOperationOutput(*gather, output_def);
+      model_builder.AddOperation(std::move(gather));
+    } else {
+      // gather output here has the data's rank (one more than ONNX scalar-gather output);
+      // squeeze the inserted axis to recover the original output shape.
+      std::vector<int64_t> gather_shape = data_shape;
+      gather_shape[pos_axis] = 1;
+      const std::string& gather_out_name = model_builder.GetUniqueName(node, "gather_out");
+      AddIntermediateOperationOutput(*gather, gather_out_name, output_dtype, gather_shape);
+      model_builder.AddOperation(std::move(gather));
+
+      auto squeeze = model_builder.CreateOperation(node, "squeeze", "post");
+      AddOperationInput(*squeeze, "x", gather_out_name);
+      const std::vector<int64_t> sq_axes = {pos_axis};
+      AddOperationInput(*squeeze, "axes", model_builder.AddConstant(squeeze->type(), "axes", sq_axes));
+      AddOperationOutput(*squeeze, output_def);
+      model_builder.AddOperation(std::move(squeeze));
+    }
   } else {
-    auto layer = model_builder.CreateNNLayer(node);
-    layer->mutable_gather()->set_axis(GetAxisAttribute(node));
-    *layer->mutable_input()->Add() = node.InputDefs()[0]->Name();    // data
-    *layer->mutable_input()->Add() = node.InputDefs()[1]->Name();    // indices
-    *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();  // output
-    model_builder.AddLayer(std::move(layer));
+    if (!scalar_indices) {
+      auto layer = model_builder.CreateNNLayer(node);
+      layer->mutable_gather()->set_axis(axis);
+      *layer->mutable_input()->Add() = data_def.Name();
+      *layer->mutable_input()->Add() = indices_def.Name();
+      *layer->mutable_output()->Add() = output_def.Name();
+      model_builder.AddLayer(std::move(layer));
+    } else {
+      // expand_dims indices: [] -> [1]. Unlike the MLProgram reshape path
+      // above, NN's expand_dims doesn't internally pad rank, so we don't run
+      // into the apparent-rank inflation that forced reshape+gather there;
+      // expand_dims is the natural choice on this path.
+      const std::string& indices_1d_name = model_builder.GetUniqueName(node, "indices_1d");
+      {
+        auto expand_layer = model_builder.CreateNNLayer(node, "_indices_expand");
+        expand_layer->mutable_expanddims()->add_axes(0);
+        *expand_layer->mutable_input()->Add() = indices_def.Name();
+        *expand_layer->mutable_output()->Add() = indices_1d_name;
+        model_builder.AddLayer(std::move(expand_layer));
+      }
+
+      // gather with the promoted indices
+      const std::string& gather_out_name = model_builder.GetUniqueName(node, "gather_out");
+      {
+        auto gather_layer = model_builder.CreateNNLayer(node);
+        gather_layer->mutable_gather()->set_axis(axis);
+        *gather_layer->mutable_input()->Add() = data_def.Name();
+        *gather_layer->mutable_input()->Add() = indices_1d_name;
+        *gather_layer->mutable_output()->Add() = gather_out_name;
+        model_builder.AddLayer(std::move(gather_layer));
+      }
+
+      // squeeze the inserted axis
+      {
+        auto squeeze_layer = model_builder.CreateNNLayer(node, "_post_squeeze");
+        squeeze_layer->mutable_squeeze()->add_axes(pos_axis);
+        squeeze_layer->mutable_squeeze()->set_squeezeall(false);
+        *squeeze_layer->mutable_input()->Add() = gather_out_name;
+        *squeeze_layer->mutable_output()->Add() = output_def.Name();
+        model_builder.AddLayer(std::move(squeeze_layer));
+      }
+    }
   }
   return Status::OK();
 }
@@ -87,14 +181,45 @@ bool GatherOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa
     return false;
   }
 
-  // Don't allow scalar 'indices' input.
-  // We convert scalar inputs to tensors with shape [1] before providing them to CoreML.
-  // This modification changes the shape of the Gather output.
-  if (indices_shape.empty()) {
-    LOGS(logger, VERBOSE) << "Gather does not support scalar 'indices'";
+  // ONNX Gather schema constrains indices to int32 or int64. Validate here so
+  // AddToModelBuilderImpl can trust the dtype rather than silently defaulting
+  // on an unexpected value.
+  int32_t indices_dtype{};
+  if (!GetType(*node.InputDefs()[1], indices_dtype, logger)) {
     return false;
   }
+  if (indices_dtype != ONNX_NAMESPACE::TensorProto_DataType_INT32 &&
+      indices_dtype != ONNX_NAMESPACE::TensorProto_DataType_INT64) {
+    LOGS(logger, VERBOSE) << "Gather 'indices' dtype [" << indices_dtype
+                          << "] is not supported (expected INT32 or INT64)";
+    return false;
+  }
+
+  // For scalar indices we internally emit gather with promoted [1] indices
+  // then squeeze. That requires us to claim a static intermediate shape, so
+  // we only handle scalar indices when the data shape itself is fully
+  // static. (Dynamic-shape scalar Gather still falls back to CPU.)
+  if (indices_shape.empty()) {
+    if (!IsStaticShape(data_shape)) {
+      LOGS(logger, VERBOSE) << "Gather with scalar 'indices' requires static 'data' shape";
+      return false;
+    }
+    // The pre-squeeze intermediate has the same rank as `data`. CoreML's
+    // compiler reports "Invalid rank: 6" when a rank-5 intermediate is
+    // produced via reshape+gather, even though rank-5 intermediates are
+    // accepted in other op chains. Cap scalar-indices Gather at data rank 4
+    // until that compiler limit is lifted.
+    //
+    // TODO: re-test on newer macOS / CoreML versions; if Apple lifts the
+    // intermediate rank limit, this cap can be raised to 5 (matching the
+    // general Gather output-rank check below).
+    if (data_shape.size() > 4) {
+      LOGS(logger, VERBOSE) << "Gather with scalar 'indices' supports 'data' rank up to 4";
+      return false;
+    }
+  }
 
+  // Output rank = data_rank + indices_rank - 1. The rank-5 limit applies.
   if (data_shape.size() + indices_shape.size() - 1 > 5) {
     LOGS(logger, VERBOSE) << "Gather does not support output with rank greater than 5";
     return false;
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index b6e1545d6f319..0ff5f54a31159 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -240,9 +240,10 @@ TEST(CoreMLExecutionProviderTest, ArgMaxUnsupportedCastTest) {
 }
 
 TEST(CoreMLExecutionProviderTest, GatherWithScalarIndices) {
-  // For scalar inputs, the input shape is modified from [] -> [1] before passing the input to CoreML.
-  // This won't work for Gather because the output shape depends on the `indices` input shape which could be a scalar.
-  // Currently, we expect the CoreML EP to only take the Shape node in this graph (Gather -> Shape).
+  // The CoreML EP supports scalar 'indices' for Gather only when the 'data' input has a fully
+  // static shape (it needs to claim a static intermediate shape for the post-gather squeeze).
+  // This model's 'data' input is dynamic ([M, N, K]) so Gather still falls back to CPU and the
+  // CoreML EP only takes the Shape node.
   const auto model_file_name = ORT_TSTR("testdata/gather_with_scalar_indices_then_shape.onnx");
 
 #if defined(__APPLE__)
@@ -1164,6 +1165,589 @@ TEST(CoreMLExecutionProviderTest, QuickGeluTestFp16) {
 #endif
 }
 
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesAxis1) {
+  // ai.onnx:Gather with rank-0 (scalar) 'indices'. ONNX output rank =
+  // data_rank + indices_rank - 1 = 2. The CoreML builder internally promotes
+  // indices to [1], runs gather, then squeezes the inserted axis. Pattern
+  // produced by StyleGAN-family generators (e.g. GFPGAN) that pick a
+  // per-layer style code with a scalar index.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_axis1", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  // data X: {1, 4, 8} float
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(1);
+  data_shape->add_dim()->set_dim_value(4);
+  data_shape->add_dim()->set_dim_value(8);
+
+  // output Y: {1, 8}
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  output_shape->add_dim()->set_dim_value(1);
+  output_shape->add_dim()->set_dim_value(8);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  // Scalar int64 index with value 2.
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  // No dims => rank-0 tensor.
+  idx_init.add_int64_data(2);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar", "Gather", "Gather with scalar indices",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(1));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {1, 4, 8};
+  std::vector<float> input_data(1 * 4 * 8);
+  for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast<float>(i) * 0.25f - 1.0f;
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesAxis1_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesAxis1_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesAxis0) {
+  // Scalar Gather along axis 0 — squeeze axis is 0; covers a different
+  // squeeze position than the axis=1 test.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_axis0", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  // data X: {6, 5} float
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(6);
+  data_shape->add_dim()->set_dim_value(5);
+
+  // output Y: {5}
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  output_shape->add_dim()->set_dim_value(5);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(4);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_axis0", "Gather", "Gather scalar idx axis=0",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(0));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {6, 5};
+  std::vector<float> input_data(6 * 5);
+  for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast<float>(i) - 12.5f;
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesAxis0_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesAxis0_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesNegativeAxis) {
+  // Scalar Gather with negative axis (-1) — verifies HandleNegativeAxis is
+  // applied when computing the squeeze axis.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_negative_axis", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  // data X: {2, 3, 4} float
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(2);
+  data_shape->add_dim()->set_dim_value(3);
+  data_shape->add_dim()->set_dim_value(4);
+
+  // output Y: {2, 3} (axis=-1 == axis 2; output drops that axis)
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  output_shape->add_dim()->set_dim_value(2);
+  output_shape->add_dim()->set_dim_value(3);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(1);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_neg_axis", "Gather", "Gather scalar idx axis=-1",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(-1));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {2, 3, 4};
+  std::vector<float> input_data(2 * 3 * 4);
+  for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast<float>(i) * 0.5f;
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesNegativeAxis_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesNegativeAxis_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesFloat16) {
+  // FLOAT16 'data' input. HasSupportedInputsImpl restricts fp16 Gather to
+  // MLProgram on CoreML 6+, so this test only runs the MLProgram path.
+  // Exercises the MLFloat16 branch of the static intermediate shape claim.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_fp16", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(1);
+  data_shape->add_dim()->set_dim_value(4);
+  data_shape->add_dim()->set_dim_value(8);
+
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  output_shape->add_dim()->set_dim_value(1);
+  output_shape->add_dim()->set_dim_value(8);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(2);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_fp16", "Gather", "Gather scalar idx fp16 data",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(1));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {1, 4, 8};
+  std::vector<MLFloat16> input_data;
+  input_data.reserve(1 * 4 * 8);
+  for (size_t i = 0; i < 1 * 4 * 8; ++i) {
+    input_data.emplace_back(static_cast<float>(i) * 0.25f - 1.0f);
+  }
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<MLFloat16>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesFloat16_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesInt64Data) {
+  // INT64 'data' input. HasSupportedInputsImpl allows int64 in both NN and
+  // MLProgram; verify both formats correctly route int64 through the
+  // expand/gather/squeeze chain.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_int64_data", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(3);
+  data_shape->add_dim()->set_dim_value(4);
+
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  output_shape->add_dim()->set_dim_value(4);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(1);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_int64", "Gather", "Gather scalar idx int64 data",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(0));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {3, 4};
+  std::vector<int64_t> input_data;
+  input_data.reserve(3 * 4);
+  for (int64_t i = 0; i < 3 * 4; ++i) input_data.push_back(i * 1000 - 5000);
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<int64_t>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesInt64Data_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesInt64Data_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesInt32Indices) {
+  // INT32 'indices'. The other scalar-indices tests use INT64 indices (the
+  // PyTorch default); this one exercises the INT32 branch through both the
+  // dtype gating in IsOpSupportedImpl and the indices_dtype path-through to
+  // the reshape's intermediate output dtype in AddToModelBuilderImpl.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_int32_indices", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(3);
+  data_shape->add_dim()->set_dim_value(4);
+
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  output_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(4);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT32);
+  idx_init.add_int32_data(2);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_int32_idx", "Gather", "Gather scalar int32 idx",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(0));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {3, 4};
+  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesInt32Indices_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesInt32Indices_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesRank4Data) {
+  // Rank-4 'data' input — the supported maximum for scalar Gather (the
+  // pre-squeeze intermediate is rank 4; CoreML's compiler rejects scalar
+  // Gather at rank 5 with "Invalid rank: 6"). Output is rank 3.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_rank4", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  for (int64_t d : {2, 5, 3, 4}) data_shape->add_dim()->set_dim_value(d);
+
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  // Gather on axis=1 with scalar idx removes that axis: {2,3,4}
+  for (int64_t d : {2, 3, 4}) output_shape->add_dim()->set_dim_value(d);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(3);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_rank4", "Gather", "Gather scalar idx rank-4 data",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(1));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {2, 5, 3, 4};
+  std::vector<float> input_data(2 * 5 * 3 * 4);
+  for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast<float>(i) * 0.1f - 5.0f;
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesRank4Data_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesRank4Data_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesDynamicDataNotSupported) {
+  // The scalar-indices path emits a reshape-+squeeze chain whose intermediate
+  // shape we have to claim statically. IsOpSupportedImpl rejects the node
+  // when 'data' has any unknown dim so it falls back to CPU rather than
+  // produce an ill-formed CoreML program.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_dynamic_data", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_param("N");  // dynamic leading dim
+  data_shape->add_dim()->set_dim_value(4);
+
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  output_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("N");
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(0);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_dyn", "Gather", "Gather scalar idx, dynamic data",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(1));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::None);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::None);
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesRank5DataNotSupported) {
+  // Scalar-indices Gather caps data rank at 4 (CoreML compiler reports
+  // "Invalid rank: 6"
on the rank-5 reshape+gather intermediate). Rank-5 + // 'data' must fall back to CPU. + std::unordered_map domain_to_version{{kOnnxDomain, 13}}; + onnxruntime::Model model("gather_scalar_indices_rank5", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {}, + DefaultLoggingManager().DefaultLogger()); + auto& graph = model.MainGraph(); + + ONNX_NAMESPACE::TypeProto data_type; + data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + auto* data_shape = data_type.mutable_tensor_type()->mutable_shape(); + for (int64_t d : {2, 3, 4, 5, 6}) data_shape->add_dim()->set_dim_value(d); + + ONNX_NAMESPACE::TypeProto output_type; + output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + auto* output_shape = output_type.mutable_tensor_type()->mutable_shape(); + // axis=2 with scalar idx removes that axis: {2,3,5,6} + for (int64_t d : {2, 3, 5, 6}) output_shape->add_dim()->set_dim_value(d); + + auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type); + auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type); + + ONNX_NAMESPACE::TensorProto idx_init; + idx_init.set_name("idx"); + idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + idx_init.add_int64_data(2); + graph.AddInitializedTensor(idx_init); + auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr); + + auto& node = graph.AddNode("gather_scalar_rank5", "Gather", "Gather scalar idx rank-5 data", + {&input_arg, &idx_arg}, {&output_arg}); + node.AddAttribute("axis", static_cast(2)); + + ASSERT_STATUS_OK(graph.Resolve()); + + std::string model_data; + model.ToProto().SerializeToString(&model_data); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::None); + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::None); +} + 
namespace { // Build a single-node com.microsoft:FusedConv model for the tests below. // Input X is {1, 2, 4, 4}, weight W is {3, 2, 2, 2} (constant initializer, set