diff --git a/onnxruntime/core/providers/coreml/builders/impl/gather_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gather_op_builder.cc
index 8b58f5dc6c927..54551b35649b7 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/gather_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/gather_op_builder.cc
@@ -30,27 +30,121 @@ int64_t GetAxisAttribute(const Node& node) {
 }
 
 }  // namespace
 
 Status GatherOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                              const logging::Logger& /*logger*/) const {
+                                              const logging::Logger& logger) const {
+  const auto axis = GetAxisAttribute(node);
+  const auto& data_def = *node.InputDefs()[0];
+  const auto& indices_def = *node.InputDefs()[1];
+  const auto& output_def = *node.OutputDefs()[0];
+
+  std::vector<int64_t> data_shape, indices_shape;
+  ORT_RETURN_IF_NOT(GetShape(data_def, data_shape, logger), "Failed to get 'data' shape");
+  ORT_RETURN_IF_NOT(GetShape(indices_def, indices_shape, logger), "Failed to get 'indices' shape");
+
+  // ONNX Gather: out_shape = data_shape[:axis] + indices_shape + data_shape[axis+1:]
+  // CoreML's gather requires rank-1+ indices, so for scalar indices we promote
+  // them to [1], gather, and then squeeze the resulting axis to restore the
+  // original output rank. The positive axis after wrapping is needed for the
+  // squeeze axis below regardless of path.
+  const bool scalar_indices = indices_shape.empty();
+  const int64_t pos_axis = HandleNegativeAxis(axis, data_shape.size());
+
   if (model_builder.CreateMLProgram()) {
     using CoreML::Specification::MILSpec::Operation;
-    std::unique_ptr<Operation> op = model_builder.CreateOperation(node, "gather");
-
-    const auto axis = GetAxisAttribute(node);
+    // IsOpSupportedImpl gates indices to INT32 or INT64, so we can pass the
+    // dtype straight through to the reshape's intermediate output.
+    int32_t indices_dtype{};
+    ORT_RETURN_IF_NOT(GetType(indices_def, indices_dtype, logger),
+                      "Failed to get 'indices' dtype");
+    const int32_t output_dtype = static_cast<int32_t>(output_def.TypeAsProto()->tensor_type().elem_type());
+
+    std::string indices_name = indices_def.Name();
+
+    if (scalar_indices) {
+      // [] -> [1] via reshape. We use reshape rather than expand_dims because
+      // CoreML internally pads scalars; expand_dims on the padded tensor can
+      // push the apparent rank past the rank-5 limit on high-rank `data`.
+      auto reshape = model_builder.CreateOperation(node, "reshape", "indices");
+      AddOperationInput(*reshape, "x", indices_def.Name());
+      const std::vector<int64_t> indices_1d_shape = {1};
+      AddOperationInput(*reshape, "shape",
+                        model_builder.AddConstant(reshape->type(), "shape", indices_1d_shape));
+
+      indices_name = model_builder.GetUniqueName(node, "indices_1d");
+      AddIntermediateOperationOutput(*reshape, indices_name, indices_dtype, indices_1d_shape);
+      model_builder.AddOperation(std::move(reshape));
+    }
+
+    std::unique_ptr<Operation> gather = model_builder.CreateOperation(node, "gather");
     // coreml docs claims validate_indices is optional but in practice it is required
     const auto validate_indices = false;
-    AddOperationInput(*op, "x", node.InputDefs()[0]->Name());                                   // data
-    AddOperationInput(*op, "indices", node.InputDefs()[1]->Name());                             // indices
-    AddOperationInput(*op, "axis", model_builder.AddScalarConstant(op->type(), "axis", axis));  // axis attr
-    AddOperationInput(*op, "validate_indices", model_builder.AddScalarConstant(op->type(), "validate_indices", validate_indices));
-    AddOperationOutput(*op, *node.OutputDefs()[0]);  // output
-    model_builder.AddOperation(std::move(op));
+    AddOperationInput(*gather, "x", data_def.Name());
+    AddOperationInput(*gather, "indices", indices_name);
+    AddOperationInput(*gather, "axis", model_builder.AddScalarConstant(gather->type(), "axis", axis));
+    AddOperationInput(*gather, "validate_indices",
+                      model_builder.AddScalarConstant(gather->type(), "validate_indices", validate_indices));
+
+    if (!scalar_indices) {
+      AddOperationOutput(*gather, output_def);
+      model_builder.AddOperation(std::move(gather));
+    } else {
+      // gather output here has the data's rank (one more than ONNX scalar-gather output);
+      // squeeze the inserted axis to recover the original output shape.
+      std::vector<int64_t> gather_shape = data_shape;
+      gather_shape[pos_axis] = 1;
+      const std::string& gather_out_name = model_builder.GetUniqueName(node, "gather_out");
+      AddIntermediateOperationOutput(*gather, gather_out_name, output_dtype, gather_shape);
+      model_builder.AddOperation(std::move(gather));
+
+      auto squeeze = model_builder.CreateOperation(node, "squeeze", "post");
+      AddOperationInput(*squeeze, "x", gather_out_name);
+      const std::vector<int64_t> sq_axes = {pos_axis};
+      AddOperationInput(*squeeze, "axes", model_builder.AddConstant(squeeze->type(), "axes", sq_axes));
+      AddOperationOutput(*squeeze, output_def);
+      model_builder.AddOperation(std::move(squeeze));
+    }
   } else {
-    auto layer = model_builder.CreateNNLayer(node);
-    layer->mutable_gather()->set_axis(GetAxisAttribute(node));
-    *layer->mutable_input()->Add() = node.InputDefs()[0]->Name();    // data
-    *layer->mutable_input()->Add() = node.InputDefs()[1]->Name();    // indices
-    *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();  // output
-    model_builder.AddLayer(std::move(layer));
+    if (!scalar_indices) {
+      auto layer = model_builder.CreateNNLayer(node);
+      layer->mutable_gather()->set_axis(axis);
+      *layer->mutable_input()->Add() = data_def.Name();
+      *layer->mutable_input()->Add() = indices_def.Name();
+      *layer->mutable_output()->Add() = output_def.Name();
+      model_builder.AddLayer(std::move(layer));
+    } else {
+      // expand_dims indices: [] -> [1]. Unlike the MLProgram reshape path
+      // above, NN's expand_dims doesn't internally pad rank, so we don't run
+      // into the apparent-rank inflation that forced reshape+gather there;
+      // expand_dims is the natural choice on this path.
+      const std::string& indices_1d_name = model_builder.GetUniqueName(node, "indices_1d");
+      {
+        auto expand_layer = model_builder.CreateNNLayer(node, "_indices_expand");
+        expand_layer->mutable_expanddims()->add_axes(0);
+        *expand_layer->mutable_input()->Add() = indices_def.Name();
+        *expand_layer->mutable_output()->Add() = indices_1d_name;
+        model_builder.AddLayer(std::move(expand_layer));
+      }
+
+      // gather with the promoted indices
+      const std::string& gather_out_name = model_builder.GetUniqueName(node, "gather_out");
+      {
+        auto gather_layer = model_builder.CreateNNLayer(node);
+        gather_layer->mutable_gather()->set_axis(axis);
+        *gather_layer->mutable_input()->Add() = data_def.Name();
+        *gather_layer->mutable_input()->Add() = indices_1d_name;
+        *gather_layer->mutable_output()->Add() = gather_out_name;
+        model_builder.AddLayer(std::move(gather_layer));
+      }
+
+      // squeeze the inserted axis
+      {
+        auto squeeze_layer = model_builder.CreateNNLayer(node, "_post_squeeze");
+        squeeze_layer->mutable_squeeze()->add_axes(pos_axis);
+        squeeze_layer->mutable_squeeze()->set_squeezeall(false);
+        *squeeze_layer->mutable_input()->Add() = gather_out_name;
+        *squeeze_layer->mutable_output()->Add() = output_def.Name();
+        model_builder.AddLayer(std::move(squeeze_layer));
+      }
+    }
   }
   return Status::OK();
 }
@@ -87,14 +181,45 @@ bool GatherOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa
     return false;
   }
 
-  // Don't allow scalar 'indices' input.
-  // We convert scalar inputs to tensors with shape [1] before providing them to CoreML.
-  // This modification changes the shape of the Gather output.
-  if (indices_shape.empty()) {
-    LOGS(logger, VERBOSE) << "Gather does not support scalar 'indices'";
+  // ONNX Gather schema constrains indices to int32 or int64. Validate here so
+  // AddToModelBuilderImpl can trust the dtype rather than silently defaulting
+  // on an unexpected value.
+  int32_t indices_dtype{};
+  if (!GetType(*node.InputDefs()[1], indices_dtype, logger)) {
     return false;
   }
+  if (indices_dtype != ONNX_NAMESPACE::TensorProto_DataType_INT32 &&
+      indices_dtype != ONNX_NAMESPACE::TensorProto_DataType_INT64) {
+    LOGS(logger, VERBOSE) << "Gather 'indices' dtype [" << indices_dtype
+                          << "] is not supported (expected INT32 or INT64)";
+    return false;
+  }
+
+  // For scalar indices we internally emit gather with promoted [1] indices
+  // then squeeze. That requires us to claim a static intermediate shape, so
+  // we only handle scalar indices when the data shape itself is fully
+  // static. (Dynamic-shape scalar Gather still falls back to CPU.)
+  if (indices_shape.empty()) {
+    if (!IsStaticShape(data_shape)) {
+      LOGS(logger, VERBOSE) << "Gather with scalar 'indices' requires static 'data' shape";
+      return false;
+    }
+    // The pre-squeeze intermediate has the same rank as `data`. CoreML's
+    // compiler reports "Invalid rank: 6" when a rank-5 intermediate is
+    // produced via reshape+gather, even though rank-5 intermediates are
+    // accepted in other op chains. Cap scalar-indices Gather at data rank 4
+    // until that compiler limit is lifted.
+    //
+    // TODO: re-test on newer macOS / CoreML versions; if Apple lifts the
+    // intermediate rank limit, this cap can be raised to 5 (matching the
+    // general Gather output-rank check below).
+    if (data_shape.size() > 4) {
+      LOGS(logger, VERBOSE) << "Gather with scalar 'indices' supports 'data' rank up to 4";
+      return false;
+    }
+  }
 
+  // Output rank = data_rank + indices_rank - 1. The rank-5 limit applies.
   if (data_shape.size() + indices_shape.size() - 1 > 5) {
     LOGS(logger, VERBOSE) << "Gather does not support output with rank greater than 5";
     return false;
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index b6e1545d6f319..0ff5f54a31159 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -240,9 +240,10 @@ TEST(CoreMLExecutionProviderTest, ArgMaxUnsupportedCastTest) {
 }
 
 TEST(CoreMLExecutionProviderTest, GatherWithScalarIndices) {
-  // For scalar inputs, the input shape is modified from [] -> [1] before passing the input to CoreML.
-  // This won't work for Gather because the output shape depends on the `indices` input shape which could be a scalar.
-  // Currently, we expect the CoreML EP to only take the Shape node in this graph (Gather -> Shape).
+  // The CoreML EP supports scalar 'indices' for Gather only when the 'data' input has a fully
+  // static shape (it needs to claim a static intermediate shape for the post-gather squeeze).
+  // This model's 'data' input is dynamic ([M, N, K]) so Gather still falls back to CPU and the
+  // CoreML EP only takes the Shape node.
   const auto model_file_name = ORT_TSTR("testdata/gather_with_scalar_indices_then_shape.onnx");
 
 #if defined(__APPLE__)
@@ -1164,6 +1165,589 @@ TEST(CoreMLExecutionProviderTest, QuickGeluTestFp16) {
 #endif
 }
 
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesAxis1) {
+  // ai.onnx:Gather with rank-0 (scalar) 'indices'. ONNX output rank =
+  // data_rank + indices_rank - 1 = 2. The CoreML builder internally promotes
+  // indices to [1], runs gather, then squeezes the inserted axis. Pattern
+  // produced by StyleGAN-family generators (e.g. GFPGAN) that pick a
+  // per-layer style code with a scalar index.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_axis1", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  // data X: {1, 4, 8} float
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(1);
+  data_shape->add_dim()->set_dim_value(4);
+  data_shape->add_dim()->set_dim_value(8);
+
+  // output Y: {1, 8}
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  output_shape->add_dim()->set_dim_value(1);
+  output_shape->add_dim()->set_dim_value(8);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  // Scalar int64 index with value 2.
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  // No dims => rank-0 tensor.
+  idx_init.add_int64_data(2);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar", "Gather", "Gather with scalar indices",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(1));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {1, 4, 8};
+  std::vector<float> input_data(1 * 4 * 8);
+  for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast<float>(i) * 0.25f - 1.0f;
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesAxis1_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesAxis1_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesAxis0) {
+  // Scalar Gather along axis 0 — squeeze axis is 0; covers a different
+  // squeeze position than the axis=1 test.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_axis0", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  // data X: {6, 5} float
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(6);
+  data_shape->add_dim()->set_dim_value(5);
+
+  // output Y: {5}
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  output_shape->add_dim()->set_dim_value(5);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(4);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_axis0", "Gather", "Gather scalar idx axis=0",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(0));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {6, 5};
+  std::vector<float> input_data(6 * 5);
+  for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast<float>(i) - 12.5f;
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesAxis0_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesAxis0_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesNegativeAxis) {
+  // Scalar Gather with negative axis (-1) — verifies HandleNegativeAxis is
+  // applied when computing the squeeze axis.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_negative_axis", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  // data X: {2, 3, 4} float
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(2);
+  data_shape->add_dim()->set_dim_value(3);
+  data_shape->add_dim()->set_dim_value(4);
+
+  // output Y: {2, 3} (axis=-1 == axis 2; output drops that axis)
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  output_shape->add_dim()->set_dim_value(2);
+  output_shape->add_dim()->set_dim_value(3);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(1);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_neg_axis", "Gather", "Gather scalar idx axis=-1",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(-1));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {2, 3, 4};
+  std::vector<float> input_data(2 * 3 * 4);
+  for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast<float>(i) * 0.5f;
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesNegativeAxis_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesNegativeAxis_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesFloat16) {
+  // FLOAT16 'data' input. HasSupportedInputsImpl restricts fp16 Gather to
+  // MLProgram on CoreML 6+, so this test only runs the MLProgram path.
+  // Exercises the MLFloat16 branch of the static intermediate shape claim.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_fp16", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(1);
+  data_shape->add_dim()->set_dim_value(4);
+  data_shape->add_dim()->set_dim_value(8);
+
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  output_shape->add_dim()->set_dim_value(1);
+  output_shape->add_dim()->set_dim_value(8);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(2);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_fp16", "Gather", "Gather scalar idx fp16 data",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(1));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {1, 4, 8};
+  std::vector<MLFloat16> input_data;
+  input_data.reserve(1 * 4 * 8);
+  for (size_t i = 0; i < 1 * 4 * 8; ++i) {
+    input_data.emplace_back(static_cast<float>(i) * 0.25f - 1.0f);
+  }
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<MLFloat16>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesFloat16_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesInt64Data) {
+  // INT64 'data' input. HasSupportedInputsImpl allows int64 in both NN and
+  // MLProgram; verify both formats correctly route int64 through the
+  // expand/gather/squeeze chain.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_int64_data", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(3);
+  data_shape->add_dim()->set_dim_value(4);
+
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  output_shape->add_dim()->set_dim_value(4);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(1);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_int64", "Gather", "Gather scalar idx int64 data",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(0));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {3, 4};
+  std::vector<int64_t> input_data;
+  input_data.reserve(3 * 4);
+  for (int64_t i = 0; i < 3 * 4; ++i) input_data.push_back(i * 1000 - 5000);
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<int64_t>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesInt64Data_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesInt64Data_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesInt32Indices) {
+  // INT32 'indices'. The other scalar-indices tests use INT64 indices (the
+  // PyTorch default); this one exercises the INT32 branch through both the
+  // dtype gating in IsOpSupportedImpl and the indices_dtype path-through to
+  // the reshape's intermediate output dtype in AddToModelBuilderImpl.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_int32_indices", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_value(3);
+  data_shape->add_dim()->set_dim_value(4);
+
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  output_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(4);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT32);
+  idx_init.add_int32_data(2);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_int32_idx", "Gather", "Gather scalar int32 idx",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(0));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {3, 4};
+  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesInt32Indices_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesInt32Indices_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesRank4Data) {
+  // Rank-4 'data' input — the supported maximum for scalar Gather (the
+  // pre-squeeze intermediate is rank 4; CoreML's compiler rejects scalar
+  // Gather at rank 5 with "Invalid rank: 6"). Output is rank 3.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_rank4", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  for (int64_t d : {2, 5, 3, 4}) data_shape->add_dim()->set_dim_value(d);
+
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* output_shape = output_type.mutable_tensor_type()->mutable_shape();
+  // Gather on axis=1 with scalar idx removes that axis: {2,3,4}
+  for (int64_t d : {2, 3, 4}) output_shape->add_dim()->set_dim_value(d);
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(3);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_rank4", "Gather", "Gather scalar idx rank-4 data",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(1));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+#if defined(__APPLE__)
+  std::vector<int64_t> dims = {2, 5, 3, 4};
+  std::vector<float> input_data(2 * 5 * 3 * 4);
+  for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast<float>(i) * 0.1f - 5.0f;
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, dims, input_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesRank4Data_NN",
+                            MakeCoreMLExecutionProvider(),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+  RunAndVerifyOutputsWithEP(model_span, "GatherScalarIndicesRank4Data_MLProgram",
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesDynamicDataNotSupported) {
+  // The scalar-indices path emits a reshape-+squeeze chain whose intermediate
+  // shape we have to claim statically. IsOpSupportedImpl rejects the node
+  // when 'data' has any unknown dim so it falls back to CPU rather than
+  // produce an ill-formed CoreML program.
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}};
+  onnxruntime::Model model("gather_scalar_indices_dynamic_data", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto data_type;
+  data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  auto* data_shape = data_type.mutable_tensor_type()->mutable_shape();
+  data_shape->add_dim()->set_dim_param("N");  // dynamic leading dim
+  data_shape->add_dim()->set_dim_value(4);
+
+  ONNX_NAMESPACE::TypeProto output_type;
+  output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  output_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("N");
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type);
+
+  ONNX_NAMESPACE::TensorProto idx_init;
+  idx_init.set_name("idx");
+  idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  idx_init.add_int64_data(0);
+  graph.AddInitializedTensor(idx_init);
+  auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr);
+
+  auto& node = graph.AddNode("gather_scalar_dyn", "Gather", "Gather scalar idx, dynamic data",
+                             {&input_arg, &idx_arg}, {&output_arg});
+  node.AddAttribute("axis", static_cast<int64_t>(1));
+
+  ASSERT_STATUS_OK(graph.Resolve());
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::None);
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::None);
+}
+
+TEST(CoreMLExecutionProviderTest, GatherScalarIndicesRank5DataNotSupported) {
+  // Scalar-indices Gather caps data rank at 4 (CoreML compiler reports
+  // "Invalid rank: 6"
on the rank-5 reshape+gather intermediate). Rank-5 + // 'data' must fall back to CPU. + std::unordered_map domain_to_version{{kOnnxDomain, 13}}; + onnxruntime::Model model("gather_scalar_indices_rank5", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {}, + DefaultLoggingManager().DefaultLogger()); + auto& graph = model.MainGraph(); + + ONNX_NAMESPACE::TypeProto data_type; + data_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + auto* data_shape = data_type.mutable_tensor_type()->mutable_shape(); + for (int64_t d : {2, 3, 4, 5, 6}) data_shape->add_dim()->set_dim_value(d); + + ONNX_NAMESPACE::TypeProto output_type; + output_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + auto* output_shape = output_type.mutable_tensor_type()->mutable_shape(); + // axis=2 with scalar idx removes that axis: {2,3,5,6} + for (int64_t d : {2, 3, 5, 6}) output_shape->add_dim()->set_dim_value(d); + + auto& input_arg = graph.GetOrCreateNodeArg("X", &data_type); + auto& output_arg = graph.GetOrCreateNodeArg("Y", &output_type); + + ONNX_NAMESPACE::TensorProto idx_init; + idx_init.set_name("idx"); + idx_init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + idx_init.add_int64_data(2); + graph.AddInitializedTensor(idx_init); + auto& idx_arg = graph.GetOrCreateNodeArg("idx", nullptr); + + auto& node = graph.AddNode("gather_scalar_rank5", "Gather", "Gather scalar idx rank-5 data", + {&input_arg, &idx_arg}, {&output_arg}); + node.AddAttribute("axis", static_cast(2)); + + ASSERT_STATUS_OK(graph.Resolve()); + + std::string model_data; + model.ToProto().SerializeToString(&model_data); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + TestModelLoad(model_span, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::None); + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::None); +} + 
namespace { // Build a single-node com.microsoft:FusedConv model for the tests below. // Input X is {1, 2, 4, 4}, weight W is {3, 2, 2, 2} (constant initializer, set