diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index e35e3c5753d36..99a75438bf704 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -1083,6 +1083,7 @@ def ortvalue_from_shape_and_type( device_type: str = "cpu", device_id: int = 0, vendor_id: int | OrtDeviceVendorId = -1, + memory_info: C.OrtMemoryInfo | None = None, ) -> OrtValue: """ Factory method to construct an OrtValue (which holds a Tensor) from given shape and element_type @@ -1092,8 +1093,31 @@ def ortvalue_from_shape_and_type( :param device_type: e.g. cpu, cuda, cann, cpu by default :param device_id: device id, e.g. 0 :param vendor_id: The device's PCI vendor id as an int or OrtDeviceVendorId. If provided, the device type should be "gpu" or "npu". + :param memory_info: An OrtMemoryInfo from an OrtEpDevice (e.g. via ep_device.memory_info(OrtDeviceMemoryType.HOST_ACCESSIBLE)). When provided, the allocator matching this memory info is used directly, which allows allocating HOST_ACCESSIBLE memory for zero-copy numpy interop. The device_type, device_id, and vendor_id parameters are ignored when memory_info is provided; a warning is emitted if any of them is set to a non-default value. A RuntimeError is raised if no shared allocator is registered for the given memory_info. """ + if memory_info is not None: + if device_type != "cpu" or device_id != 0 or vendor_id != -1: + warnings.warn( + "device_type, device_id, and vendor_id are ignored when memory_info is provided.", + stacklevel=2, + ) + if isinstance(element_type, int): + return cls( + C.OrtValue.ortvalue_from_shape_and_onnx_type_for_memory_info( + shape, + element_type, + memory_info, + ) + ) + return cls( + C.OrtValue.ortvalue_from_shape_and_type_for_memory_info( + shape, + element_type, + memory_info, + ) + ) + device = OrtDevice.make(device_type, device_id, vendor_id)._get_c_device() # Integer for onnx element type (see https://onnx.ai/onnx/api/mapping.html).
diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc index 168d57fc0827b..4f44fc327c59b 100644 --- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc @@ -74,6 +74,29 @@ std::unique_ptr OrtValueFromShapeAndType(const std::vector& s Tensor::InitOrtValue(element_type, gsl::make_span(shape), std::move(allocator), *ml_value); return ml_value; } + +// Allocate an OrtValue using the shared allocator matching the given OrtMemoryInfo; throws std::runtime_error if none is registered. +// This allows callers to specify the exact memory type (e.g. HOST_ACCESSIBLE) rather than +// relying on OrtDevice.make() which always uses DEFAULT. +// +// Uses the full OrtMemoryInfo for the lookup (including mem_type) rather than just the OrtDevice, +// because the registered allocator's OrtMemoryInfo has a specific mem_type (e.g. OrtMemTypeCPU +// for HOST_ACCESSIBLE) that must match for FindExistingAllocator to succeed. +std::unique_ptr OrtValueFromShapeAndTypeWithMemoryInfo(const std::vector& shape, + MLDataType element_type, + const OrtMemoryInfo& memory_info) { + auto& env = GetOrtEnv()->GetEnvironment(); + AllocatorPtr allocator = env.GetRegisteredSharedAllocator(memory_info); + + if (!allocator) { + throw std::runtime_error("No shared allocator found for: " + memory_info.ToString()); + } + + auto ml_value = std::make_unique(); + Tensor::InitOrtValue(element_type, gsl::make_span(shape), std::move(allocator), *ml_value); + return ml_value; +} + } // namespace void addOrtValueMethods(pybind11::module& m) { @@ -289,6 +312,32 @@ void addOrtValueMethods(pybind11::module& m) { auto element_type = OnnxTypeToOnnxRuntimeTensorType(onnx_element_type); return OrtValueFromShapeAndType(shape, element_type, device); }) + // Factory methods to create an OrtValue using an OrtMemoryInfo to select the allocator. + // This enables allocation with a specific memory type (e.g. HOST_ACCESSIBLE) from plugin EPs.
+ .def_static("ortvalue_from_shape_and_type_for_memory_info", [](const std::vector& shape, py::object& numpy_element_type, const OrtMemoryInfo& memory_info) -> std::unique_ptr { + PyArray_Descr* dtype; + if (!PyArray_DescrConverter(numpy_element_type.ptr(), &dtype)) { + throw std::runtime_error("Not a valid numpy type"); + } + + int type_num = dtype->type_num; + Py_DECREF(dtype); + + if (!IsNumericNumpyType(type_num)) { + throw std::runtime_error("Creation of OrtValues is currently only supported from non-string numpy arrays"); + } + + auto element_type = NumpyTypeToOnnxRuntimeTensorType(type_num); + return OrtValueFromShapeAndTypeWithMemoryInfo(shape, element_type, memory_info); + }) + .def_static("ortvalue_from_shape_and_onnx_type_for_memory_info", [](const std::vector& shape, int32_t onnx_element_type, const OrtMemoryInfo& memory_info) -> std::unique_ptr { + if (onnx_element_type == onnx::TensorProto_DataType::TensorProto_DataType_STRING) { + throw std::runtime_error("Creation of OrtValues is currently only supported from non-string numpy arrays"); + } + + auto element_type = OnnxTypeToOnnxRuntimeTensorType(onnx_element_type); + return OrtValueFromShapeAndTypeWithMemoryInfo(shape, element_type, memory_info); + }) #if !defined(DISABLE_SPARSE_TENSORS) .def_static("ort_value_from_sparse_tensor", [](const PySparseTensor* py_sparse_tensor) -> std::unique_ptr { diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index f7be1717ffa4e..625c9b0e7d084 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -278,13 +278,12 @@ py::object GetPyObjFromTensor(const OrtValue& ort_value, return py::cast(result); } - const auto device_type = device.Type(); // Create a numpy array on top of the OrtValue memory, no copy, // but only when the tensor owns the buffer. When the tensor wraps external // memory (e.g. 
a numpy input array passed through as output), the buffer // lifetime is not tied to the OrtValue and zero-copy would create a // dangling pointer. See https://github.com/microsoft/onnxruntime/issues/21922 - if (device_type == OrtDevice::CPU) { + if (device.UsesCpuMemory()) { if (tensor.OwnsBuffer() || zero_copy_non_owning) { py::array result = PrimitiveTensorToNumpyOverOrtValue(ort_value); return py::cast(result); diff --git a/onnxruntime/test/autoep/library/example_plugin_ep/ep_factory.cc b/onnxruntime/test/autoep/library/example_plugin_ep/ep_factory.cc index 6137b23111bf9..96f2d5d5f2acd 100644 --- a/onnxruntime/test/autoep/library/example_plugin_ep/ep_factory.cc +++ b/onnxruntime/test/autoep/library/example_plugin_ep/ep_factory.cc @@ -19,7 +19,8 @@ ExampleEpFactory::ExampleEpFactory(const char* ep_name, ApiPtrs apis, const OrtL default_logger_{default_logger}, ep_name_{ep_name}, default_memory_info_{nullptr}, - readonly_memory_info_{nullptr} { + readonly_memory_info_{nullptr}, + host_accessible_memory_info_{nullptr} { ort_version_supported = ORT_API_VERSION; // set to the ORT version we were compiled with. GetName = GetNameImpl; GetVendor = GetVendorImpl; @@ -71,12 +72,12 @@ ExampleEpFactory::ExampleEpFactory(const char* ep_name, ApiPtrs apis, const OrtL // HOST_ACCESSIBLE memory example. use the non-CPU device type so it's clear which device the memory is also // accessible from. we infer from the type of HOST_ACCESSIBLE that it's CPU accessible. 
- auto host_accessible_memory_info = Ort::MemoryInfo{"ExampleEP GPU pinned", - OrtMemoryInfoDeviceType_GPU, - /*vendor*/ 0xBE57, /* device_id */ 0, - OrtDeviceMemoryType_HOST_ACCESSIBLE, - /*alignment*/ 0, - OrtAllocatorType::OrtDeviceAllocator}; + host_accessible_memory_info_ = Ort::MemoryInfo{"ExampleEP GPU pinned", + OrtMemoryInfoDeviceType_GPU, + /*vendor*/ 0xBE57, /* device_id */ 0, + OrtDeviceMemoryType_HOST_ACCESSIBLE, + /*alignment*/ 0, + OrtAllocatorType::OrtDeviceAllocator}; // Custom Op Domains custom_op_domains_[0] = Ort::CustomOpDomain{"test"}; custom_op_domains_[1] = Ort::CustomOpDomain{"test2"}; @@ -156,10 +157,11 @@ OrtStatus* ORT_API_CALL ExampleEpFactory::GetSupportedDevicesImpl(OrtEpFactory* } // register the allocator info required by the EP. - // registering OrtMemoryInfo for host accessible memory would be done in an additional call. // OrtReadOnlyAllocator + OrtDeviceMemoryType_DEFAULT allocator for use with initializers is optional. + // OrtDeviceMemoryType_HOST_ACCESSIBLE is also optional and exposes CPU-accessible memory on the EP device. 
RETURN_IF_ERROR(factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, factory->default_memory_info_)); RETURN_IF_ERROR(factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, factory->readonly_memory_info_)); + RETURN_IF_ERROR(factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, factory->host_accessible_memory_info_)); ep_devices[num_ep_devices++] = ep_device; } @@ -244,8 +246,9 @@ OrtStatus* ORT_API_CALL ExampleEpFactory::CreateAllocatorImpl(OrtEpFactory* this bool is_default_allocator = memory_info == factory.default_memory_info_; bool is_readonly_allocator = memory_info == factory.readonly_memory_info_; + bool is_host_accessible_allocator = memory_info == factory.host_accessible_memory_info_; - if (!is_default_allocator && !is_readonly_allocator) { + if (!is_default_allocator && !is_readonly_allocator && !is_host_accessible_allocator) { return factory.ort_api.CreateStatus(ORT_INVALID_ARGUMENT, "INTERNAL ERROR! Unknown memory info provided to CreateAllocator. " "Value did not come directly from an OrtEpDevice returned by this factory."); @@ -261,9 +264,10 @@ OrtStatus* ORT_API_CALL ExampleEpFactory::CreateAllocatorImpl(OrtEpFactory* this // You are of course free to have completely different settings. // the read-only allocator is used for initializers. we don't need an arena for that. - if (is_readonly_allocator) { - auto read_only_allocator = std::make_unique(memory_info, factory); - *allocator = read_only_allocator.release(); + // host-accessible memory is also returned via a plain non-arena allocator. 
+ if (is_readonly_allocator || is_host_accessible_allocator) { + auto simple_allocator = std::make_unique(memory_info, factory); + *allocator = simple_allocator.release(); return nullptr; } diff --git a/onnxruntime/test/autoep/library/example_plugin_ep/ep_factory.h b/onnxruntime/test/autoep/library/example_plugin_ep/ep_factory.h index 91478047afb0a..ce2d1605a2528 100644 --- a/onnxruntime/test/autoep/library/example_plugin_ep/ep_factory.h +++ b/onnxruntime/test/autoep/library/example_plugin_ep/ep_factory.h @@ -114,6 +114,7 @@ class ExampleEpFactory : public OrtEpFactory, public ApiPtrs { // CPU allocator so we can control the arena behavior. optional as ORT always provides a CPU allocator if needed. Ort::MemoryInfo default_memory_info_; Ort::MemoryInfo readonly_memory_info_; // used for initializers + Ort::MemoryInfo host_accessible_memory_info_; bool arena_allocator_using_default_settings_{true}; std::unique_ptr arena_allocator_; // shared device allocator that uses an arena diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py index a24269a312e9b..4aef416e9b918 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_autoep.py +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -9,6 +9,7 @@ from collections.abc import Sequence import numpy as np +import onnx from autoep_helper import AutoEpTestCase from helper import get_name @@ -23,6 +24,8 @@ class TestAutoEP(AutoEpTestCase): + EXAMPLE_EP_NAME = "example_ep" + def test_cuda_ep_register_and_inference(self): """ Test registration of CUDA EP, adding its OrtDevice to the SessionOptions, and running inference. 
@@ -341,6 +344,145 @@ def test_copy_tensors(self): self.unregister_execution_provider_library(ep_name) + def _register_example_plugin_ep_or_skip(self): + """Register the example plugin EP and return its OrtEpDevice, or skip the test.""" + if sys.platform != "win32": + self.skipTest("Skipping test because device discovery is only supported on Windows") + + try: + ep_lib_path = get_name("example_plugin_ep.dll") + except FileNotFoundError: + self.skipTest("Skipping test because example_plugin_ep.dll cannot be found") + + self.register_execution_provider_library(self.EXAMPLE_EP_NAME, os.path.realpath(ep_lib_path)) + + ep_device = next( + (d for d in onnxrt.get_ep_devices() if d.ep_name == self.EXAMPLE_EP_NAME), + None, + ) + self.assertIsNotNone(ep_device, f"Could not find OrtEpDevice for registered EP '{self.EXAMPLE_EP_NAME}'") + return ep_device + + def test_ortvalue_from_shape_and_type_host_accessible_numpy_dtype(self): + ep_device = self._register_example_plugin_ep_or_skip() + mem_info = ep_device.memory_info(onnxrt.OrtDeviceMemoryType.HOST_ACCESSIBLE) + self.assertIsNotNone(mem_info) + + ort_value = onnxrt.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, memory_info=mem_info) + + self.assertEqual(ort_value.shape(), [3, 2]) + self.assertEqual(ort_value.data_type(), "tensor(float)") + # The example EP advertises HOST_ACCESSIBLE on a fake GPU device, so the allocator + # came from memory_info rather than the default CPU path.
+ self.assertNotEqual(ort_value.device_name().lower(), "cpu") + + result = ort_value.numpy() + self.assertEqual(result.shape, (3, 2)) + self.assertEqual(result.dtype, np.float32) + + del result, ort_value  # result is a zero-copy view over EP-allocated memory; release it before unregistering the library + self.unregister_execution_provider_library(self.EXAMPLE_EP_NAME) + + def test_ortvalue_from_shape_and_type_host_accessible_onnx_int_type(self): + ep_device = self._register_example_plugin_ep_or_skip() + mem_info = ep_device.memory_info(onnxrt.OrtDeviceMemoryType.HOST_ACCESSIBLE) + + ort_value = onnxrt.OrtValue.ortvalue_from_shape_and_type( + [4], onnx.TensorProto.FLOAT, memory_info=mem_info + ) + + self.assertEqual(ort_value.shape(), [4]) + self.assertEqual(ort_value.data_type(), "tensor(float)") + self.assertEqual(ort_value.numpy().dtype, np.float32) + + del ort_value + self.unregister_execution_provider_library(self.EXAMPLE_EP_NAME) + + def test_ortvalue_host_accessible_zero_copy_numpy_view(self): + # A write through one numpy() view must be visible through a later numpy() call - if numpy() ever copies, + # this test fails and the UsesCpuMemory() zero-copy guarantee has regressed.
+ ep_device = self._register_example_plugin_ep_or_skip() + mem_info = ep_device.memory_info(onnxrt.OrtDeviceMemoryType.HOST_ACCESSIBLE) + + ort_value = onnxrt.OrtValue.ortvalue_from_shape_and_type([2, 3], np.float32, memory_info=mem_info) + ort_value.numpy().fill(7.5) + np.testing.assert_array_equal(ort_value.numpy(), np.full((2, 3), 7.5, dtype=np.float32)) + + cpu_value = onnxrt.OrtValue.ortvalue_from_shape_and_type([2, 3], np.float32) + cpu_value.numpy().fill(-1.25) + np.testing.assert_array_equal(cpu_value.numpy(), np.full((2, 3), -1.25, dtype=np.float32)) + + del ort_value + del cpu_value + self.unregister_execution_provider_library(self.EXAMPLE_EP_NAME) + + def test_ortvalue_from_shape_and_type_memory_info_no_allocator(self): + bogus_mem_info = onnxrt.OrtMemoryInfo.create_v2( + "Bogus", + onnxrt.OrtMemoryInfoDeviceType.GPU, + 0xDEAD, + 0, + onnxrt.OrtDeviceMemoryType.HOST_ACCESSIBLE, + 0, + onnxrt.OrtAllocatorType.ORT_DEVICE_ALLOCATOR, + ) + + with self.assertRaisesRegex(RuntimeError, "No shared allocator found"): + onnxrt.OrtValue.ortvalue_from_shape_and_type([2], np.float32, memory_info=bogus_mem_info) + + def test_ortvalue_from_shape_and_onnx_type_memory_info_string_rejected(self): + ep_device = self._register_example_plugin_ep_or_skip() + mem_info = ep_device.memory_info(onnxrt.OrtDeviceMemoryType.HOST_ACCESSIBLE) + + with self.assertRaisesRegex(RuntimeError, "non-string numpy arrays"): + onnxrt.OrtValue.ortvalue_from_shape_and_type( + [2], onnx.TensorProto.STRING, memory_info=mem_info + ) + + self.unregister_execution_provider_library(self.EXAMPLE_EP_NAME) + + def test_ortvalue_from_shape_and_type_memory_info_overrides_device_args(self): + ep_device = self._register_example_plugin_ep_or_skip() + mem_info = ep_device.memory_info(onnxrt.OrtDeviceMemoryType.HOST_ACCESSIBLE) + + # Bogus device args alongside a valid memory_info: if the wrapper ever stops ignoring + # them, this would fail (unknown device) or silently allocate elsewhere.
+ ort_value = onnxrt.OrtValue.ortvalue_from_shape_and_type( + [3], + np.float32, + device_type="cuda", + device_id=99, + vendor_id=0xFFFF, + memory_info=mem_info, + ) + + ort_value_baseline = onnxrt.OrtValue.ortvalue_from_shape_and_type([3], np.float32, memory_info=mem_info) + self.assertEqual(ort_value.device_name(), ort_value_baseline.device_name()) + + del ort_value + del ort_value_baseline + self.unregister_execution_provider_library(self.EXAMPLE_EP_NAME) + + def test_ortvalue_from_shape_and_type_default_memory_info(self): + # Pins the false-branch of UsesCpuMemory(): DEFAULT memory on a non-CPU device must + # round-trip through data_transfer rather than the zero-copy view path. + ep_device = self._register_example_plugin_ep_or_skip() + mem_info = ep_device.memory_info(onnxrt.OrtDeviceMemoryType.DEFAULT) + self.assertIsNotNone(mem_info) + + ort_value = onnxrt.OrtValue.ortvalue_from_shape_and_type([2, 3], np.float32, memory_info=mem_info) + + self.assertEqual(ort_value.shape(), [2, 3]) + self.assertEqual(ort_value.data_type(), "tensor(float)") + self.assertNotEqual(ort_value.device_name().lower(), "cpu") + + arr = ort_value.numpy() + self.assertEqual(arr.shape, (2, 3)) + self.assertEqual(arr.dtype, np.float32) + + del ort_value + self.unregister_execution_provider_library(self.EXAMPLE_EP_NAME) + if __name__ == "__main__": unittest.main(verbosity=1)