Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -211,13 +211,23 @@ static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "ses
/// <summary>
/// Key for using the ORT format model flatbuffer bytes directly for initializers.
/// This avoids copying the bytes and reduces peak memory usage during model loading and initialization.
/// Requires `session.use_ort_model_bytes_directly` or `session.use_memory_mapped_ort_model` to be true.
/// If set, the flatbuffer bytes provided when creating the InferenceSession MUST remain valid for the entire
/// duration of the InferenceSession.
/// </summary>
static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers =
    "session.use_ort_model_bytes_for_initializers";

/// <summary>
/// Key for using memory-mapped I/O to load ORT format model files.
/// When set to "1" and the session is created from a file path, ORT will use memory-mapped I/O
/// to load the .ort model file instead of reading it into a heap-allocated buffer.
/// When combined with session.use_ort_model_bytes_for_initializers, Tensors will point directly at the
/// mapped bytes; in that case the mapping must remain valid for the session's lifetime and the model
/// weights are immutable.
/// If the mapping cannot be created, model load fails; the caller is responsible for any fallback
/// (e.g. retrying without this option).
/// </summary>
static const char* const kOrtSessionOptionsConfigUseMemoryMappedOrtModel = "session.use_memory_mapped_ort_model";

// This should only be specified when exporting an ORT format model for use on a different platform.
// If the ORT format model will be used on ARM platforms set to "1". For other platforms set to "0"
// Available since version 1.11.
Expand Down
15 changes: 14 additions & 1 deletion onnxruntime/core/platform/posix/env.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ limitations under the License.
#include <gsl/gsl>
#include "core/common/logging/logging.h"
#include "core/common/narrow.h"
#include "core/common/safeint.h"
#include "core/platform/scoped_resource.h"
#include "core/platform/EigenNonBlockingThreadPool.h"

Expand Down Expand Up @@ -430,9 +431,21 @@ class PosixEnv : public Env {
return Status::OK();
}

// Validate that the file is large enough for the requested mapping.
struct stat file_stat;
if (fstat(file_descriptor.Get(), &file_stat) != 0) {
return ReportSystemError("fstat", file_path);
}
const size_t requested_end = SafeInt<size_t>(offset) + length;
ORT_RETURN_IF(static_cast<size_t>(file_stat.st_size) < requested_end,
"File \"", file_path,
"\" is too small for the requested mapping (file size: ",
file_stat.st_size, " bytes, requested offset + length: ",
requested_end, " bytes).");

static const size_t page_size = narrow<size_t>(sysconf(_SC_PAGESIZE));
const FileOffsetType offset_to_page = offset % static_cast<FileOffsetType>(page_size);
const size_t mapped_length = length + static_cast<size_t>(offset_to_page);
const size_t mapped_length = SafeInt<size_t>(length) + static_cast<size_t>(offset_to_page);
const FileOffsetType mapped_offset = offset - offset_to_page;
void* const mapped_base =
mmap(nullptr, mapped_length, PROT_READ | PROT_WRITE, MAP_PRIVATE, file_descriptor.Get(), mapped_offset);
Expand Down
16 changes: 16 additions & 0 deletions onnxruntime/core/platform/windows/env.cc
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,22 @@ Status WindowsEnv::MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path,
" - ", std::system_category().message(error_code));
}

// Validate that the file is large enough for the requested mapping.
LARGE_INTEGER actual_size;
if (!GetFileSizeEx(file_handle.get(), &actual_size)) {
const auto error_code = GetLastError();
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
"GetFileSizeEx ", ToUTF8String(Basename(file_path)),
" fail, errcode = ", error_code,
" - ", std::system_category().message(error_code));
}
const size_t requested_end = SafeInt<size_t>(offset) + length;
ORT_RETURN_IF(static_cast<ULONGLONG>(actual_size.QuadPart) < requested_end,
"File ", ToUTF8String(Basename(file_path)),
" is too small for the requested mapping (file size: ",
actual_size.QuadPart, " bytes, requested offset + length: ",
requested_end, " bytes).");

wil::unique_hfile file_mapping_handle{
CreateFileMappingW(file_handle.get(),
nullptr,
Expand Down
36 changes: 34 additions & 2 deletions onnxruntime/core/session/inference_session.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1747,10 +1747,36 @@ static Status LoadOrtModelBytes(const PathString& model_uri,
return Status::OK();
}

// Memory-maps the ORT format model file at `model_uri` and exposes its contents as a byte span.
//
// On success:
//   - `mapped_memory` owns the mapping; it must stay alive for as long as `bytes` is used.
//   - `bytes` views the entire mapped file ([0, file length)).
//
// Fails if the file is empty (an empty mapping is not meaningful and mmap of length 0 is
// implementation-defined) or if the platform mapping call fails.
static Status LoadOrtModelBytesMapped(const PathString& model_uri,
                                      gsl::span<const uint8_t>& bytes,
                                      Env::MappedMemoryPtr& mapped_memory) {
  size_t num_bytes = 0;
  ORT_RETURN_IF_ERROR(Env::Default().GetFileLength(model_uri.c_str(), num_bytes));
  ORT_RETURN_IF(num_bytes == 0, "Cannot memory-map an empty file: ", ToUTF8String(model_uri));

  // Map the whole file starting at offset 0.
  ORT_RETURN_IF_ERROR(Env::Default().MapFileIntoMemory(model_uri.c_str(), 0, num_bytes, mapped_memory));

  bytes = gsl::span<const uint8_t>(reinterpret_cast<const uint8_t*>(mapped_memory.get()), num_bytes);

  return Status::OK();
}

Status InferenceSession::LoadOrtModel(const PathString& model_uri) {
return LoadOrtModelWithLoader(
[&]() {
model_location_ = model_uri;

const auto& config_options = GetSessionOptions().config_options;
const bool use_mmap =
config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1";

if (use_mmap) {
ORT_RETURN_IF_ERROR(
LoadOrtModelBytesMapped(model_location_, ort_format_model_bytes_, ort_format_model_mapped_memory_));
LOGS(*session_logger_, INFO) << "ORT model loaded via memory-mapped I/O.";
return Status::OK();
}

ORT_RETURN_IF_ERROR(
LoadOrtModelBytes(model_location_, ort_format_model_bytes_, ort_format_model_bytes_data_holder_));
return Status::OK();
Comment thread
yuslepukhin marked this conversation as resolved.
Expand All @@ -1760,6 +1786,11 @@ Status InferenceSession::LoadOrtModel(const PathString& model_uri) {
Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len) {
return LoadOrtModelWithLoader([&]() {
const auto& config_options = GetSessionOptions().config_options;

if (config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1") {
LOGS(*session_logger_, WARNING) << "session.use_memory_mapped_ort_model is ignored when loading from a buffer.";
Comment thread
yuslepukhin marked this conversation as resolved.
}

const auto use_ort_model_bytes_directly =
config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "0") == "1";

Expand Down Expand Up @@ -1858,8 +1889,8 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort
ORT_RETURN_IF(nullptr == fbs_model, "Missing Model. Invalid ORT format model.");

// if we're using the bytes directly because kOrtSessionOptionsConfigUseORTModelBytesDirectly was set and the user
// provided an existing buffer of bytes when creating the InferenceSession, ort_format_model_bytes_data_holder_
// will be empty.
// provided an existing buffer of bytes when creating the InferenceSession, or because we memory-mapped the file,
// ort_format_model_bytes_data_holder_ will be empty.
// if that is the case we also allow creating initializers that directly use those bytes.
const auto& config_options = session_options_.config_options;
using_ort_model_bytes_for_initializers_ =
Expand Down Expand Up @@ -2681,6 +2712,7 @@ common::Status InferenceSession::Initialize() {
if (!using_ort_model_bytes_for_initializers_) {
ort_format_model_bytes_ = gsl::span<const uint8_t>();
std::vector<uint8_t>().swap(ort_format_model_bytes_data_holder_);
ort_format_model_mapped_memory_.reset();
}

// once the model is saved, we may remove unnecessary attributes for inference
Expand Down
9 changes: 8 additions & 1 deletion onnxruntime/core/session/inference_session.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "core/common/path_string.h"
#include "core/common/profiler.h"
#include "core/common/status.h"
#include "core/platform/env.h"
#include "core/framework/execution_providers.h"
#include "core/framework/framework_common.h"
#include "core/framework/iexecutor.h"
Expand Down Expand Up @@ -1025,6 +1026,8 @@ class InferenceSession {
// We store them currently in the ort_format_model_bytes_data_holder_ to make the Load + Initialize
// behave the same way as for an ONNX model, as we need some of the bytes for the Load (create the Model)
// and some for the Initialize (create SessionState).
// If "session.use_memory_mapped_ort_model" is set, we memory-map the file instead and store the
// mapping in ort_format_model_mapped_memory_.
// Short term we free them after Initialize.
// Longer term we may want to directly refer to offsets in this buffer for initializers so we don't need to copy
// those into new OrtValue instances, at which point we won't free them until the InferenceSession goes away.
Expand All @@ -1033,9 +1036,13 @@ class InferenceSession {
// This holds the actual model data
// In case if the session is started with an input byte array contains model data, and the caller
// specifies that ORT should use the model bytes directly by setting the session config option
// "session.use_ort_model_bytes_directly" to "1", this will be empty
// "session.use_ort_model_bytes_directly" to "1", this will be empty.
// Also empty when using memory-mapped loading, as the data is held by ort_format_model_mapped_memory_.
std::vector<uint8_t> ort_format_model_bytes_data_holder_;

// Holds the memory-mapped file data when session.use_memory_mapped_ort_model is set.
Env::MappedMemoryPtr ort_format_model_mapped_memory_;

bool using_ort_model_bytes_for_initializers_{false};

// Container to store pre-packed weights to share between sessions.
Expand Down
37 changes: 34 additions & 3 deletions onnxruntime/test/framework/ort_model_only_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ struct OrtModelTestInfo {
bool run_use_buffer{false};
bool disable_copy_ort_buffer{false};
bool use_buffer_for_initializers{false};
bool use_memory_mapped_load{false};
TransformerLevel optimization_level = TransformerLevel::Level3;
};

Expand All @@ -49,10 +50,15 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) {

if (test_info.disable_copy_ort_buffer) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "1"));
}

if (test_info.use_buffer_for_initializers) {
Comment thread
yuslepukhin marked this conversation as resolved.
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
}
if (test_info.use_memory_mapped_load) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1"));
}

if (test_info.use_buffer_for_initializers &&
(test_info.disable_copy_ort_buffer || (test_info.use_memory_mapped_load && !test_info.run_use_buffer))) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
}

so.graph_optimization_level = test_info.optimization_level;
Expand Down Expand Up @@ -557,6 +563,31 @@ TEST(OrtModelOnlyTests, LoadOrtFormatModelFromBufferNoCopyInitializersUseBuffer)
RunOrtModel(test_info);
}

// Load the model from a file using memory-mapped I/O
TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMapped) {
OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel();
test_info.use_memory_mapped_load = true;
RunOrtModel(test_info);
}

// Load the model from a file using memory-mapped I/O, with initializers referencing the mapped bytes
TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedWithInitializersFromMap) {
OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel();
test_info.use_memory_mapped_load = true;
test_info.use_buffer_for_initializers = true;
RunOrtModel(test_info);
}

// Verify that mmap loading fails gracefully on a non-existent file
TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedFailsOnMissingFile) {
SessionOptions so;
so.session_logid = "MemoryMappedMissingFile";
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1"));
InferenceSessionWrapper session_object{so, GetEnvironment()};
auto status = session_object.Load(ORT_TSTR("nonexistent_model.ort"));
ASSERT_FALSE(status.IsOK());
}

// regression test for 2 issues covered by PR #17000 (internally reported issue).
// 1) allocation planner broke in minimal build when subgraph had no nodes.
// 2) usage of a sequence data type caused an exception due to IsSparseTensor() throwing
Expand Down
10 changes: 10 additions & 0 deletions onnxruntime/test/perftest/command_args_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,9 @@ ABSL_FLAG(int, spin_backoff_max, 1,
"legacy single-SpinPause behavior. Values >= 2 enable exp-backoff (typical: 4 or 8) to reduce "
"CPU/power density during the spin window. Values above 64 are clamped to 64.");
ABSL_FLAG(bool, n, DefaultPerformanceTestConfig().run_config.exit_after_session_creation, "Allows user to measure session creation time to measure impact of enabling any initialization optimizations.");
ABSL_FLAG(uint32_t, hold_ms_after_session_creation, DefaultPerformanceTestConfig().run_config.hold_ms_after_session_creation,
"When used with -n, keeps the process alive for the specified number of milliseconds after session creation.\n"
"Prints 'SESSION_READY' to stdout before sleeping. Useful for multi-process memory measurements.");
Comment thread
tianleiwu marked this conversation as resolved.
ABSL_FLAG(bool, l, DefaultPerformanceTestConfig().model_info.load_via_path, "Provides file as binary in memory by using fopen before session creation.");
ABSL_FLAG(bool, g, DefaultPerformanceTestConfig().run_config.enable_cuda_io_binding, "[TensorRT RTX | TensorRT | CUDA] Enables tensor input and output bindings on CUDA before session run.");
ABSL_FLAG(bool, X, DefaultPerformanceTestConfig().run_config.use_extensions, "Registers custom ops from onnxruntime-extensions.");
Expand Down Expand Up @@ -529,6 +532,13 @@ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int a
// -n
test_config.run_config.exit_after_session_creation = absl::GetFlag(FLAGS_n);

// --hold_ms_after_session_creation
test_config.run_config.hold_ms_after_session_creation = absl::GetFlag(FLAGS_hold_ms_after_session_creation);
if (test_config.run_config.hold_ms_after_session_creation > 0 &&
!test_config.run_config.exit_after_session_creation) {
fprintf(stderr, "WARNING: --hold_ms_after_session_creation has no effect without -n.\n");
}

// -l
test_config.model_info.load_via_path = absl::GetFlag(FLAGS_l);

Expand Down
8 changes: 8 additions & 0 deletions onnxruntime/test/perftest/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@

// onnxruntime dependencies
#include <core/session/onnxruntime_c_api.h>
#include <chrono>
#include <iostream>
#include <random>
#include <thread>
#include "command_args_parser.h"
#include "performance_runner.h"
#include "utils.h"
Expand Down Expand Up @@ -127,6 +130,11 @@ int RunPerfTest(Ort::Env& env, const perftest::PerformanceTestConfig& test_confi
// Exit if user enabled -n option so that user can measure session creation time
if (test_config.run_config.exit_after_session_creation) {
perf_runner.LogSessionCreationTime();
if (test_config.run_config.hold_ms_after_session_creation > 0) {
std::cout << "SESSION_READY" << std::endl;
std::this_thread::sleep_for(
std::chrono::milliseconds(test_config.run_config.hold_ms_after_session_creation));
}
return 0;
}

Expand Down
1 change: 1 addition & 0 deletions onnxruntime/test/perftest/test_configuration.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ struct RunConfig {
int spin_backoff_max = 1; // 1 means no backoff (default)
bool spin_backoff_max_set = false;
bool exit_after_session_creation = false;
uint32_t hold_ms_after_session_creation{0};
std::basic_string<ORTCHAR_T> register_custom_op_path;
bool enable_cuda_io_binding{false};
bool use_extensions = false;
Expand Down
10 changes: 10 additions & 0 deletions onnxruntime/test/platform/file_io_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,11 @@ TEST(FileIoTest, MapFileIntoMemory) {

// invalid - negative offset
ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK());

// invalid - requested length exceeds file size
auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory);
ASSERT_FALSE(status.IsOK());
ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos);
}
}
#else
Expand Down Expand Up @@ -184,6 +189,11 @@ TEST(FileIoTest, MapFileIntoMemory) {

// invalid - negative offset
ASSERT_STATUS_NOT_OK(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory));

// invalid - requested length exceeds file size
auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory);
ASSERT_FALSE(status.IsOK());
ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos);
}
}
#endif
Expand Down
Loading
Loading