Program.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
/*
 * Example: Using ONNX Runtime with CUDA for inference with pre-allocated GPU memory.
 *
 * This example demonstrates how to run an ONNX Runtime inference session with the CUDA
 * execution provider, where the input is provided as a raw float array (CPU memory) and
 * the output tensor is pre-allocated in GPU memory using `cudaMalloc`.
 *
 * Key highlights:
 * 1. **GPU memory allocation**: `cudaMalloc` allocates the output tensor's buffer directly on the GPU.
 * 2. **Input tensor**: the input is passed from CPU memory as a float array, wrapped as an `OrtValue` via `OrtValue.CreateTensorValueFromMemory`.
 * 3. **Output tensor**: the output `OrtValue` is created over the pre-allocated GPU buffer with `OrtValue.CreateTensorValueWithData`, which lets ONNX Runtime write the inference results directly to GPU memory.
 * 4. **Inference**: the session's `Run` method takes the input and output `OrtValue` collections and executes the model.
 * 5. **Copying output from GPU to CPU**: after inference, `cudaMemcpy` copies the output tensor from GPU memory back to CPU memory for display or further processing.
 *
 * This pattern is useful when you need to pre-allocate memory for performance reasons or
 * have custom memory handling (e.g., in high-performance GPU-based applications).
 */
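/*
 * Assumption: "model.onnx" below is a placeholder for a model whose single input and
 * single output are both float tensors of shape [4]; adjust the shapes and buffer
 * sizes to match your actual model.
 */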
class Program
{
    // Minimal subset of CUDA runtime error codes; only success is checked here.
    public enum cudaError : int
    {
        cudaSuccess = 0,
        // ... other error codes can go here ...
    }

    // Direction flags for cudaMemcpy, mirroring the CUDA runtime's cudaMemcpyKind.
    public enum cudaMemcpyKind : int
    {
        cudaMemcpyHostToDevice = 1,
        cudaMemcpyDeviceToHost = 2,
        cudaMemcpyDeviceToDevice = 3,
        cudaMemcpyDefault = 4
    }
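    // Note: "cudart" assumes the loader can resolve the CUDA runtime under that name.
    // On Windows the DLL is versioned (e.g. cudart64_12.dll) and on Linux it is
    // libcudart.so, so the library name below may need adjusting for your setup.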
[DllImport("cudart", EntryPoint = "cudaMalloc")]
public static extern cudaError cudaMalloc(out IntPtr devPtr, int size);
[DllImport("cudart", EntryPoint = "cudaMemcpy")]
public static extern cudaError cudaMemcpy(float[] dst, IntPtr src, int size, cudaMemcpyKind kind);
[DllImport("cudart", EntryPoint = "cudaFree")]
public static extern int cudaFree(IntPtr devPtr);
    static void Main(string[] args)
    {
        // Initialize session options with the CUDA execution provider.
        using var sessionOptions = new SessionOptions();
        sessionOptions.AppendExecutionProvider_CUDA(0); // Use GPU 0
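        // Assumption: this requires a CUDA-enabled build of ONNX Runtime (e.g. the
        // Microsoft.ML.OnnxRuntime.Gpu NuGet package); with the CPU-only package,
        // appending the CUDA execution provider fails at runtime.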
        // Pointer to GPU memory for the output tensor (4 floats).
        IntPtr devicePtr;
        cudaError cudaResult = cudaMalloc(out devicePtr, (UIntPtr)(4 * sizeof(float)));
        if (cudaResult != cudaError.cudaSuccess)
        {
            Console.WriteLine("Failed to allocate device memory: {0}", cudaResult);
            return;
        }
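        // Note: for brevity there is no try/finally around the rest of Main; in
        // production code devicePtr should be freed even if session setup or Run throws.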
        using var session = new InferenceSession("model.onnx", sessionOptions);

        // Create input data (for example, a float array).
        var input = new float[4] { 1.0f, 2.0f, 3.0f, 4.0f };

        // Wrap the input as an OrtValue over the existing CPU buffer.
        var inputTensor = new DenseTensor<float>(input, new int[] { 4 });
        using OrtValue inputValue = OrtValue.CreateTensorValueFromMemory<float>(
            OrtMemoryInfo.DefaultInstance, inputTensor.Buffer, new long[] { 4 });
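        // Assumption: CreateTensorValueFromMemory wraps (and pins) the managed buffer
        // rather than copying it, so inputValue must stay alive until Run completes.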
        // Create OrtMemoryInfo describing CUDA (GPU) device memory.
        using var ortMemoryInfo = new OrtMemoryInfo("Cuda", OrtAllocatorType.DeviceAllocator, 0, OrtMemType.Default);

        // Create an OrtValue over the pre-allocated GPU memory using CreateTensorValueWithData,
        // so ONNX Runtime writes the inference result directly into that buffer.
        using var outputValue = OrtValue.CreateTensorValueWithData(
            ortMemoryInfo,            // Memory info for GPU
            TensorElementType.Float,  // Tensor element type
            new long[] { 4 },         // Tensor shape (4 elements)
            devicePtr,                // Pointer to the GPU memory
            sizeof(float) * 4         // Total buffer size in bytes (4 floats)
        );
        var inputNames = session.InputMetadata.Keys.ToArray();
        var outputNames = session.OutputMetadata.Keys.ToArray();

        // Run the inference session, using the pre-allocated GPU memory for output.
        using var runOptions = new RunOptions();
        session.Run(runOptions, inputNames, new List<OrtValue> { inputValue }, outputNames, new List<OrtValue> { outputValue });
        // At this point inference has completed and the results are in devicePtr.
        // Copy the output back to the host (CPU) for display or further processing.
        float[] outputData = new float[4];
        cudaResult = cudaMemcpy(outputData, devicePtr, (UIntPtr)(4 * sizeof(float)), cudaMemcpyKind.cudaMemcpyDeviceToHost);
        if (cudaResult != cudaError.cudaSuccess)
        {
            Console.WriteLine("Failed to copy output from device: {0}", cudaResult);
        }

        // Print the output values.
        Console.WriteLine("Inference output:");
        foreach (var value in outputData)
        {
            Console.WriteLine(value);
        }

        // Release the GPU buffer.
        cudaFree(devicePtr);
    }
}