Skip to content

Commit 6974462

Browse files
committed
- Simplified PrepareKDTreeData to return only KDPredictions, removing KDpoints for better memory management.
- Updated `_reusableInputArray` initialization to reset tensors when the buffer changes.
- Removed static import of `AIManager` in `MathUtil.cs` for better decoupling.
- Added a lookup table for byte-to-float conversion to improve performance.
- Heavily optimized `BitmapToFloatArrayInPlace` using unsafe code and batch processing to reduce garbage collection pressure and improve efficiency.
1 parent a6ca6e9 commit 6974462

File tree

2 files changed

+111
-45
lines changed

2 files changed

+111
-45
lines changed

Aimmy2/AILogic/AIManager.cs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,9 @@ private void HandlePredictions(KalmanPrediction kalmanPrediction, Prediction clo
618618
if (_reusableInputArray == null || _reusableInputArray.Length != requiredLength)
619619
{
620620
_reusableInputArray = new float[requiredLength];
621+
// Ensure we force _reusableTensor to be recreated below (since buffer changed)
622+
_reusableTensor = null;
623+
_reusableInputs = null;
621624
}
622625
inputArray = _reusableInputArray;
623626

@@ -665,14 +668,14 @@ private void HandlePredictions(KalmanPrediction kalmanPrediction, Prediction clo
665668
float fovMinY = (IMAGE_SIZE - FovSize) / 2.0f;
666669
float fovMaxY = (IMAGE_SIZE + FovSize) / 2.0f;
667670

668-
List<double[]> KDpoints;
671+
//List<double[]> KDpoints;
669672
List<Prediction> KDPredictions;
670673
using (Benchmark("PrepareKDTreeData"))
671674
{
672-
(KDpoints, KDPredictions) = PrepareKDTreeData(outputTensor, detectionBox, fovMinX, fovMaxX, fovMinY, fovMaxY);
675+
KDPredictions = PrepareKDTreeData(outputTensor, detectionBox, fovMinX, fovMaxX, fovMinY, fovMaxY);
673676
}
674677

675-
if (KDpoints.Count == 0 || KDPredictions.Count == 0)
678+
if (KDPredictions.Count == 0)
676679
{
677680
SaveFrame(frame);
678681
return null;
@@ -778,7 +781,7 @@ private void UpdateDetectionBox(Prediction target, Rectangle detectionBox)
778781
CenterXTranslated = target.CenterXTranslated;
779782
CenterYTranslated = target.CenterYTranslated;
780783
}
781-
private (List<double[]>, List<Prediction>) PrepareKDTreeData(
784+
private List<Prediction> PrepareKDTreeData(
782785
Tensor<float> outputTensor,
783786
Rectangle detectionBox,
784787
float fovMinX, float fovMaxX, float fovMinY, float fovMaxY)
@@ -787,7 +790,7 @@ private void UpdateDetectionBox(Prediction target, Rectangle detectionBox)
787790
string selectedClass = Dictionary.dropdownState["Target Class"];
788791
int selectedClassId = selectedClass == "Best Confidence" ? -1 : _modelManager.modelClasses.FirstOrDefault(c => c.Value == selectedClass).Key;
789792

790-
var KDpoints = new List<double[]>(_modelManager.NUM_DETECTIONS); // Pre-allocate with estimated capacity
793+
//var KDpoints = new List<double[]>(_modelManager.NUM_DETECTIONS); // Pre-allocate with estimated capacity
791794
var KDpredictions = new List<Prediction>(_modelManager.NUM_DETECTIONS);
792795

793796
for (int i = 0; i < _modelManager.NUM_DETECTIONS; i++)
@@ -841,19 +844,17 @@ private void UpdateDetectionBox(Prediction target, Rectangle detectionBox)
841844
Confidence = bestConfidence,
842845
ClassId = bestClassId,
843846
ClassName = _modelManager.modelClasses.GetValueOrDefault(bestClassId, $"Class_{bestClassId}"),
844-
CenterXTranslated = x_center / IMAGE_SIZE, // !! CenterXTranslated is normalized to [0, 1]
847+
CenterXTranslated = x_center / IMAGE_SIZE,
845848
CenterYTranslated = y_center / IMAGE_SIZE,
846-
//CenterXTranslated = (x_center - detectionBox.Left) / IMAGE_SIZE,
847-
//CenterYTranslated = (y_center - detectionBox.Top) / IMAGE_SIZE,
848849
ScreenCenterX = detectionBox.Left + x_center,
849850
ScreenCenterY = detectionBox.Top + y_center
850851
};
851852

852-
KDpoints.Add(new double[] { x_center, y_center });
853+
//KDpoints.Add(new double[] { x_center, y_center });
853854
KDpredictions.Add(prediction);
854855
}
855856

856-
return (KDpoints, KDpredictions);
857+
return KDpredictions;
857858
}
858859

859860
#endregion AI Loop Functions

Aimmy2/AILogic/MathUtil.cs

Lines changed: 100 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
using System.Drawing;
22
using System.Drawing.Imaging;
33
using System.Runtime.CompilerServices;
4-
using static Aimmy2.AILogic.AIManager;
54

65
namespace Aimmy2.AILogic
76
{
@@ -31,7 +30,7 @@ public static float Distance(Prediction a, Prediction b)
3130
{
3231
float dx = a.ScreenCenterX - b.ScreenCenterX;
3332
float dy = a.ScreenCenterY - b.ScreenCenterY;
34-
return dx * dx + dy * dy;
33+
return dx * dx + dy * dy;
3534
}
3635
[MethodImpl(MethodImplOptions.AggressiveInlining)]
3736
public static float DistanceInImageCoords(Prediction a, Prediction b, int imageSize)
@@ -40,60 +39,126 @@ public static float DistanceInImageCoords(Prediction a, Prediction b, int imageS
4039
float dy = (a.CenterYTranslated - b.CenterYTranslated) * imageSize;
4140
return dx * dx + dy * dy;
4241
}
42+
43+
// LUT = look up table
44+
// REFERENCE: https://stackoverflow.com/questions/1089235/where-can-i-find-a-byte-to-float-lookup-table
45+
// "In this case, the lookup table should be faster than using direct calculation. The more complex the math (trigonometry, etc.), the bigger the performance gain."
46+
// although we used small calculations, something is better than nothing.
47+
private static readonly float[] _byteToFloatLut = CreateByteToFloatLut();
48+
private static float[] CreateByteToFloatLut()
49+
{
50+
var lut = new float[256];
51+
for (int i = 0; i < 256; i++)
52+
lut[i] = i / 255f;
53+
return lut;
54+
}
55+
56+
// this new function reduces gc pressure as i stopped using array.copy
57+
// REFERENCE: https://www.codeproject.com/Articles/617613/Fast-Pixel-Operations-in-NET-With-and-Without-unsa
4358
public static unsafe void BitmapToFloatArrayInPlace(Bitmap image, float[] result, int IMAGE_SIZE)
4459
{
60+
if (image == null) throw new ArgumentNullException(nameof(image));
61+
if (result == null) throw new ArgumentNullException(nameof(result));
62+
4563
int width = IMAGE_SIZE;
4664
int height = IMAGE_SIZE;
4765
int totalPixels = width * height;
48-
const float multiplier = 1f / 255f;
4966

67+
// check if it has the right size
68+
if (result.Length != 3 * totalPixels)
69+
throw new ArgumentException($"result must be length {3 * totalPixels}", nameof(result));
70+
71+
//const float multiplier = 1f / 255f; kept for reference
5072
var rect = new Rectangle(0, 0, width, height);
51-
var bmpData = image.LockBits(rect, ImageLockMode.ReadOnly, PixelFormat.Format24bppRgb); //3 bytes per pixel
52-
// blue green red (strict order)
73+
74+
// Lock the bitmap
75+
var bmpData = image.LockBits(rect, ImageLockMode.ReadOnly, image.PixelFormat);
5376
try
5477
{
55-
int stride = bmpData.Stride;
5678
byte* basePtr = (byte*)bmpData.Scan0;
57-
int redOffset = 0;
58-
int greenOffset = totalPixels;
59-
int blueOffset = 2 * totalPixels;
6079

61-
//for each row in the image -> create a temporary array for red green and blue (rgb)
62-
Parallel.For(0, height, () => (localR: new float[width], localG: new float[width], localB: new float[width]),
63-
(y, state, local) =>
80+
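// row size in bytes; Stride is negative for bottom-up bitmaps, so take its absolute value.
// NOTE(review): rows are still addressed forward from Scan0 using this absolute value - confirm that bottom-up bitmaps never reach this method, otherwise the signed stride is needed for row addressing.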
81+
int stride = Math.Abs(bmpData.Stride);
82+
83+
84+
// array offsets for the three color channels
85+
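// 32bpp format is hardcoded (4 bytes per pixel); 24bpp would be 3 bytes per pixel.
// NOTE(review): LockBits is called with image.PixelFormat, so this only holds when the bitmap is actually 32bpp - confirm at the call site.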
86+
const int bytesPerPixel = 4;
87+
const int pixelsPerIteration = 4; // process 4 pixels at a time
88+
89+
int rOffset = 0; // Red channel starts at index 0
90+
int gOffset = totalPixels; // Green channel starts after red
91+
int bOffset = totalPixels * 2; // Blue channel starts after green
92+
93+
// prevent gc from moving the array while we are using it
94+
fixed (float* dest = result)
6495
{
65-
byte* row = basePtr + (y * stride);
96+
float* rPtr = dest + rOffset; //pointers to the start of each channel
97+
float* gPtr = dest + gOffset; //variables are arranged in RGB but its actually BGR.
98+
float* bPtr = dest + bOffset;
6699

67-
// process entire row in local buffers
68-
for (int x = 0; x < width; x++)
100+
// process rows in parallel
101+
Parallel.For(0, height, (y) =>
69102
{
70-
int bufferIndex = x * 3;
71-
// BGR byte order: +2 = R, +1 = G, +0 = B
72-
// B = 0
73-
// G = 1
74-
// R = 2
75-
// (bufferIndex + x)
76-
local.localR[x] = row[bufferIndex + 2] * multiplier;
77-
local.localG[x] = row[bufferIndex + 1] * multiplier;
78-
local.localB[x] = row[bufferIndex] * multiplier;
79-
}
80-
81-
// after processing the row copy the results into the final array
82-
int rowStart = y * width;
83-
Array.Copy(local.localR, 0, result, redOffset + rowStart, width);
84-
Array.Copy(local.localG, 0, result, greenOffset + rowStart, width);
85-
Array.Copy(local.localB, 0, result, blueOffset + rowStart, width);
86-
87-
return local;
88-
},
89-
_ => { });
103+
byte* row = basePtr + (long)y * stride;
104+
int rowStart = y * width;
105+
int x = 0;
106+
107+
int widthLimit = width - pixelsPerIteration + 1;
108+
// optimize for 4 pixels at a time
109+
// to remove loop overhead and (cache (?))
110+
for (; x < widthLimit; x += pixelsPerIteration)
111+
{
112+
int baseIdx = rowStart + x;
113+
byte* p = row + (x * bytesPerPixel);
114+
115+
// bgr(a) values
116+
// windows bitmap uses BGR order
117+
118+
// pixel 0 (each pixel is 4 bytes; the group of 4 pixels spans 16 bytes)
119+
bPtr[baseIdx] = _byteToFloatLut[p[0]];
120+
gPtr[baseIdx] = _byteToFloatLut[p[1]];
121+
rPtr[baseIdx] = _byteToFloatLut[p[2]];
122+
//alpha is ignored
123+
124+
// pixel 1
125+
bPtr[baseIdx + 1] = _byteToFloatLut[p[4]];
126+
gPtr[baseIdx + 1] = _byteToFloatLut[p[5]];
127+
rPtr[baseIdx + 1] = _byteToFloatLut[p[6]];
128+
// pixel 2
129+
bPtr[baseIdx + 2] = _byteToFloatLut[p[8]];
130+
gPtr[baseIdx + 2] = _byteToFloatLut[p[9]];
131+
rPtr[baseIdx + 2] = _byteToFloatLut[p[10]];
132+
// pixel 3
133+
bPtr[baseIdx + 3] = _byteToFloatLut[p[12]];
134+
gPtr[baseIdx + 3] = _byteToFloatLut[p[13]];
135+
rPtr[baseIdx + 3] = _byteToFloatLut[p[14]];
136+
137+
p += 16; // move pointer 16 bytes forward (4 pixels * 4 bytes per pixel)
138+
}
139+
140+
// handle the rest of the pixels when width is not divisible by 4
141+
for (; x < width; x++)
142+
{
143+
int idx = rowStart + x;
144+
byte* p = row + (x * bytesPerPixel);
145+
146+
// process by BGR(a) value like before
147+
bPtr[idx] = _byteToFloatLut[p[0]];
148+
gPtr[idx] = _byteToFloatLut[p[1]];
149+
rPtr[idx] = _byteToFloatLut[p[2]];
150+
}
151+
});
152+
}
90153
}
91154
finally
92155
{
156+
//unlock the bitmap finally
93157
image.UnlockBits(bmpData);
94158
}
95159
}
96160

97161

162+
98163
}
99164
}

0 commit comments

Comments
 (0)