11using System . Drawing ;
22using System . Drawing . Imaging ;
33using System . Runtime . CompilerServices ;
4- using static Aimmy2 . AILogic . AIManager ;
54
65namespace Aimmy2 . AILogic
76{
@@ -31,7 +30,7 @@ public static float Distance(Prediction a, Prediction b)
3130 {
3231 float dx = a . ScreenCenterX - b . ScreenCenterX ;
3332 float dy = a . ScreenCenterY - b . ScreenCenterY ;
34- return dx * dx + dy * dy ;
33+ return dx * dx + dy * dy ;
3534 }
3635 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
3736 public static float DistanceInImageCoords ( Prediction a , Prediction b , int imageSize )
@@ -40,60 +39,126 @@ public static float DistanceInImageCoords(Prediction a, Prediction b, int imageS
4039 float dy = ( a . CenterYTranslated - b . CenterYTranslated ) * imageSize ;
4140 return dx * dx + dy * dy ;
4241 }
42+
// LUT = lookup table: one precomputed float for every possible byte value (0-255).
// REFERENCE: https://stackoverflow.com/questions/1089235/where-can-i-find-a-byte-to-float-lookup-table
// The per-pixel conversion then becomes a single array read instead of a division.
private static readonly float[] _byteToFloatLut = CreateByteToFloatLut();

// Builds the 256-entry table in which entry i holds i / 255f
// (so 0 maps to 0.0f and 255 maps to 1.0f).
private static float[] CreateByteToFloatLut()
{
    float[] table = new float[256];
    int value = 0;
    while (value < table.Length)
    {
        table[value] = value / 255f;
        value++;
    }

    return table;
}
55+
// Writes straight into the caller's buffer (no per-row temporary arrays, no Array.Copy),
// which keeps GC pressure low on the hot path.
// REFERENCE: https://www.codeproject.com/Articles/617613/Fast-Pixel-Operations-in-NET-With-and-Without-unsa
/// <summary>
/// Converts the top-left <paramref name="IMAGE_SIZE"/> x <paramref name="IMAGE_SIZE"/> region of
/// <paramref name="image"/> into normalized floats stored in planar order inside
/// <paramref name="result"/>: all red values first, then all green values, then all blue values.
/// Each plane is row-major and holds IMAGE_SIZE * IMAGE_SIZE entries.
/// </summary>
/// <param name="image">Source bitmap; must be at least IMAGE_SIZE pixels wide and high.</param>
/// <param name="result">Destination buffer; its length must be exactly 3 * IMAGE_SIZE * IMAGE_SIZE.</param>
/// <param name="IMAGE_SIZE">Edge length, in pixels, of the square region to convert.</param>
/// <exception cref="ArgumentNullException"><paramref name="image"/> or <paramref name="result"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="IMAGE_SIZE"/> is zero or negative.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="result"/> has the wrong length, or <paramref name="image"/> is smaller than the requested region.
/// </exception>
public static unsafe void BitmapToFloatArrayInPlace(Bitmap image, float[] result, int IMAGE_SIZE)
{
    if (image == null)
    {
        throw new ArgumentNullException(nameof(image));
    }

    if (result == null)
    {
        throw new ArgumentNullException(nameof(result));
    }

    if (IMAGE_SIZE <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(IMAGE_SIZE), "IMAGE_SIZE must be positive");
    }

    int width = IMAGE_SIZE;
    int height = IMAGE_SIZE;

    // Computed in 64-bit so an oversized IMAGE_SIZE cannot wrap around and slip past the check.
    long requiredLength = 3L * width * height;
    if (result.Length != requiredLength)
    {
        throw new ArgumentException($"result must be length {requiredLength}", nameof(result));
    }

    if (image.Width < width || image.Height < height)
    {
        throw new ArgumentException($"image must be at least {width}x{height} pixels", nameof(image));
    }

    // Safe in 32-bit now: 3 * totalPixels equals result.Length, which is an int.
    int totalPixels = width * height;

    // 24bpp and 32bpp bitmaps are read in their own format (no conversion by GDI+).
    // Every other format is locked as 24bpp so the loop below always sees B, G, R byte triples.
    PixelFormat sourceFormat = image.PixelFormat;
    bool readableAsIs =
        sourceFormat == PixelFormat.Format24bppRgb ||
        sourceFormat == PixelFormat.Format32bppRgb ||
        sourceFormat == PixelFormat.Format32bppArgb ||
        sourceFormat == PixelFormat.Format32bppPArgb;
    PixelFormat lockFormat = readableAsIs ? sourceFormat : PixelFormat.Format24bppRgb;

    // 3 for 24bpp, 4 for the 32bpp formats (the fourth byte is alpha/padding and is ignored).
    int bytesPerPixel = Image.GetPixelFormatSize(lockFormat) / 8;

    var rect = new Rectangle(0, 0, width, height);
    BitmapData bmpData = image.LockBits(rect, ImageLockMode.ReadOnly, lockFormat);
    try
    {
        byte* basePtr = (byte*)bmpData.Scan0;

        // Keep the sign: Scan0 addresses the first scan line, and for a bottom-up bitmap the
        // stride is negative, so the next row lives at a LOWER address. Taking the absolute
        // value would walk out of the locked buffer.
        long stride = bmpData.Stride;

        float[] lut = _byteToFloatLut;

        // Pin the destination so the GC cannot move it while raw pointers are in use.
        // Parallel.For blocks until every row is done, so the pin covers all workers.
        fixed (float* dest = result)
        {
            float* rPtr = dest;                    // red plane starts at index 0
            float* gPtr = dest + totalPixels;      // green plane follows red
            float* bPtr = dest + 2 * totalPixels;  // blue plane follows green

            // Rows are independent and write to disjoint output ranges, so they can run in parallel.
            Parallel.For(0, height, y =>
            {
                byte* p = basePtr + y * stride;
                int rowStart = y * width;
                float* r = rPtr + rowStart;
                float* g = gPtr + rowStart;
                float* b = bPtr + rowStart;

                for (int x = 0; x < width; x++)
                {
                    // Windows bitmaps store each pixel as B, G, R (then optional alpha).
                    b[x] = lut[p[0]];
                    g[x] = lut[p[1]];
                    r[x] = lut[p[2]];
                    p += bytesPerPixel;
                }
            });
        }
    }
    finally
    {
        // Always release the lock, even if the conversion throws.
        image.UnlockBits(bmpData);
    }
}
96160
97161
162+
98163 }
99164}
0 commit comments