11using System . Drawing ;
22using System . Drawing . Imaging ;
3- using System . Numerics ;
43using System . Runtime . CompilerServices ;
4+ using System . Numerics ;
55
66namespace Aimmy2 . AILogic
77{
@@ -58,23 +58,19 @@ private static float[] CreateByteToFloatLut()
5858
5959 // this new function reduces gc pressure as i stopped using array.copy
6060 // REFERENCE: https://www.codeproject.com/Articles/617613/Fast-Pixel-Operations-in-NET-With-and-Without-unsa
61- public static unsafe void BitmapToFloatArrayInPlace ( Bitmap image , float [ ] result , int imageSize )
61+ public static unsafe void BitmapToFloatArrayInPlace ( Bitmap image , float [ ] result , int IMAGE_SIZE )
6262 {
6363 if ( image == null ) throw new ArgumentNullException ( nameof ( image ) ) ;
6464 if ( result == null ) throw new ArgumentNullException ( nameof ( result ) ) ;
6565
66- //assumes square so dont feed it non-square images
67- int width = imageSize ;
68- int height = imageSize ;
66+ int width = IMAGE_SIZE ;
67+ int height = IMAGE_SIZE ;
6968 int totalPixels = width * height ;
7069
7170 // check if it has the right size
7271 if ( result . Length != 3 * totalPixels )
7372 throw new ArgumentException ( $ "result must be length { 3 * totalPixels } ", nameof ( result ) ) ;
7473
75- if ( image . Width != width || image . Height != height )
76- throw new ArgumentException ( $ "Bitmap size ({ image . Width } x{ image . Height } ) does not match expected size { width } x{ height } .") ;
77-
7874 //const float multiplier = 1f / 255f; kept for reference
7975 var rect = new Rectangle ( 0 , 0 , width , height ) ;
8076
@@ -89,118 +85,74 @@ public static unsafe void BitmapToFloatArrayInPlace(Bitmap image, float[] result
9086 // 32bpp format is hardcoded (4 bytes per pixel); 24bpp would be just 3 bytes per pixel
9086 const int bytesPerPixel = 4 ;
9187 const int pixelsPerIteration = 4 ; // process 4 pixels at a time
92-
88+
9389 int rOffset = 0 ; // Red channel starts at index 0
9490 int gOffset = totalPixels ; // Green channel starts after red
9591 int bOffset = totalPixels * 2 ; // Blue channel starts after green
9692
97- bool useSequential = width <= 320 ; // (<=320 sequential)
98-
9993 // prevent gc from moving the array while we are using it
10094 fixed ( float * dest = result )
10195 {
10296 float * rPtr = dest + rOffset ; //pointers to the start of each channel
10397 float * gPtr = dest + gOffset ; //variables are arranged in RGB but its actually BGR.
10498 float * bPtr = dest + bOffset ;
105- if ( ! useSequential )
106- {
10799
108- // process rows in parallel
109- Parallel . For ( 0 , height , ( y ) =>
110- {
111- byte * row = basePtr + ( long ) y * stride ;
112- int rowStart = y * width ;
113- int x = 0 ;
114-
115- int widthLimit = width - pixelsPerIteration + 1 ;
116- // optimize for 4 pixels at a time
117- // to remove loop overhead and (cache (?))
118- for ( ; x < widthLimit ; x += pixelsPerIteration )
119- {
120- int baseIdx = rowStart + x ;
121- byte * p = row + ( x * bytesPerPixel ) ;
122-
123- // bgr(a) values
124- // windows bitmap uses BGR order
125-
126- // process 1st pixel / pixel 0 (16bytes)
127- bPtr [ baseIdx ] = _byteToFloatLut [ p [ 0 ] ] ;
128- gPtr [ baseIdx ] = _byteToFloatLut [ p [ 1 ] ] ;
129- rPtr [ baseIdx ] = _byteToFloatLut [ p [ 2 ] ] ;
130- //alpha is ignored
131-
132- // pixel 1
133- bPtr [ baseIdx + 1 ] = _byteToFloatLut [ p [ 4 ] ] ;
134- gPtr [ baseIdx + 1 ] = _byteToFloatLut [ p [ 5 ] ] ;
135- rPtr [ baseIdx + 1 ] = _byteToFloatLut [ p [ 6 ] ] ;
136- // pixel 2
137- bPtr [ baseIdx + 2 ] = _byteToFloatLut [ p [ 8 ] ] ;
138- gPtr [ baseIdx + 2 ] = _byteToFloatLut [ p [ 9 ] ] ;
139- rPtr [ baseIdx + 2 ] = _byteToFloatLut [ p [ 10 ] ] ;
140- // pixel 3
141- bPtr [ baseIdx + 3 ] = _byteToFloatLut [ p [ 12 ] ] ;
142- gPtr [ baseIdx + 3 ] = _byteToFloatLut [ p [ 13 ] ] ;
143- rPtr [ baseIdx + 3 ] = _byteToFloatLut [ p [ 14 ] ] ;
144-
145- p += 16 ; // move pointer 16 bytes forward (4 pixels * 4 bytes per pixel)
146- }
147-
148- // handle the rest of the pixels when width is not divisible by 4
149- for ( ; x < width ; x ++ )
150- {
151- int idx = rowStart + x ;
152- byte * p = row + ( x * bytesPerPixel ) ;
153-
154- // process by BGR(a) value like before
155- bPtr [ idx ] = _byteToFloatLut [ p [ 0 ] ] ;
156- gPtr [ idx ] = _byteToFloatLut [ p [ 1 ] ] ;
157- rPtr [ idx ] = _byteToFloatLut [ p [ 2 ] ] ;
158- }
159- } ) ;
160- }
161- else
100+
101+ // TODO:
102+ // For common small IMAGE_SIZE (128/224/320) parallel would be slower ... so we will factor that in later.
103+
104+ // process rows in parallel
105+ Parallel . For ( 0 , height , ( y ) =>
162106 {
163- //handle it sequentially for small images (<=320 width) (120, 320, idk)
164- for ( int y = 0 ; y < height ; y ++ )
107+ byte * row = basePtr + ( long ) y * stride ;
108+ int rowStart = y * width ;
109+ int x = 0 ;
110+
111+ int widthLimit = width - pixelsPerIteration + 1 ;
112+ // optimize for 4 pixels at a time
113+ // to remove loop overhead and (cache (?))
114+ for ( ; x < widthLimit ; x += pixelsPerIteration )
115+ {
116+ int baseIdx = rowStart + x ;
117+ byte * p = row + ( x * bytesPerPixel ) ;
118+
119+ // bgr(a) values
120+ // windows bitmap uses BGR order
121+
122+ // process 1st pixel / pixel 0 (16bytes)
123+ bPtr [ baseIdx ] = _byteToFloatLut [ p [ 0 ] ] ;
124+ gPtr [ baseIdx ] = _byteToFloatLut [ p [ 1 ] ] ;
125+ rPtr [ baseIdx ] = _byteToFloatLut [ p [ 2 ] ] ;
126+ //alpha is ignored
127+
128+ // pixel 1
129+ bPtr [ baseIdx + 1 ] = _byteToFloatLut [ p [ 4 ] ] ;
130+ gPtr [ baseIdx + 1 ] = _byteToFloatLut [ p [ 5 ] ] ;
131+ rPtr [ baseIdx + 1 ] = _byteToFloatLut [ p [ 6 ] ] ;
132+ // pixel 2
133+ bPtr [ baseIdx + 2 ] = _byteToFloatLut [ p [ 8 ] ] ;
134+ gPtr [ baseIdx + 2 ] = _byteToFloatLut [ p [ 9 ] ] ;
135+ rPtr [ baseIdx + 2 ] = _byteToFloatLut [ p [ 10 ] ] ;
136+ // pixel 3
137+ bPtr [ baseIdx + 3 ] = _byteToFloatLut [ p [ 12 ] ] ;
138+ gPtr [ baseIdx + 3 ] = _byteToFloatLut [ p [ 13 ] ] ;
139+ rPtr [ baseIdx + 3 ] = _byteToFloatLut [ p [ 14 ] ] ;
140+
141+ p += 16 ; // move pointer 16 bytes forward (4 pixels * 4 bytes per pixel)
142+ }
143+
144+ // handle the rest of the pixels when width is not divisible by 4
145+ for ( ; x < width ; x ++ )
165146 {
166- byte * row = basePtr + ( long ) y * stride ;
167- int rowStart = y * width ;
168- int x = 0 ;
169- int widthLimit = width - pixelsPerIteration + 1 ;
170-
171- for ( ; x < widthLimit ; x += pixelsPerIteration )
172- {
173- int baseIdx = rowStart + x ;
174- byte * p = row + ( x * bytesPerPixel ) ;
175-
176- bPtr [ baseIdx ] = _byteToFloatLut [ p [ 0 ] ] ;
177- gPtr [ baseIdx ] = _byteToFloatLut [ p [ 1 ] ] ;
178- rPtr [ baseIdx ] = _byteToFloatLut [ p [ 2 ] ] ;
179-
180- bPtr [ baseIdx + 1 ] = _byteToFloatLut [ p [ 4 ] ] ;
181- gPtr [ baseIdx + 1 ] = _byteToFloatLut [ p [ 5 ] ] ;
182- rPtr [ baseIdx + 1 ] = _byteToFloatLut [ p [ 6 ] ] ;
183-
184- bPtr [ baseIdx + 2 ] = _byteToFloatLut [ p [ 8 ] ] ;
185- gPtr [ baseIdx + 2 ] = _byteToFloatLut [ p [ 9 ] ] ;
186- rPtr [ baseIdx + 2 ] = _byteToFloatLut [ p [ 10 ] ] ;
187-
188- bPtr [ baseIdx + 3 ] = _byteToFloatLut [ p [ 12 ] ] ;
189- gPtr [ baseIdx + 3 ] = _byteToFloatLut [ p [ 13 ] ] ;
190- rPtr [ baseIdx + 3 ] = _byteToFloatLut [ p [ 14 ] ] ;
191- }
192-
193- // handle the rest of the pixels when width is not divisible by 4
194- for ( ; x < width ; x ++ )
195- {
196- int idx = rowStart + x ;
197- byte * p = row + ( x * bytesPerPixel ) ;
198- bPtr [ idx ] = _byteToFloatLut [ p [ 0 ] ] ;
199- gPtr [ idx ] = _byteToFloatLut [ p [ 1 ] ] ;
200- rPtr [ idx ] = _byteToFloatLut [ p [ 2 ] ] ;
201- }
147+ int idx = rowStart + x ;
148+ byte * p = row + ( x * bytesPerPixel ) ;
149+
150+ // process by BGR(a) value like before
151+ bPtr [ idx ] = _byteToFloatLut [ p [ 0 ] ] ;
152+ gPtr [ idx ] = _byteToFloatLut [ p [ 1 ] ] ;
153+ rPtr [ idx ] = _byteToFloatLut [ p [ 2 ] ] ;
202154 }
203- }
155+ } ) ;
204156 }
205157 }
206158 finally
@@ -217,7 +169,11 @@ public static unsafe void BitmapToFloatArrayInPlace(Bitmap image, float[] result
217169 // I would just like to say now, that python users are extremely lucky: https://onnxruntime.ai/docs/performance/model-optimizations/float16.html
218170
219171
220- // convert single-precision (32-bit) float to half-precision (16-bit) float stored in ushort
172+ /// <summary>
173+ /// convert single-precision (32-bit) float to half-precision (16-bit) float stored in ushort
174+ /// </summary>
175+ /// <param name="f"></param>
176+ /// <returns></returns>
221177 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
222178 public static ushort FloatToHalfBits ( float f )
223179 {
@@ -258,7 +214,11 @@ public static ushort FloatToHalfBits(float f)
258214 return ( ushort ) ( sign | ( exp << 10 ) | mantissa ) ; // store as 16 bit
259215 }
260216
261- // convert half-precision (16-bit) float stored in ushort to single-precision (32-bit) float
217+ /// <summary>
218+ /// convert half-precision (16-bit) float stored in ushort to single-precision (32-bit) float
219+ /// </summary>
220+ /// <param name="h"></param>
221+ /// <returns></returns>
262222 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
263223 public static float HalfBitsToFloat ( ushort h )
264224 {
0 commit comments