Skip to content

Commit 8791a8f

Browse files
committed
add resizer implementation for SAST in FUNQUE
1 parent 0f6e0a8 commit 8791a8f

File tree

13 files changed

+3308
-0
lines changed

13 files changed

+3308
-0
lines changed

libvmaf/meson_options.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,9 @@ option('enable_nvtx',
3737
type: 'boolean',
3838
value: false,
3939
description: 'Enable NVTX range support')
40+
41+
option('enable_sast_resizer',
42+
type: 'boolean',
43+
value: false,
44+
description: 'Compile the SAST resizer from FUNQUE into the library')
45+
Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
/**
2+
*
3+
* Copyright (c) 2022-2024 Meta, Inc.
4+
*
5+
* Licensed under the BSD 3-Clause License (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* https://opensource.org/license/bsd-3-clause
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
#include <stdio.h>
20+
#include <math.h>
21+
#include <stdlib.h>
22+
#include <string.h>
23+
#include <arm_neon.h>
24+
#include <time.h>
25+
#include "../resizer.h"
26+
#include "resizer_neon.h"
27+
28+
#if OPTIMISED_COEFF
29+
void hresize_neon(const unsigned char **src, int **dst, int count,
30+
const short *alpha,
31+
int swidth, int dwidth, int cn, int xmin, int xmax)
32+
#else
33+
void hresize_neon(const unsigned char **src, int **dst, int count,
34+
const int *xofs, const short *alpha,
35+
int swidth, int dwidth, int cn, int xmin, int xmax)
36+
#endif
37+
{
38+
// int first_col_count = 0;
39+
uint8x8_t src1_8x8, src2_8x8, src3_8x8;
40+
int simd_loop = (xmax / 8) * 8;
41+
int num_pix = 8;
42+
43+
#if OPTIMISED_COEFF
44+
int sx_start = 2;
45+
#else
46+
int sx_start = xofs[1];
47+
#endif
48+
49+
for (int k = 0; k < count; k++)
50+
{
51+
const unsigned char *S = src[k];
52+
int *D = dst[k];
53+
int dx = 0, limit = xmin;
54+
for (;;)
55+
{
56+
#if OPTIMISED_COEFF
57+
for (; dx < limit; dx++)
58+
{
59+
int j;
60+
int sx = (dx * 2) - cn;
61+
#else
62+
for (; dx < limit; dx++, alpha += 4)
63+
{
64+
int j;
65+
int sx = xofs[dx] - cn;
66+
#endif
67+
int v = 0;
68+
for (j = 0; j < 4; j++)
69+
{
70+
int sxj = sx + j * cn;
71+
if ((unsigned)sxj >= (unsigned)swidth)
72+
{
73+
while (sxj < 0)
74+
sxj += cn;
75+
while (sxj >= swidth)
76+
sxj -= cn;
77+
}
78+
v += S[sxj] * alpha[j];
79+
}
80+
D[dx] = v;
81+
}
82+
if (limit == dwidth)
83+
break;
84+
85+
int start = sx_start - cn;
86+
src1_8x8 = vld1_u8(S + start);
87+
#if OPTIMISED_COEFF
88+
for (; dx < simd_loop;)
89+
{
90+
#else
91+
for (; dx < simd_loop; alpha += 32)
92+
{
93+
#endif
94+
start += num_pix;
95+
src2_8x8 = vld1_u8(S + start);
96+
start += num_pix;
97+
src3_8x8 = vld1_u8(S + start);
98+
99+
uint16x8_t movl1_16x8 = vmovl_u8(src1_8x8);
100+
uint16x8_t movl2_16x8 = vmovl_u8(src2_8x8);
101+
uint16x8_t movl3_16x8 = vmovl_u8(src3_8x8);
102+
int16x8_t s_movl1_16x8 = vreinterpretq_s16_u16(movl1_16x8);
103+
int16x8_t s_movl2_16x8 = vreinterpretq_s16_u16(movl2_16x8);
104+
int16x8_t s_movl3_16x8 = vreinterpretq_s16_u16(movl3_16x8);
105+
int16x8x2_t t1 = vuzpq_s16(s_movl1_16x8, s_movl2_16x8); // 0 odd, 1 even
106+
int16x8x2_t t2 = vuzpq_s16(s_movl3_16x8, s_movl3_16x8);
107+
int16x8_t vx1 = vextq_s16(t1.val[0], t2.val[0], 1); // s_movl3_16x8,1);
108+
int16x8_t vx2 = vextq_s16(t1.val[1], t2.val[1], 1);
109+
int32x4_t m1_l = vmull_n_s16(vget_low_s16(t1.val[0]), alpha[0]);
110+
int32x4_t m1_h = vmull_n_s16(vget_high_s16(t1.val[0]), alpha[0]);
111+
int32x4_t m2_l = vmlal_n_s16(m1_l, vget_low_s16(vx1), alpha[1]);
112+
int32x4_t m2_h = vmlal_n_s16(m1_h, vget_high_s16(vx1), alpha[1]);
113+
int32x4_t m3_l = vmlal_n_s16(m2_l, vget_low_s16(t1.val[1]), alpha[2]);
114+
int32x4_t m3_h = vmlal_n_s16(m2_h, vget_high_s16(t1.val[1]), alpha[2]);
115+
int32x4_t out_l = vmlal_n_s16(m3_l, vget_low_s16(vx2), alpha[3]); // final out
116+
int32x4_t out_h = vmlal_n_s16(m3_h, vget_high_s16(vx2), alpha[3]); // final out
117+
118+
vst1q_s32(D + dx, out_l);
119+
dx += 4;
120+
vst1q_s32(D + dx, out_h);
121+
dx += 4;
122+
src1_8x8 = src3_8x8;
123+
}
124+
125+
#if OPTIMISED_COEFF
126+
for (; dx < xmax; dx++)
127+
{
128+
int sx2 = dx * 2;
129+
#else
130+
for (; dx < xmax; dx++, alpha += 4)
131+
{
132+
int sx2 = xofs[dx]; // sx - 2, 4, 6, 8....
133+
#endif
134+
D[dx] = S[sx2 - 1] * alpha[0] + S[sx2] * alpha[1] + S[sx2 + 1] * alpha[2] + S[sx2 + 2] * alpha[3];
135+
}
136+
limit = dwidth;
137+
}
138+
#if !OPTIMISED_COEFF
139+
alpha -= dwidth * 4;
140+
#endif
141+
}
142+
}
143+
144+
void vresize_neon(const int **src, unsigned char *dst, const short *beta, int width)
145+
{
146+
int32x4_t src_1, src_2, src_3, src_4, src_1_mul;
147+
int32x4_t d4_q;
148+
int32x4_t add_1;
149+
int32x4_t add_delta;
150+
int32x4_t shift_right_32x4;
151+
uint16x4_t shift_right_16x4;
152+
uint16x8_t shift_right_16x8;
153+
int32x4_t dt;
154+
uint8x8_t dt2;
155+
156+
157+
#define BITS 22
158+
int bits = BITS;
159+
160+
// int32x4_t SHIFT = vdupq_n_s32(bits);
161+
int DELTA = (1 << (bits - 1));
162+
// b1_vq = vdupq_n_s32(beta[0]);
163+
// b2_vq = vdupq_n_s32(beta[1]);
164+
// b3_vq = vdupq_n_s32(beta[2]);
165+
// b4_vq = vdupq_n_s32(beta[3]);
166+
d4_q = vdupq_n_s32(DELTA);
167+
src_1_mul = vdupq_n_s32(0);
168+
169+
int32x4_t lower = vdupq_n_s32(0);
170+
int32x4_t higher = vdupq_n_s32(255);
171+
172+
for (int x = 0; x < width; x += 4)
173+
{
174+
src_1 = vld1q_s32(src[0] + x);
175+
src_2 = vld1q_s32(src[1] + x);
176+
src_3 = vld1q_s32(src[2] + x);
177+
src_4 = vld1q_s32(src[3] + x);
178+
179+
add_1 = vmlaq_n_s32(src_1_mul, src_1, beta[0]);
180+
add_1 = vmlaq_n_s32(add_1, src_2, beta[1]);
181+
add_1 = vmlaq_n_s32(add_1, src_3, beta[2]);
182+
add_1 = vmlaq_n_s32(add_1, src_4, beta[3]);
183+
184+
add_delta = vaddq_s32(add_1, d4_q);
185+
186+
shift_right_32x4 = vshrq_n_s32(add_delta, BITS); // 32x4
187+
188+
dt = vminq_s32(shift_right_32x4, higher);
189+
dt = vmaxq_s32(dt, lower);
190+
191+
// shift_right_32x4 = vshrq_n_s32(add_delta, BITS); // 32x4
192+
193+
shift_right_16x4 = vqmovun_s32(dt); // 16x4
194+
shift_right_16x8 = vcombine_u16(shift_right_16x4, shift_right_16x4); // 16x8
195+
dt2 = vqmovn_u16(shift_right_16x8); // 8x8
196+
197+
vst1_lane_u32((unsigned int *)(dst + x), vreinterpret_u32_u8(dt2), 0);
198+
}
199+
200+
#undef BITS
201+
}
202+
203+
static int clip_neon(int x, int a, int b)
204+
{
205+
return x >= a ? (x < b ? x : b - 1) : a;
206+
}
207+
208+
#if OPTIMISED_COEFF
209+
void step_neon(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax)
210+
#else
211+
void step_neon(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax)
212+
#endif
213+
{
214+
int dy, cn = channels;
215+
216+
int bufstep = (int)((dwidth + 16 - 1) & -16);
217+
int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int));
218+
if (_buffer == NULL)
219+
{
220+
printf("malloc fails\n");
221+
}
222+
const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
223+
int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
224+
int prev_sy[MAX_ESIZE];
225+
226+
for (int k = 0; k < ksize; k++)
227+
{
228+
prev_sy[k] = -1;
229+
rows[k] = _buffer + bufstep * k;
230+
}
231+
232+
#if !OPTIMISED_COEFF
233+
const short *beta = _beta + ksize * start;
234+
#endif
235+
236+
237+
#if OPTIMISED_COEFF
238+
for (dy = start; dy < end; dy++)
239+
{
240+
int sy0 = dy * 2;
241+
#else
242+
for (dy = start; dy < end; dy++, beta += ksize)
243+
{
244+
int sy0 = yofs[dy];
245+
#endif
246+
int k0 = ksize, k1 = 0, ksize2 = ksize / 2;
247+
248+
for (int k = 0; k < ksize; k++)
249+
{
250+
int sy = clip_neon(sy0 - ksize2 + 1 + k, 0, iheight);
251+
for (k1 = MAX(k1, k); k1 < ksize; k1++)
252+
{
253+
if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it.
254+
{
255+
if (k1 > k)
256+
memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0]));
257+
break;
258+
}
259+
}
260+
if (k1 == ksize)
261+
k0 = MIN(k0, k); // remember the first row that needs to be computed
262+
srows[k] = _src + (sy * iwidth);
263+
prev_sy[k] = sy;
264+
}
265+
266+
267+
268+
#if OPTIMISED_COEFF
269+
if (k0 < ksize)
270+
{
271+
hresize_neon((srows + k0), (rows + k0), ksize - k0, _alpha,
272+
iwidth, dwidth, cn, xmin, xmax);
273+
}
274+
#if USE_C_VRESIZE
275+
vresize((const int **)rows, (_dst + dwidth * dy), _beta, dwidth);
276+
#elif !USE_C_VRESIZE
277+
vresize_neon((const int **)rows, (_dst + dwidth * dy), _beta, dwidth);
278+
#endif
279+
#else
280+
if (k0 < ksize)
281+
{
282+
hresize_neon((srows + k0), (rows + k0), ksize - k0, xofs, _alpha,
283+
iwidth, dwidth, cn, xmin, xmax);
284+
}
285+
#if USE_C_VRESIZE
286+
vresize((const int **)rows, (_dst + dwidth * dy), beta, dwidth);
287+
#elif !USE_C_VRESIZE
288+
vresize_neon((const int **)rows, (_dst + dwidth * dy), beta, dwidth);
289+
#endif
290+
#endif
291+
292+
}
293+
294+
free(_buffer);
295+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/**
2+
*
3+
* Copyright (c) 2022-2024 Meta, Inc.
4+
*
5+
* Licensed under the BSD 3-Clause License (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* https://opensource.org/license/bsd-3-clause
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
#if OPTIMISED_COEFF
20+
void step_neon(const unsigned char *_src, unsigned char *_dst,
21+
const short *_alpha, const short *_beta,
22+
int iwidth, int iheight, int dwidth, int channels,
23+
int ksize, int start, int end, int xmin, int xmax);
24+
#else
25+
void step_neon(const unsigned char *_src, unsigned char *_dst,
26+
const int *xofs, const int *yofs,
27+
const short *_alpha, const short *_beta,
28+
int iwidth, int iheight, int dwidth, int dheight, int channels,
29+
int ksize, int start, int end, int xmin, int xmax);
30+
#endif

0 commit comments

Comments
 (0)