Skip to content

Commit 0cf9ea8

Browse files
committed
GS/SW: Use non-saturating ARM instructions for color gradient setup.
This is more efficient on ARM, though the equivalent instructions are not currently used in the x64 JIT and C++ versions of GSVector.

Co-authored-by: TellowKrinkle
1 parent 941e6a4 commit 0cf9ea8

File tree

1 file changed

+11
-28
lines changed

1 file changed

+11
-28
lines changed

pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp

Lines changed: 11 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -225,17 +225,13 @@ void GSSetupPrimCodeGenerator::Color()
225225
// GSVector4 c = dscan.c;
226226
armAsm->Ldr(v16, MemOperand(_dscan, offsetof(GSVertexSW, c)));
227227

228-
// constexpr VectorI mask16 = VectorI::cxpr(0xFFFF);
229-
armAsm->Movi(v17.V4S(), 0xFFFF);
230-
231-
// local.d4.c = (GSVector4i(dscan.c * step_shift) & mask16).xzyw().pu32();
228+
// GSVector4i tmp = GSVector4i(dscan.c * step_shift).xzyw();
229+
// local.d4.c = tmp.uzp1_16(tmp); // Not currently in GSVector since that's mainly targeting x86 for now
232230
armAsm->Fmul(v2.V4S(), v16.V4S(), v3.V4S());
233231
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
234-
armAsm->And(v2.V4S(), v17.V4S());
235232
armAsm->Rev64(_vscratch.V4S(), v2.V4S());
236233
armAsm->Uzp1(v2.V4S(), v2.V4S(), _vscratch.V4S());
237-
armAsm->Uqxtn(v2.V4H(), v2.V4S());
238-
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
234+
armAsm->Uzp1(v2.V8H(), v2.V8H(), v2.V8H());
239235
armAsm->Str(v2, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.c)));
240236

241237
// GSVector4 dr = c.xxxx();
@@ -246,25 +242,18 @@ void GSSetupPrimCodeGenerator::Color()
246242

247243
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
248244
{
249-
// VectorI r = (VectorI(dr * shift[1 + i]) & mask16).pu32();
245+
// VectorI r = VectorI(dr * shift[1 + i]);
250246

251247
armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
252248
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
253-
armAsm->And(v2.V4S(), v17.V4S());
254-
armAsm->Uqxtn(v2.V4H(), v2.V4S());
255-
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
256249

257-
// VectorI b = (VectorI(db * shift[1 + i]) & mask16).pu32();
250+
// VectorI b = VectorI(db * shift[1 + i]);
258251

259252
armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
260253
armAsm->Fcvtzs(v3.V4S(), v3.V4S());
261-
armAsm->And(v3.V4S(), v17.V4S());
262-
armAsm->Uqxtn(v3.V4H(), v3.V4S());
263-
armAsm->Dup(v3.V2D(), v3.V2D(), 0);
264-
265-
// m_local.d[i].rb = r.upl16(b);
266254

267-
armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
255+
// m_local.d[i].rb = r.trn1_16(b); // Not currently in GSVector since that's mainly targeting x86 for now
256+
armAsm->Trn1(v2.V8H(), v2.V8H(), v3.V8H());
268257
armAsm->Str(v2, _local(d[i].rb));
269258
}
270259

@@ -278,25 +267,19 @@ void GSSetupPrimCodeGenerator::Color()
278267

279268
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
280269
{
281-
// VectorI g = (VectorI(dg * shift[1 + i]) & mask16).pu32();
270+
// VectorI g = VectorI(dg * shift[1 + i]);
282271

283272
armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
284273
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
285-
armAsm->And(v2.V4S(), v17.V4S());
286-
armAsm->Uqxtn(v2.V4H(), v2.V4S());
287-
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
288274

289-
// VectorI a = (VectorI(da * shift[1 + i]) & mask16).pu32();
275+
// VectorI a = VectorI(da * shift[1 + i]);
290276

291277
armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
292278
armAsm->Fcvtzs(v3.V4S(), v3.V4S());
293-
armAsm->And(v3.V4S(), v17.V4S());
294-
armAsm->Uqxtn(v3.V4H(), v3.V4S());
295-
armAsm->Dup(v3.V2D(), v3.V2D(), 0);
296279

297-
// m_local.d[i].ga = g.upl16(a);
280+
// m_local.d[i].ga = g.trn1_16(a); // Not currently in GSVector since that's mainly targeting x86 for now
298281

299-
armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
282+
armAsm->Trn1(v2.V8H(), v2.V8H(), v3.V8H());
300283
armAsm->Str(v2, _local(d[i].ga));
301284
}
302285
}

0 commit comments

Comments
 (0)