Skip to content

Commit 1d3ce85

Browse files
committed
GS/SW: Mask color gradients to prevent incorrect clamping.
Co-authored-by: TellowKrinkle
1 parent b003ead commit 1d3ce85

File tree

3 files changed

+42
-26
lines changed

3 files changed

+42
-26
lines changed

pcsx2/GS/Renderers/SW/GSDrawScanline.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -323,10 +323,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
323323
{
324324
if (sel.iip)
325325
{
326+
constexpr VectorI mask16 = VectorI::cxpr(0xFFFF);
326327
#if _M_SSE >= 0x501
327-
GSVector4i::storel(&local.d8.c, GSVector4i(dscan.c * step_shift).xzyw().ps32());
328+
GSVector4i::storel(&local.d8.c, (GSVector4i(dscan.c * step_shift) & GSVector4i::cast(mask16)).xzyw().pu32());
328329
#else
329-
local.d4.c = GSVector4i(dscan.c * step_shift).xzyw().ps32();
330+
local.d4.c = (GSVector4i(dscan.c * step_shift) & mask16).xzyw().pu32();
330331
#endif
331332
VectorF dc(dscan.c);
332333

@@ -335,8 +336,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
335336

336337
for (int i = 0; i < vlen; i++)
337338
{
338-
VectorI r = VectorI(dr * shift[1 + i]).ps32();
339-
VectorI b = VectorI(db * shift[1 + i]).ps32();
339+
VectorI r = (VectorI(dr * shift[1 + i]) & mask16).pu32();
340+
VectorI b = (VectorI(db * shift[1 + i]) & mask16).pu32();
340341

341342
local.d[i].rb = r.upl16(b);
342343
}
@@ -346,8 +347,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
346347

347348
for (int i = 0; i < vlen; i++)
348349
{
349-
VectorI g = VectorI(dg * shift[1 + i]).ps32();
350-
VectorI a = VectorI(da * shift[1 + i]).ps32();
350+
VectorI g = (VectorI(dg * shift[1 + i]) & mask16).pu32();
351+
VectorI a = (VectorI(da * shift[1 + i]) & mask16).pu32();
351352

352353
local.d[i].ga = g.upl16(a);
353354
}

pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -398,12 +398,16 @@ void GSSetupPrimCodeGenerator::Color()
398398

399399
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, c)]);
400400

401-
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
401+
// constexpr VectorI mask16 = VectorI::cxpr(0xFFFF);
402+
pcmpeqd(xym15, xym15);
403+
psrld(xym15, 16);
402404

405+
// local.d4.c = (GSVector4i(dscan.c * step_shift) & mask16).xzyw().pu32();
403406
THREEARG(mulps, xmm1, xmm0, xmm3);
404407
cvttps2dq(xmm1, xmm1);
405408
pshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
406-
packssdw(xmm1, xmm1);
409+
pand(xym1, xym15);
410+
packusdw(xmm1, xmm1);
407411
if (isXmm)
408412
movdqa(_rip_local_d(c), xmm1);
409413
else
@@ -419,23 +423,25 @@ void GSSetupPrimCodeGenerator::Color()
419423

420424
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
421425
{
422-
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
426+
// VectorI r = (VectorI(dr * shift[1 + i]) & mask16).pu32();
423427

424428
if (i < 4 || many_regs)
425429
THREEARG(mulps, xym0, XYm(4 + i), xym2);
426430
else
427431
vmulps(ymm0, ymm2, ptr[g_const.m_shift_256b[i + 1]]);
428432
cvttps2dq(xym0, xym0);
429-
packssdw(xym0, xym0);
433+
pand(xym0, xym15);
434+
packusdw(xym0, xym0);
430435

431-
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
436+
// VectorI b = (VectorI(db * shift[1 + i]) & mask16).pu32();
432437

433438
if (i < 4 || many_regs)
434439
THREEARG(mulps, xym1, XYm(4 + i), xym3);
435440
else
436441
vmulps(ymm1, ymm3, ptr[g_const.m_shift_256b[i + 1]]);
437442
cvttps2dq(xym1, xym1);
438-
packssdw(xym1, xym1);
443+
pand(xym1, xym15);
444+
packusdw(xym1, xym1);
439445

440446
// m_local.d[i].rb = r.upl16(b);
441447

@@ -455,23 +461,25 @@ void GSSetupPrimCodeGenerator::Color()
455461

456462
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
457463
{
458-
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
464+
// VectorI g = (VectorI(dg * shift[1 + i]) & mask16).pu32();
459465

460466
if (i < 4 || many_regs)
461467
THREEARG(mulps, xym0, XYm(4 + i), xym2);
462468
else
463469
vmulps(ymm0, ymm2, ptr[g_const.m_shift_256b[i + 1]]);
464470
cvttps2dq(xym0, xym0);
465-
packssdw(xym0, xym0);
471+
pand(xym0, xym15);
472+
packusdw(xym0, xym0); // pu32(): pack xym0 with itself, same as the r/b/a paths; xym1 is not yet recomputed here
466473

467-
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
474+
// VectorI a = (VectorI(da * shift[1 + i]) & mask16).pu32();
468475

469476
if (i < 4 || many_regs)
470477
THREEARG(mulps, xym1, XYm(4 + i), xym3);
471478
else
472479
vmulps(ymm1, ymm3, ptr[g_const.m_shift_256b[i + 1]]);
473480
cvttps2dq(xym1, xym1);
474-
packssdw(xym1, xym1);
481+
pand(xym1, xym15);
482+
packusdw(xym1, xym1);
475483

476484
// m_local.d[i].ga = g.upl16(a);
477485

pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -225,13 +225,16 @@ void GSSetupPrimCodeGenerator::Color()
225225
// GSVector4 c = dscan.c;
226226
armAsm->Ldr(v16, MemOperand(_dscan, offsetof(GSVertexSW, c)));
227227

228-
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
228+
// constexpr VectorI mask16 = VectorI::cxpr(0xFFFF);
229+
armAsm->Movi(v17.V4S(), 0xFFFF);
229230

231+
// local.d4.c = (GSVector4i(dscan.c * step_shift) & mask16).xzyw().pu32();
230232
armAsm->Fmul(v2.V4S(), v16.V4S(), v3.V4S());
231233
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
234+
armAsm->And(v2.V4S(), v17.V4S());
232235
armAsm->Rev64(_vscratch.V4S(), v2.V4S());
233236
armAsm->Uzp1(v2.V4S(), v2.V4S(), _vscratch.V4S());
234-
armAsm->Sqxtn(v2.V4H(), v2.V4S());
237+
armAsm->Uqxtn(v2.V4H(), v2.V4S());
235238
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
236239
armAsm->Str(v2, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.c)));
237240

@@ -243,18 +246,20 @@ void GSSetupPrimCodeGenerator::Color()
243246

244247
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
245248
{
246-
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
249+
// VectorI r = (VectorI(dr * shift[1 + i]) & mask16).pu32();
247250

248251
armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
249252
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
250-
armAsm->Sqxtn(v2.V4H(), v2.V4S());
253+
armAsm->And(v2.V4S(), v17.V4S());
254+
armAsm->Uqxtn(v2.V4H(), v2.V4S());
251255
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
252256

253-
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
257+
// VectorI b = (VectorI(db * shift[1 + i]) & mask16).pu32();
254258

255259
armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
256260
armAsm->Fcvtzs(v3.V4S(), v3.V4S());
257-
armAsm->Sqxtn(v3.V4H(), v3.V4S());
261+
armAsm->And(v3.V4S(), v17.V4S());
262+
armAsm->Uqxtn(v3.V4H(), v3.V4S());
258263
armAsm->Dup(v3.V2D(), v3.V2D(), 0);
259264

260265
// m_local.d[i].rb = r.upl16(b);
@@ -273,18 +278,20 @@ void GSSetupPrimCodeGenerator::Color()
273278

274279
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
275280
{
276-
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
281+
// VectorI g = (VectorI(dg * shift[1 + i]) & mask16).pu32();
277282

278283
armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
279284
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
280-
armAsm->Sqxtn(v2.V4H(), v2.V4S());
285+
armAsm->And(v2.V4S(), v17.V4S());
286+
armAsm->Uqxtn(v2.V4H(), v2.V4S());
281287
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
282288

283-
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
289+
// VectorI a = (VectorI(da * shift[1 + i]) & mask16).pu32();
284290

285291
armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
286292
armAsm->Fcvtzs(v3.V4S(), v3.V4S());
287-
armAsm->Sqxtn(v3.V4H(), v3.V4S());
293+
armAsm->And(v3.V4S(), v17.V4S());
294+
armAsm->Uqxtn(v3.V4H(), v3.V4S());
288295
armAsm->Dup(v3.V2D(), v3.V2D(), 0);
289296

290297
// m_local.d[i].ga = g.upl16(a);

0 commit comments

Comments
 (0)