Commit c6fac80

[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions

Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and TableGen patterns was necessary.

- Apply missing clang-format
1 parent 034eaed commit c6fac80

17 files changed: +380 / -196 lines
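As a rough illustration of the intent (a hypothetical kernel, not one of the tests touched by this commit), a uniform <2 x i32> bitwise op like the one below should now be selectable as a single 64-bit scalar instruction (e.g. s_or_b64) instead of two 32-bit ones, per the new SOPInstructions.td patterns further down.

; Hypothetical example: a uniform <2 x i32> or. With v2i32 OR legal it is
; expected to select to one s_or_b64 rather than two s_or_b32 (compare the
; updated and.ll test, which now checks for s_and_b64).
define amdgpu_kernel void @uniform_or_v2i32(ptr addrspace(1) %out,
                                            <2 x i32> %a, <2 x i32> %b) {
  %r = or <2 x i32> %a, %b
  store <2 x i32> %r, ptr addrspace(1) %out
  ret void
}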

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Lines changed: 2 additions & 0 deletions
@@ -1282,6 +1282,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   }

   SDValue Unrolled = DAG.UnrollVectorOp(Node);
+  LLVM_DEBUG(dbgs() << "\nUnrolled node: "; Unrolled->dump());
+  LLVM_DEBUG(dbgs() << "\n");
   if (Node->getNumValues() == 1) {
     Results.push_back(Unrolled);
   } else {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 94 additions & 2 deletions
@@ -40,6 +40,7 @@
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
+
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
 #include "llvm/Transforms/Utils/LowerAtomic.h"
@@ -430,6 +431,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
   }

+  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+  // Prevent SELECT from being implemented with the above bitwise ops and
+  // instead use cndmask.
+  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+  // alignbit.
+  setOperationAction(ISD::ROTR, MVT::v2i32, Legal);
+
   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                      Custom);

@@ -835,6 +844,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
   } else {
     // Legalization hack.
+    // Hmm.
     setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);

     setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
@@ -1986,6 +1996,13 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   return true;
 }

+bool SITargetLowering::shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
+                                                            EVT VT) const {
+  return (BinOpcode == ISD::AND || BinOpcode == ISD::OR ||
+          BinOpcode == ISD::XOR) &&
+         VT.getScalarType() == MVT::i64;
+}
+
 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
@@ -12872,6 +12889,51 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     }
   }

+  // Detect identity v2i32 OR and replace with identity source node.
+  // Specifically an Or that has operands constructed from the same source node
+  // via extract_vector_elt and build_vector.
+  if (VT == MVT::v2i32) {
+    if (LHS->getOpcode() == ISD::BUILD_VECTOR &&
+        RHS->getOpcode() == ISD::BUILD_VECTOR) {
+      // DAG.canonicalizeCommutativeBinop(ISD::OR, RHS, LHS);
+      SDValue BVLHS = LHS->getOperand(0);
+      SDValue CLHS = LHS->getOperand(1);
+      SDValue CRHS = RHS->getOperand(0);
+      SDValue BVRHS = RHS->getOperand(1);
+      LLVM_DEBUG(dbgs() << "### Performing v2i32 SIISelLowering "
+                           "DAGCombine::CombineOR\n";);
+
+      auto *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+      auto *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
+
+      if (LC && RC) {
+
+        // Test for and normalise build vectors.
+        if (LHS->getOpcode() == ISD::BUILD_VECTOR &&
+            RHS->getOpcode() == ISD::BUILD_VECTOR &&
+            // Check cast to constantnode here
+            LHS->getConstantOperandVal(1) == 0 &&
+            RHS->getConstantOperandVal(0) == 0) {
+
+          // Get the extract_vector_element operands.
+          SDValue LEVE = LHS->getOperand(0);
+          SDValue REVE = RHS->getOperand(1);
+
+          if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+              REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+            // Check that the elements from the same vector are extracted.
+            if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+                LEVE->getOperand(1) != REVE->getOperand(1)) {
+              LLVM_DEBUG(dbgs() << "### Found identity OR, folding...\n";);
+              SDValue IdentitySrc = LEVE.getOperand(0);
+              return IdentitySrc;
+            }
+          }
+        }
+      }
+    }
+  }
+
   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
     return SDValue();

@@ -12916,13 +12978,43 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
     return RV;

+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);

+  if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
+
+    const ConstantSDNode *CRHS_0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+    const ConstantSDNode *CRHS_1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    SDValue LHS_0 = LHS.getOperand(0);
+    SDValue LHS_1 = LHS.getOperand(1);
+
+    if (LHS.getOpcode() == ISD::VSELECT && VT == MVT::v2i32) {
+      if (CRHS_0 && CRHS_0->getAPIntValue().isSignMask() &&
+          shouldFoldFNegIntoSrc(N, LHS_0))
+        if (CRHS_1 && CRHS_1->getAPIntValue().isSignMask() &&
+            shouldFoldFNegIntoSrc(N, LHS_1)) {
+          SDLoc DL(N);
+          SDValue CastLHS =
+              DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+          SDValue CastRHS =
+              DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+          SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+          SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+          SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+                                          LHS->getOperand(0), FNegLHS, FNegRHS);
+          return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+        }
+    }
+    // Possibly split vector here if one side does have a constant RHS.
+  }
+
+  // Add test for when only one of the RHS vector elements is a const. Might be
+  // possible to optimise this case.
+
   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-  SelectionDAG &DAG = DCI.DAG;

-  EVT VT = N->getValueType(0);
   if (CRHS && VT == MVT::i64) {
     if (SDValue Split =
             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
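To make the new performOrCombine logic above concrete, here is a rough IR-level sketch (a hypothetical function; the exact DAG shape reaching the combine depends on earlier combines) of the identity OR it folds: each operand is a build_vector pairing one element extracted from the same source vector with a zero, so the OR simply reproduces that source vector.

; Sketch of the identity-OR shape: (lo, 0) | (0, hi) == %v when lo/hi are the
; two lanes of %v. In the DAG this appears as an OR of two BUILD_VECTORs whose
; non-zero lanes are EXTRACT_VECTOR_ELTs of the same node, which the combine
; replaces with %v itself.
define <2 x i32> @identity_or_v2i32(<2 x i32> %v) {
  %lo = extractelement <2 x i32> %v, i32 0
  %hi = extractelement <2 x i32> %v, i32 1
  %a = insertelement <2 x i32> zeroinitializer, i32 %lo, i32 0
  %b = insertelement <2 x i32> zeroinitializer, i32 %hi, i32 1
  %r = or <2 x i32> %a, %b
  ret <2 x i32> %r
}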

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 3 additions & 0 deletions
@@ -366,6 +366,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                          Type *Ty) const override;

+  bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
+                                            EVT VT) const override;
+
   bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                unsigned Index) const override;
   bool isExtractVecEltCheap(EVT VT, unsigned Index) const override;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 76 additions & 2 deletions
@@ -2334,9 +2334,9 @@ def : AMDGPUPatIgnoreCopies <
                        (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
 >;

-// 64-bit version
+foreach vt = [i64, v2i32] in {
 def : AMDGPUPatIgnoreCopies <
-  (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+  (DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
   (REG_SEQUENCE VReg_64,
     (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
                    (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2345,6 +2345,7 @@ def : AMDGPUPatIgnoreCopies <
                    (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
                    (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
 >;
+}

 def : AMDGPUPat <
   (fcopysign f32:$src0, f32:$src1),
@@ -2378,16 +2379,54 @@ def : AMDGPUPat <
 let True16Predicate = NotHasTrue16BitInsts in {
 def : ROTRPattern <V_ALIGNBIT_B32_e64>;

+def : AMDGPUPat <
+  (rotr v2i32:$src0, v2i32:$src1),
+  (REG_SEQUENCE VReg_64,
+    (V_ALIGNBIT_B32_e64
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src1, sub0))), sub0,
+    (V_ALIGNBIT_B32_e64
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src1, sub1))), sub1)
+>;
+
+// Prevents regression in fneg-modifier-casting.ll along with modifications to XorCombine() when v2i32 or is legal.
+def : AMDGPUPat <
+  (fneg (select i1:$src0, (f32 (bitconvert i32:$src1)), (f32 (bitconvert i32:$src2)))),
+  (V_CNDMASK_B32_e64 (i32 1), $src2, (i32 1), $src1, $src0)>;
+
 def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
           (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                               (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;

 def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
           (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                               (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
+
+def : GCNPat <
+  (rotr v2i32:$src0, v2i32:$src1),
+  (REG_SEQUENCE VReg_64,
+    (V_ALIGNBIT_B32_e64
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src1, sub0))), sub0,
+    (V_ALIGNBIT_B32_e64
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src1, sub1))), sub1)
+>;
+
 } // end True16Predicate = NotHasTrue16BitInsts

 let True16Predicate = UseRealTrue16Insts in {
+
+// Prevents regression in fneg-modifier-casting.ll along with modifications to XorCombine() when v2i32 or is legal.
+def : AMDGPUPat <
+  (fneg (select i1:$src0, (f32 (bitconvert i32:$src1)), (f32 (bitconvert i32:$src2)))),
+  (V_CNDMASK_B32_e64 (i32 1), $src2, (i32 1), $src1, $src0)>;
+
 def : GCNPat <
   (rotr i32:$src0, i32:$src1),
   (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
@@ -2397,6 +2436,20 @@ def : GCNPat <
                           /* clamp */ 0, /* op_sel */ 0)
 >;

+def : GCNPat <
+  (rotr v2i32:$src0, v2i32:$src1),
+  (REG_SEQUENCE VReg_64,
+    (V_ALIGNBIT_B32_t16_e64
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      0, (EXTRACT_SUBREG (i32 (EXTRACT_SUBREG VReg_64:$src1, sub0)), lo16), 0, 0), sub0,
+    (V_ALIGNBIT_B32_t16_e64
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      0, (EXTRACT_SUBREG (i32 (EXTRACT_SUBREG VReg_64:$src1, sub0)), lo16), 0, 0), sub1)
+>;
+
+
 def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
      (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
           (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2415,6 +2468,12 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
 } // end True16Predicate = UseRealTrue16Insts

 let True16Predicate = UseFakeTrue16Insts in {
+
+// Prevents regression in fneg-modifier-casting.ll along with modifications to XorCombine() when v2i32 or is legal.
+def : AMDGPUPat <
+  (fneg (select i1:$src0, (f32 (bitconvert i32:$src1)), (f32 (bitconvert i32:$src2)))),
+  (V_CNDMASK_B32_e64 (i32 1), $src2, (i32 1), $src1, $src0)>;
+
 def : GCNPat <
   (rotr i32:$src0, i32:$src1),
   (V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
@@ -2423,6 +2482,20 @@ def : GCNPat <
                              $src1, /* clamp */ 0, /* op_sel */ 0)
 >;

+def : GCNPat <
+  (rotr v2i32:$src0, v2i32:$src1),
+  (REG_SEQUENCE VReg_64,
+    (V_ALIGNBIT_B32_fake16_e64
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src1, sub0)), 0, 0), sub0,
+    (V_ALIGNBIT_B32_fake16_e64
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src1, sub1)), 0, 0), sub1)
+>;
+
+
 def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
      (V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
           (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2449,6 +2522,7 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
 >;
 } // end True16Predicate = UseFakeTrue16Insts

+
 /********** ====================== **********/
 /**********   Indirect addressing  **********/
 /********** ====================== **********/
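As a rough usage sketch for the new rotr patterns above (a hypothetical function, not a test from this commit): a per-lane rotate-right written with the generic funnel-shift intrinsic should be matched as ISD::ROTR once v2i32 ROTR is legal, and then selected through the alignbit patterns.

; Hypothetical example: rotr(x, amt) expressed as fshr(x, x, amt). With
; ISD::ROTR legal for v2i32, this is expected to select to a pair of
; v_alignbit_b32 instructions, one per lane.
declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)

define <2 x i32> @rotr_v2i32(<2 x i32> %x, <2 x i32> %amt) {
  %r = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %amt)
  ret <2 x i32> %r
}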

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 15 additions & 0 deletions
@@ -1779,6 +1779,21 @@ def : GCNPat <
   (S_MOV_B32 imm:$imm)
 >;

+def : GCNPat <
+  (v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
+  (S_AND_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+  (v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
+  (S_OR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+  (v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
+  (S_XOR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
 // Same as a 32-bit inreg
 def : GCNPat<
   (i32 (UniformUnaryFrag<sext> i16:$src)),

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 7 additions & 2 deletions
@@ -954,9 +954,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
 def : DivergentBinOp<adde, V_ADDC_U32_e32>;
 def : DivergentBinOp<sube, V_SUBB_U32_e32>;

-class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
+class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
   GCNPat<
-      (DivergentBinFrag<Op> i64:$src0, i64:$src1),
+      (DivergentBinFrag<Op> vt:$src0, vt:$src1),
       (REG_SEQUENCE VReg_64,
         (Inst
           (i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -973,6 +973,11 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
 def : divergent_i64_BinOp <or, V_OR_B32_e64>;
 def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;

+def : divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
+def : divergent_i64_BinOp <or, V_OR_B32_e64, v2i32>;
+def : divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;
+
+
 // mul24 w/ 64 bit output.
 class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
   (i64 (Op i32:$src0, i32:$src1)),
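And a corresponding sketch for the divergent path (again a hypothetical function): when the operands are divergent, the parameterised divergent_i64_BinOp patterns above split the v2i32 op into two 32-bit VALU instructions instead of using the scalar 64-bit form.

; Hypothetical example: %b is loaded from a lane-dependent address, so the xor
; is divergent and is expected to select to two v_xor_b32 via the
; divergent_i64_BinOp pattern rather than s_xor_b64.
declare i32 @llvm.amdgcn.workitem.id.x()

define <2 x i32> @divergent_xor_v2i32(<2 x i32> %a, ptr addrspace(1) %p) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i32>, ptr addrspace(1) %p, i32 %tid
  %b = load <2 x i32>, ptr addrspace(1) %gep
  %r = xor <2 x i32> %a, %b
  ret <2 x i32> %r
}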

llvm/test/CodeGen/AMDGPU/and.ll

Lines changed: 1 addition & 2 deletions
@@ -8,8 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

-; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
-; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b64

 define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1

llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Lines changed: 12 additions & 12 deletions
@@ -151,25 +151,25 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
 ; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
 ; GFX-950:       ; %bb.0:
 ; GFX-950-NEXT:    v_cvt_f32_f64_e32 v6, v[2:3]
+; GFX-950-NEXT:    v_and_b32_e32 v4, 1, v6
+; GFX-950-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-950-NEXT:    v_and_b32_e32 v7, 1, v6
 ; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
-; GFX-950-NEXT:    v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
-; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v7
+; GFX-950-NEXT:    v_cmp_nlg_f64_e64 s[0:1], v[2:3], v[4:5]
+; GFX-950-NEXT:    v_cvt_f32_f64_e32 v7, v[0:1]
 ; GFX-950-NEXT:    v_cndmask_b32_e64 v2, -1, 1, s[2:3]
 ; GFX-950-NEXT:    v_add_u32_e32 v2, v6, v2
-; GFX-950-NEXT:    s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT:    v_cvt_f32_f64_e32 v5, v[0:1]
+; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
 ; GFX-950-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
-; GFX-950-NEXT:    v_and_b32_e32 v6, 1, v5
+; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
+; GFX-950-NEXT:    v_and_b32_e32 v8, 1, v7
 ; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
-; GFX-950-NEXT:    v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
-; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v6
+; GFX-950-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX-950-NEXT:    v_cmp_nlg_f64_e64 s[0:1], v[0:1], v[2:3]
 ; GFX-950-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT:    v_add_u32_e32 v0, v5, v0
-; GFX-950-NEXT:    s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX-950-NEXT:    v_add_u32_e32 v0, v7, v0
+; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v4
 ; GFX-950-NEXT:    ; return to shader part epilog
   %res = fptrunc <2 x double> %src to <2 x bfloat>
