Skip to content

Commit 71b1e29

Browse files
committed
Tighten up the code for performFNEGCombine to limit the applicable types
1 parent 9550d6a commit 71b1e29

File tree

6 files changed

+157
-3965
lines changed

6 files changed

+157
-3965
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5066,15 +5066,20 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
50665066
}
50675067
case ISD::SELECT: {
50685068
// fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5069-
// This combine became necessary recently to prevent a regression after v2i32 xor was made legal.
5070-
// When adding this combine a case was added to performFNEGCombine to prevent this combine from
5071-
// being undone under certain conditions.
5069+
// This combine became necessary recently to prevent a regression in
5070+
// fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
5071+
// Specifically, additional instructions were added to the final codegen.
5072+
// When adding this combine a case was added to performFNEGCombine to
5073+
// prevent this combine from being undone under certain conditions.
50725074
// TODO: Invert conditions of foldFreeOpFromSelect
50735075
SDValue Cond = N0.getOperand(0);
50745076
SDValue LHS = N0.getOperand(1);
50755077
SDValue RHS = N0.getOperand(2);
50765078
EVT LHVT = LHS.getValueType();
50775079
EVT RHVT = RHS.getValueType();
5080+
// The regression was limited to i32 v2/i32.
5081+
if (RHVT != MVT::i32 && LHVT != MVT::i32)
5082+
return SDValue();
50785083

50795084
SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
50805085
SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5938,10 +5938,10 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
59385938
}
59395939

59405940
// Enable lowering of ROTR for vxi32 types. This is a workaround for a
5941-
// regression caused by legalising v2i32 or.
5941+
// regression in rotr.ll, whereby extra unnecessary instructions were added to
5942+
// the final codegen caused by legalising v2i32 or.
59425943
SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
5943-
unsigned Opc = Op.getOpcode();
5944-
EVT VT = Op.getValueType();
5944+
[[maybe_unused]] EVT VT = Op.getValueType();
59455945

59465946
assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
59475947
VT == MVT::v16i32) &&
@@ -12998,7 +12998,7 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
1299812998
SDValue LHS_0 = LHS.getOperand(0);
1299912999
SDValue LHS_1 = LHS.getOperand(1);
1300013000

13001-
if (LHS.getOpcode() == ISD::VSELECT && VT == MVT::v2i32) {
13001+
if (LHS.getOpcode() == ISD::VSELECT) {
1300213002
if (CRHS_0 && CRHS_0->getAPIntValue().isSignMask() &&
1300313003
shouldFoldFNegIntoSrc(N, LHS_0))
1300413004
if (CRHS_1 && CRHS_1->getAPIntValue().isSignMask() &&
@@ -13015,12 +13015,8 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
1301513015
return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
1301613016
}
1301713017
}
13018-
// Possibly split vector here if one side does have a constant RHS.
1301913018
}
1302013019

13021-
// Add test for when only one of the RHS vector elements is a const. Might be
13022-
// possible to optimise this case.
13023-
1302413020
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
1302513021

1302613022
if (CRHS && VT == MVT::i64) {

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1793,7 +1793,6 @@ def : GCNPat <
17931793
>;
17941794
}
17951795

1796-
17971796
/********** ================================ **********/
17981797
/********** Floating point absolute/negative **********/
17991798
/********** ================================ **********/
@@ -2389,31 +2388,25 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
23892388
} // end True16Predicate = NotHasTrue16BitInsts
23902389

23912390
let True16Predicate = UseRealTrue16Insts in {
2391+
def : GCNPat<(rotr i32:$src0, i32:$src1),
2392+
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
2393+
/* src1_modifiers */ 0, $src0,
2394+
/* src2_modifiers */ 0, (EXTRACT_SUBREG $src1, lo16),
2395+
/* clamp */ 0, /* op_sel */ 0)>;
23922396

2393-
def : GCNPat <
2394-
(rotr i32:$src0, i32:$src1),
2395-
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
2396-
/* src1_modifiers */ 0, $src0,
2397-
/* src2_modifiers */ 0,
2398-
(EXTRACT_SUBREG $src1, lo16),
2399-
/* clamp */ 0, /* op_sel */ 0)
2400-
>;
2401-
2402-
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2403-
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
2404-
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
2405-
0, /* src1_modifiers */
2406-
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
2407-
0, /* src2_modifiers */
2408-
(i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)),
2409-
/* clamp */ 0, /* op_sel */ 0)>;
2410-
2411-
def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
2412-
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
2413-
/* src1_modifiers */ 0, $src1,
2414-
/* src2_modifiers */ 0,
2415-
(EXTRACT_SUBREG VGPR_32:$src2, lo16),
2416-
/* clamp */ 0, /* op_sel */ 0)>;
2397+
def : GCNPat<
2398+
(i32(trunc(srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2399+
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
2400+
(i32(EXTRACT_SUBREG(i64 $src0), sub1)), 0, /* src1_modifiers */
2401+
(i32(EXTRACT_SUBREG(i64 $src0), sub0)), 0, /* src2_modifiers */
2402+
(i16(EXTRACT_SUBREG VGPR_32:$src1, lo16)),
2403+
/* clamp */ 0, /* op_sel */ 0)>;
2404+
2405+
def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
2406+
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
2407+
/* src1_modifiers */ 0, $src1,
2408+
/* src2_modifiers */ 0, (EXTRACT_SUBREG VGPR_32:$src2, lo16),
2409+
/* clamp */ 0, /* op_sel */ 0)>;
24172410
} // end True16Predicate = UseRealTrue16Insts
24182411

24192412
let True16Predicate = UseFakeTrue16Insts in {
@@ -2451,7 +2444,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
24512444
>;
24522445
} // end True16Predicate = UseFakeTrue16Insts
24532446

2454-
24552447
/********** ====================== **********/
24562448
/********** Indirect addressing **********/
24572449
/********** ====================== **********/

0 commit comments

Comments
 (0)