Commit 5adb38c

AMDGPU: Try to unspill VGPRs after rewriting MFMAs to AGPR form
After replacing VGPR MFMAs with the AGPR form, we have alleviated the VGPR pressure that may have triggered spills during allocation. Identify these spill slots, try to reassign them to newly freed VGPRs, and replace the spill instructions with copies.

Fixes #154260
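In outline, the new eliminateSpillsOfReassignedVGPRs collects the LiveStacks intervals for VGPR-class spill slots, sorts them heaviest-first, and for each slot searches the allocation order for a physical register that is now interference-free; every spill store/reload of that slot is then rewritten into a COPY of a fresh virtual register pinned to the free physical register. A condensed sketch of that loop (use collection, debug output, and VRM bookkeeping omitted; see the full diff below for the actual code):

    // Condensed from the patch: for each spilled VGPR live range, try to
    // move it back into a physical register freed by the MFMA rewrite.
    for (LiveInterval *LI : StackIntervals) { // sorted heaviest-first
      int Slot = LI->reg().stackSlotIndex();
      const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot);
      for (MCPhysReg PhysReg : RegClassInfo.getOrder(RC)) {
        if (LRM.checkInterference(*LI, PhysReg) != LiveRegMatrix::IK_Free)
          continue; // something still lives in PhysReg across this range

        // Rewrite every spill store/reload of this slot into a COPY of a
        // fresh virtual register, then assign that vreg to PhysReg.
        Register NewVReg = MRI.createVirtualRegister(RC);
        for (MachineInstr *SpillMI : SpillSlotReferences[Slot])
          replaceSpillWithCopyToVReg(*SpillMI, Slot, NewVReg);
        LRM.assign(LIS.createAndComputeVirtRegInterval(NewVReg), PhysReg);
        MFI.RemoveStackObject(Slot); // the slot is dead once unspilled
        break;
      }
    }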
1 parent: 883e110

2 files changed: +172, -41 lines

llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp

Lines changed: 165 additions & 4 deletions
@@ -28,6 +28,7 @@
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/LiveStacks.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/InitializePasses.h"
@@ -38,6 +39,9 @@ using namespace llvm;
 
 namespace {
 
+/// Map from spill slot frame index to list of instructions which reference it.
+using SpillReferenceMap = DenseMap<int, SmallVector<MachineInstr *, 4>>;
+
 class AMDGPURewriteAGPRCopyMFMAImpl {
   MachineFunction &MF;
   const GCNSubtarget &ST;
@@ -47,6 +51,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
   VirtRegMap &VRM;
   LiveRegMatrix &LRM;
   LiveIntervals &LIS;
+  LiveStacks &LSS;
   const RegisterClassInfo &RegClassInfo;
 
   bool attemptReassignmentsToAGPR(SmallSetVector<Register, 4> &InterferingRegs,
@@ -55,10 +60,11 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
 public:
   AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
                                 LiveRegMatrix &LRM, LiveIntervals &LIS,
+                                LiveStacks &LSS,
                                 const RegisterClassInfo &RegClassInfo)
       : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
         TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
-        LIS(LIS), RegClassInfo(RegClassInfo) {}
+        LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo) {}
 
   bool isRewriteCandidate(const MachineInstr &MI) const {
     return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1;
@@ -106,6 +112,22 @@
 
   bool tryFoldCopiesToAGPR(Register VReg, MCRegister AssignedAGPR) const;
   bool tryFoldCopiesFromAGPR(Register VReg, MCRegister AssignedAGPR) const;
+
+  /// Replace spill instruction \p SpillMI which loads/stores from/to \p SpillFI
+  /// with a COPY to the replacement register value \p VReg.
+  void replaceSpillWithCopyToVReg(MachineInstr &SpillMI, int SpillFI,
+                                  Register VReg) const;
+
+  /// Create a map from frame index to use instructions for spills. If the uses
+  /// of a frame index do not consist only of spill instructions, it will not
+  /// be included in the map.
+  void collectSpillIndexUses(ArrayRef<LiveInterval *> StackIntervals,
+                             SpillReferenceMap &Map) const;
+
+  /// Attempt to unspill VGPRs by finding a free register and replacing the
+  /// spill instructions with copies.
+  void eliminateSpillsOfReassignedVGPRs() const;
+
   bool run(MachineFunction &MF) const;
 };
 
@@ -392,6 +414,133 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesFromAGPR(
   return MadeChange;
 }
 
+void AMDGPURewriteAGPRCopyMFMAImpl::replaceSpillWithCopyToVReg(
+    MachineInstr &SpillMI, int SpillFI, Register VReg) const {
+  const DebugLoc &DL = SpillMI.getDebugLoc();
+  MachineBasicBlock &MBB = *SpillMI.getParent();
+  MachineInstr *NewCopy;
+  if (SpillMI.mayStore()) {
+    NewCopy = BuildMI(MBB, SpillMI, DL, TII.get(TargetOpcode::COPY), VReg)
+                  .add(SpillMI.getOperand(0));
+  } else {
+    NewCopy = BuildMI(MBB, SpillMI, DL, TII.get(TargetOpcode::COPY))
+                  .add(SpillMI.getOperand(0))
+                  .addReg(VReg);
+  }
+
+  LIS.ReplaceMachineInstrInMaps(SpillMI, *NewCopy);
+  SpillMI.eraseFromParent();
+}
+
+void AMDGPURewriteAGPRCopyMFMAImpl::collectSpillIndexUses(
+    ArrayRef<LiveInterval *> StackIntervals, SpillReferenceMap &Map) const {
+
+  SmallSet<int, 4> NeededFrameIndexes;
+  for (const LiveInterval *LI : StackIntervals)
+    NeededFrameIndexes.insert(LI->reg().stackSlotIndex());
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (MachineOperand &MO : MI.operands()) {
+        if (!MO.isFI() || !NeededFrameIndexes.count(MO.getIndex()))
+          continue;
+
+        SmallVector<MachineInstr *, 4> &References = Map[MO.getIndex()];
+        if (TII.isVGPRSpill(MI)) {
+          References.push_back(&MI);
+          break;
+        }
+
+        // Verify this was really a spill instruction; if it's not, just ignore
+        // all uses.
+
+        // TODO: This should probably be verifier enforced.
+        NeededFrameIndexes.erase(MO.getIndex());
+        Map.erase(MO.getIndex());
+      }
+    }
+  }
+}
+
+void AMDGPURewriteAGPRCopyMFMAImpl::eliminateSpillsOfReassignedVGPRs() const {
+  unsigned NumSlots = LSS.getNumIntervals();
+  if (NumSlots == 0)
+    return;
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  SmallVector<LiveInterval *, 32> StackIntervals;
+  StackIntervals.reserve(NumSlots);
+
+  for (auto I = LSS.begin(), E = LSS.end(); I != E; ++I) {
+    int Slot = I->first;
+    if (!MFI.isSpillSlotObjectIndex(Slot) || MFI.isDeadObjectIndex(Slot))
+      continue;
+
+    LiveInterval &LI = I->second;
+    const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot);
+    if (TRI.hasVGPRs(RC))
+      StackIntervals.push_back(&LI);
+  }
+
+  // Sort heaviest intervals first to prioritize their unspilling.
+  sort(StackIntervals, [](const LiveInterval *A, const LiveInterval *B) {
+    return A->weight() > B->weight();
+  });
+
+  // FIXME: The APIs for dealing with the LiveInterval of a frame index are
+  // cumbersome. LiveStacks owns its LiveIntervals which refer to stack
+  // slots. We cannot use the usual LiveRegMatrix::assign and unassign on these,
+  // and must create a substitute virtual register to do so. This makes
+  // incremental updating here difficult; we need to actually perform the IR
+  // mutation to get the new vreg references in place to compute the register
+  // LiveInterval to perform an assignment to track the new interference
+  // correctly, and we can't simply migrate the LiveInterval we already have.
+  //
+  // To avoid walking through the entire function for each index, pre-collect
+  // all the instructions' slot references.
+
+  DenseMap<int, SmallVector<MachineInstr *, 4>> SpillSlotReferences;
+  collectSpillIndexUses(StackIntervals, SpillSlotReferences);
+
+  for (LiveInterval *LI : StackIntervals) {
+    int Slot = LI->reg().stackSlotIndex();
+    auto SpillReferences = SpillSlotReferences.find(Slot);
+    if (SpillReferences == SpillSlotReferences.end())
+      continue;
+
+    const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot);
+
+    LLVM_DEBUG(dbgs() << "Trying to eliminate " << printReg(Slot, &TRI)
+                      << " by reassigning\n");
+
+    ArrayRef<MCPhysReg> AllocOrder = RegClassInfo.getOrder(RC);
+
+    for (MCPhysReg PhysReg : AllocOrder) {
+      if (LRM.checkInterference(*LI, PhysReg) != LiveRegMatrix::IK_Free)
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Reassigning " << *LI << " to "
+                        << printReg(PhysReg, &TRI) << '\n');
+
+      const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot);
+      Register NewVReg = MRI.createVirtualRegister(RC);
+
+      for (MachineInstr *SpillMI : SpillReferences->second)
+        replaceSpillWithCopyToVReg(*SpillMI, Slot, NewVReg);
+
+      // TODO: We should be able to transfer the information from the stack
+      // slot's LiveInterval without recomputing from scratch with the
+      // replacement vreg uses.
+      LiveInterval &NewLI = LIS.createAndComputeVirtRegInterval(NewVReg);
+      VRM.grow();
+      LRM.assign(NewLI, PhysReg);
+      MFI.RemoveStackObject(Slot);
+      break;
+    }
+  }
+}
+
 bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
   // This only applies on subtargets that have a configurable AGPR vs. VGPR
   // allocation.
@@ -418,6 +567,12 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
     MadeChange = true;
   }
 
+  // If we've successfully rewritten some MFMAs, we've alleviated some VGPR
+  // pressure. See if we can eliminate some spills now that those registers are
+  // more available.
+  if (MadeChange)
+    eliminateSpillsOfReassignedVGPRs();
+
   return MadeChange;
 }
 
@@ -441,10 +596,13 @@ class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass {
     AU.addRequired<LiveIntervalsWrapperPass>();
     AU.addRequired<VirtRegMapWrapperLegacy>();
     AU.addRequired<LiveRegMatrixWrapperLegacy>();
+    AU.addRequired<LiveStacksWrapperLegacy>();
 
     AU.addPreserved<LiveIntervalsWrapperPass>();
     AU.addPreserved<VirtRegMapWrapperLegacy>();
     AU.addPreserved<LiveRegMatrixWrapperLegacy>();
+    AU.addPreserved<LiveStacksWrapperLegacy>();
+
     AU.setPreservesAll();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -457,6 +615,7 @@ INITIALIZE_PASS_BEGIN(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE,
 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy)
 INITIALIZE_PASS_END(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE,
                     "AMDGPU Rewrite AGPR-Copy-MFMA", false, false)
 
@@ -475,8 +634,8 @@ bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction(
   auto &VRM = getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
   auto &LRM = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
   auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
-
-  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, RegClassInfo);
+  auto &LSS = getAnalysis<LiveStacksWrapperLegacy>().getLS();
+  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
   return Impl.run(MF);
 }
 
@@ -486,13 +645,15 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF,
   VirtRegMap &VRM = MFAM.getResult<VirtRegMapAnalysis>(MF);
   LiveRegMatrix &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
   LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
+  LiveStacks &LSS = MFAM.getResult<LiveStacksAnalysis>(MF);
   RegisterClassInfo RegClassInfo;
   RegClassInfo.runOnMachineFunction(MF);
 
-  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, RegClassInfo);
+  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
   if (!Impl.run(MF))
     return PreservedAnalyses::all();
   auto PA = getMachineFunctionPassPreservedAnalyses();
   PA.preserveSet<CFGAnalyses>();
+  PA.preserve<LiveStacksAnalysis>();
   return PA;
 }

llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll

Lines changed: 7 additions & 37 deletions
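The updated expectations below show the payoff: inline-asm values that previously had to be defined in v[0:3] and immediately spilled around the MFMA region are now defined directly in VGPRs freed by the AGPR rewrite (v[10:13], plus v[14:17] in the _x2 test), so the buffer_store_dword spill and buffer_load_dword reload sequences disappear and the final global stores read those registers directly.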
@@ -101,13 +101,8 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
 ; CHECK-NEXT:    v_accvgpr_read_b32 v2, a2
 ; CHECK-NEXT:    v_accvgpr_read_b32 v3, a3
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[0:3]
+; CHECK-NEXT:    ; def v[10:13]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
@@ -147,12 +142,7 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[2:5], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v0, v[10:13], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -311,26 +301,16 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    v_accvgpr_write_b32 a33, v1
 ; CHECK-NEXT:    v_accvgpr_write_b32 a32, v0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v7, a3
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v6, a2
 ; CHECK-NEXT:    v_accvgpr_read_b32 v5, a1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v4, a0
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[0:3]
+; CHECK-NEXT:    ; def v[14:17]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[0:3]
+; CHECK-NEXT:    ; def v[10:13]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -369,19 +349,9 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[2:5], s[16:17]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; CHECK-NEXT:    global_store_dwordx4 v0, v[14:17], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[2:5], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v0, v[10:13], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
