diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index 20b5fd94aba94..f433b5aec5162 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -14,12 +14,7 @@ /// MFMA opcode. /// /// TODO: -/// - Handle non-tied dst+src2 cases. We need to try to find a copy from an -/// AGPR from src2, or reassign src2 to an available AGPR (which should work -/// in the common case of a load). -/// -/// - Handle multiple MFMA uses of the same register. e.g. chained MFMAs that -/// can be rewritten as a set +/// - Handle SplitKit partial copy bundles, and not just full copy instructions /// /// - Update LiveIntervals incrementally instead of recomputing from scratch /// @@ -49,66 +44,149 @@ class AMDGPURewriteAGPRCopyMFMAImpl { VirtRegMap &VRM; LiveRegMatrix &LRM; LiveIntervals &LIS; + const RegisterClassInfo &RegClassInfo; + + bool attemptReassignmentsToAGPR(SmallSetVector &InterferingRegs, + MCPhysReg PrefPhysReg) const; public: AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM, - LiveRegMatrix &LRM, LiveIntervals &LIS) + LiveRegMatrix &LRM, LiveIntervals &LIS, + const RegisterClassInfo &RegClassInfo) : ST(MF.getSubtarget()), TII(*ST.getInstrInfo()), TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM), - LIS(LIS) {} - - // TODO: Remove this restriction - bool mfmaHasSameSrc2AndDstReg(const MachineInstr &MI) const { - const MachineOperand *Src2 = TII.getNamedOperand(MI, AMDGPU::OpName::src2); - const MachineOperand *Dst = TII.getNamedOperand(MI, AMDGPU::OpName::vdst); - return Src2->getReg() == Dst->getReg() && - Src2->getSubReg() == Dst->getSubReg(); - } + LIS(LIS), RegClassInfo(RegClassInfo) {} bool isRewriteCandidate(const MachineInstr &MI) const { - return TII.isMAI(MI) && - AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1 && - mfmaHasSameSrc2AndDstReg(MI); + return TII.isMAI(MI) && 
AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1; } /// Compute the register class constraints based on the uses of \p Reg, /// excluding MFMA uses from which can be rewritten to change the register /// class constraint. This should be nearly identical to /// MachineRegisterInfo::recomputeRegClass. - const TargetRegisterClass * - recomputeRegClassExceptRewritable(Register Reg, - const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC) const; + + /// \p RewriteCandidates will collect the set of MFMA instructions that need + /// to have the opcode mutated to perform the replacement. + /// + /// \p RewriteRegs will accumulate the set of registers used by those MFMAs + /// that need to have the register classes adjusted. + const TargetRegisterClass *recomputeRegClassExceptRewritable( + Register Reg, const TargetRegisterClass *OldRC, + const TargetRegisterClass *NewRC, + SmallVectorImpl &RewriteCandidates, + SmallSetVector &RewriteRegs) const; bool run(MachineFunction &MF) const; }; const TargetRegisterClass * AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( - Register Reg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC) const { - - // Accumulate constraints from all uses. - for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { - // Apply the effect of the given operand to NewRC. - MachineInstr *MI = MO.getParent(); - - // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the - // effects of rewrite candidates. It just so happens that we can use either - // AGPR or VGPR in src0/src1, so don't bother checking the constraint - // effects of the individual operands. 
- if (isRewriteCandidate(*MI)) - continue; + Register StartReg, const TargetRegisterClass *OldRC, + const TargetRegisterClass *NewRC, + SmallVectorImpl &RewriteCandidates, + SmallSetVector &RewriteRegs) const { + SmallVector Worklist = {StartReg}; + + // Recursively visit all transitive MFMA users + while (!Worklist.empty()) { + Register Reg = Worklist.pop_back_val(); + // Accumulate constraints from all uses. + for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { + // Apply the effect of the given operand to NewRC. + MachineInstr *MI = MO.getParent(); + + // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the + // effects of rewrite candidates. It just so happens that we can use + // either AGPR or VGPR in src0/src1, so don't bother checking the + // constraint effects of the individual operands. + if (isRewriteCandidate(*MI)) { + for (AMDGPU::OpName OpName : + {AMDGPU::OpName::vdst, AMDGPU::OpName::src2}) { + const MachineOperand *Op = TII.getNamedOperand(*MI, OpName); + if (!Op->isReg()) + continue; + + Register OtherReg = Op->getReg(); + if (OtherReg != Reg) { + if (RewriteRegs.insert(OtherReg)) + Worklist.push_back(OtherReg); + } + } + + LLVM_DEBUG(dbgs() << "Ignoring effects of " << *MI); + + if (!is_contained(RewriteCandidates, MI)) + RewriteCandidates.push_back(MI); + + continue; + } - unsigned OpNo = &MO - &MI->getOperand(0); - NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI); - if (!NewRC || NewRC == OldRC) - return nullptr; + unsigned OpNo = &MO - &MI->getOperand(0); + NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI); + if (!NewRC || NewRC == OldRC) { + LLVM_DEBUG(dbgs() << "User of " << printReg(Reg, &TRI) + << " cannot be reassigned to AGPR: " << *MI); + return nullptr; + } + } } return NewRC; } +/// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a +/// preference to use \p PrefPhysReg first. Returns false if the reassignments +/// cannot be trivially performed. 
+bool AMDGPURewriteAGPRCopyMFMAImpl::attemptReassignmentsToAGPR( + SmallSetVector &InterferingRegs, MCPhysReg PrefPhysReg) const { + // FIXME: The ordering may matter here, but we're just taking uselistorder + // with the special case of ensuring to process the starting instruction + // first. We probably should extract the priority advisor out of greedy and + // use that ordering. + for (Register InterferingReg : InterferingRegs) { + LiveInterval &ReassignLI = LIS.getInterval(InterferingReg); + const TargetRegisterClass *EquivalentAGPRRegClass = + TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg)); + + MCPhysReg Assignable = AMDGPU::NoRegister; + if (EquivalentAGPRRegClass->contains(PrefPhysReg) && + LRM.checkInterference(ReassignLI, PrefPhysReg) == + LiveRegMatrix::IK_Free) { + // First try to assign to the AGPR we were already copying to. This + // should be the first assignment we attempt. We have to guard + // against the use being a subregister (which doesn't have an exact + // class match). + + // TODO: If this does happen to be a subregister use, we should + // still try to assign to a subregister of the original copy result. + Assignable = PrefPhysReg; + } else { + ArrayRef AllocOrder = + RegClassInfo.getOrder(EquivalentAGPRRegClass); + for (MCPhysReg Reg : AllocOrder) { + if (LRM.checkInterference(ReassignLI, Reg) == LiveRegMatrix::IK_Free) { + Assignable = Reg; + break; + } + } + } + + if (!Assignable) { + LLVM_DEBUG(dbgs() << "Unable to reassign VGPR " + << printReg(InterferingReg, &TRI) + << " to a free AGPR\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "Reassigning VGPR " << printReg(InterferingReg, &TRI) + << " to " << printReg(Assignable, &TRI) << '\n'); + LRM.assign(ReassignLI, Assignable); + } + + return true; +} + bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // This only applies on subtargets that have a configurable AGPR vs. VGPR // allocation. 
@@ -145,7 +223,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { LiveInterval &LI = LIS.getInterval(VReg); - // TODO: Test multiple uses for (VNInfo *VNI : LI.vnis()) { if (VNI->isPHIDef() || VNI->isUnused()) continue; @@ -157,55 +234,50 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { if (!DefMI || !DefMI->isFullCopy()) continue; - Register CopySrcReg = DefMI->getOperand(1).getReg(); - if (!CopySrcReg.isVirtual()) + Register MFMADstReg = DefMI->getOperand(1).getReg(); + if (!MFMADstReg.isVirtual()) continue; - LiveInterval &CopySrcLI = LIS.getInterval(CopySrcReg); + LiveInterval &CopySrcLI = LIS.getInterval(MFMADstReg); LiveQueryResult LRQ = CopySrcLI.Query(VNI->def.getRegSlot()); - MachineInstr *CopySrcMI = LIS.getInstructionFromIndex(LRQ.valueIn()->def); - if (!CopySrcMI) + MachineInstr *MFMA = LIS.getInstructionFromIndex(LRQ.valueIn()->def); + if (!MFMA || !isRewriteCandidate(*MFMA)) continue; - int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(CopySrcMI->getOpcode()); - if (AGPROp == -1) + MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2); + if (!Src2->isReg()) continue; - MachineOperand *Src2 = - TII.getNamedOperand(*CopySrcMI, AMDGPU::OpName::src2); + Register Src2Reg = Src2->getReg(); + if (!Src2Reg.isVirtual()) + continue; // FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead // of an AGPR or VGPR subclass, so we can't simply use the result on the // assignment. 
LLVM_DEBUG({ - Register Src2PhysReg = VRM.getPhys(Src2->getReg()); dbgs() << "Attempting to replace VGPR MFMA with AGPR version:" << " Dst=[" << printReg(VReg) << " => " - << printReg(PhysReg, &TRI) << "], Src2=[" - << printReg(Src2->getReg(), &TRI) << " => " - << printReg(Src2PhysReg, &TRI) << "]: " << *CopySrcMI; + << printReg(PhysReg, &TRI) << ']'; + + if (Src2Reg) { + Register Src2PhysReg = VRM.getPhys(Src2Reg); + dbgs() << ", Src2=[" << printReg(Src2Reg, &TRI) << " => " + << printReg(Src2PhysReg, &TRI) << "]: " << *MFMA; + } }); - // If the inputs are tied and the same register, we can shortcut and - // directly replace the register. - if (!Src2->isReg() || Src2->getReg() != CopySrcReg || - Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) { - LLVM_DEBUG( - dbgs() - << "Replacing untied VGPR MFMAs with AGPR form not yet handled\n"); - // TODO: Only handles the tied case for now. If the input operand is a - // different register, we need to also reassign it (either by looking - // for a compatible copy-from-AGPR, or by seeing if an available AGPR is - // compatible with all other uses. - - // If we can't reassign it, we'd need to introduce a different copy - // which is likely worse than the copy we'd be saving. - continue; - } + const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(MFMADstReg); - const TargetRegisterClass *Src2VirtRegRC = - MRI.getRegClass(Src2->getReg()); + // src2 and dst have the same physical class constraint; try to preserve + // the original src2 subclass if one were to exist. + SmallVector RewriteCandidates = {MFMA}; + SmallSetVector RewriteRegs; + + // Make sure we reassign the MFMA we found the copy from first. We want + // to ensure dst ends up in the physreg we were originally copying to. + RewriteRegs.insert(MFMADstReg); // We've found av = COPY (MFMA), and need to verify that we can trivially // rewrite src2 to use the new AGPR. 
If we can't trivially replace it, @@ -213,61 +285,71 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // first place, as well as need to assign another register, and need to // figure out where to put them. The live range splitting is smarter than // anything we're doing here, so trust it did something reasonable. - const TargetRegisterClass *Src2ExceptRC = - recomputeRegClassExceptRewritable(Src2->getReg(), Src2VirtRegRC, - VirtRegRC); - if (!Src2ExceptRC) { - LLVM_DEBUG(dbgs() << "Could not recompute the regclass\n"); + // + // Note recomputeRegClassExceptRewritable will consider the constraints of + // this MFMA's src2 as well as the src2/dst of any transitive MFMA users. + const TargetRegisterClass *DstExceptRC = + recomputeRegClassExceptRewritable(MFMADstReg, DstVirtRegRC, VirtRegRC, + RewriteCandidates, RewriteRegs); + if (!DstExceptRC) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass of " + << printReg(MFMADstReg, &TRI) << '\n'); continue; } - const TargetRegisterClass *NewSrc2ConstraintRC = - TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF); - - // Try to constrain src2 to the replacement instruction candidate's - // register class. - const TargetRegisterClass *NewSrc2RC = - TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC); - if (!NewSrc2RC) { - LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI) - << " are incompatible with replacement class\n"); - continue; + // If src2 and dst are different registers, we need to also reassign the + // input to an available AGPR if it is compatible with all other uses. + // + // If we can't reassign it, we'd need to introduce a different copy + // which is likely worse than the copy we'd be saving. + // + // It's likely that the MFMA is used in sequence with other MFMAs; if we + // cannot migrate the full use/def chain of MFMAs, we would need to + // introduce intermediate copies somewhere. 
So we only make the + // transform if all the interfering MFMAs can also be migrated. Collect + // the set of rewritable MFMAs and check if we can assign an AGPR at + // that point. + // + // If any of the MFMAs aren't reassignable, we give up and rollback to + // the original register assignments. + + using RecoloringStack = + SmallVector, 8>; + RecoloringStack TentativeReassignments; + + for (Register RewriteReg : RewriteRegs) { + LiveInterval &LI = LIS.getInterval(RewriteReg); + TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)}); + LRM.unassign(LI); } - MRI.setRegClass(VReg, AssignedRC); - MRI.setRegClass(Src2->getReg(), NewSrc2RC); - - CopySrcMI->setDesc(TII.get(AGPROp)); - - // Perform replacement of the register, rewriting the rewritable uses. - for (MachineInstr &UseMI : - make_early_inc_range(MRI.reg_instructions(CopySrcReg))) { - if (TII.isMAI(UseMI)) { - // Note the register we need to rewrite may still appear in src0/src1, - // but that's fine since those can use A or V anyway. - int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(UseMI.getOpcode()); - if (ReplacementOp != -1) - UseMI.setDesc(TII.get(ReplacementOp)); + if (!attemptReassignmentsToAGPR(RewriteRegs, PhysReg)) { + // Roll back the register assignments to the original state. + for (auto [LI, OldAssign] : TentativeReassignments) { + if (VRM.hasPhys(LI->reg())) + LRM.unassign(*LI); + LRM.assign(*LI, OldAssign); } - UseMI.substituteRegister(CopySrcReg, VReg, AMDGPU::NoSubRegister, TRI); + continue; } - LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI); - - // We left behind an identity copy, so delete it. - LIS.RemoveMachineInstrFromMaps(*DefMI); - DefMI->eraseFromParent(); - - LRM.unassign(CopySrcLI); + // Fixup the register classes of the virtual registers now that we've + // committed to the reassignments. 
+ for (Register InterferingReg : RewriteRegs) { + const TargetRegisterClass *EquivalentAGPRRegClass = + TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg)); + MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass); + } - // We don't need the liveness information anymore, so don't bother - // updating the intervals. Just delete the stale information. - // TODO: Is it worth preserving these? - LIS.removeInterval(CopySrcReg); - LIS.removeInterval(VReg); - LIS.createAndComputeVirtRegInterval(VReg); + for (MachineInstr *RewriteCandidate : RewriteCandidates) { + int NewMFMAOp = + AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode()); + RewriteCandidate->setDesc(TII.get(NewMFMAOp)); + } + // We likely left an identity copy behind after assignment; let + // VirtRegRewriter deal with it later. MadeChange = true; } } @@ -278,6 +360,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass { public: static char ID; + RegisterClassInfo RegClassInfo; AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) { initializeAMDGPURewriteAGPRCopyMFMALegacyPass( @@ -323,11 +406,13 @@ bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction( if (skipFunction(MF.getFunction())) return false; + RegClassInfo.runOnMachineFunction(MF); + auto &VRM = getAnalysis().getVRM(); auto &LRM = getAnalysis().getLRM(); auto &LIS = getAnalysis().getLIS(); - AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS); + AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, RegClassInfo); return Impl.run(MF); } @@ -337,8 +422,10 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF, VirtRegMap &VRM = MFAM.getResult(MF); LiveRegMatrix &LRM = MFAM.getResult(MF); LiveIntervals &LIS = MFAM.getResult(MF); + RegisterClassInfo RegClassInfo; + RegClassInfo.runOnMachineFunction(MF); - AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS); + AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, RegClassInfo); if 
(!Impl.run(MF)) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index f8848717808cb..dcb45cc90ca68 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -209,15 +209,14 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 @@ -549,18 +548,17 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr16_vgpr17 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_mac_e64 
$vgpr16_vgpr17, $vgpr16_vgpr17, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 @@ -636,18 +634,17 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load 
(s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 @@ -724,16 +721,15 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 killed $vgpr4_vgpr5, $vgpr8_vgpr9, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr4_vgpr5, $vgpr8_vgpr9, 
undef $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 @@ -810,16 +806,15 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: 
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_16X16X16F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_16X16X16F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $agpr2_agpr3_agpr4_agpr5, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 @@ -1038,9 +1033,8 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; CHECK-NEXT: renamable $vgpr0_vgpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $agpr0_agpr1 - ; CHECK-NEXT: early-clobber renamable 
$vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 + ; CHECK-NEXT: renamable $agpr16_agpr17 = COPY killed renamable $agpr0_agpr1 + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 @@ -1650,18 +1644,17 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr16_vgpr17 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: early-clobber renamable 
$vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable 
$vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 @@ -1824,16 +1817,15 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index b35a74e4a80c3..66002fff12155 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -78,10 +78,10 @@ bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128 - %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) - %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, 
i32 0, i32 0) + %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.1, i32 0, i32 0, i32 0) %tmp.1 = shufflevector <32 x float> %mai.2, <32 x float> %mai.1, <32 x i32> - %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %tmp.1, i32 0, i32 0, i32 0) + %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %tmp.1, i32 0, i32 0, i32 0) store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128 ret void } @@ -125,9 +125,9 @@ bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128 - %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) - %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) - %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0) + %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.1, i32 0, i32 0, i32 0) + %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.2, i32 0, i32 0, i32 0) store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128 ret void } @@ -159,9 +159,9 @@ bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128 - %mai.1 = 
call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) - %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) - %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0) + %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) + %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.1, i32 0, i32 0, i32 0) + %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.2, i32 0, i32 0, i32 0) store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128 ret void } @@ -193,9 +193,9 @@ bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128 - %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> splat (float 1.0), i32 0, i32 0, i32 0) - %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) - %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0) + %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> splat (float 1.000000e+00), i32 0, i32 0, i32 0) + %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.1, i32 0, i32 0, i32 0) + %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.2, i32 0, i32 0, i32 0) store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128 ret void } @@ 
-390,10 +390,136 @@ bb: ret void } +define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { +; CHECK-LABEL: illegal_mfma_after_rewrite: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, s0 +; CHECK-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3] +; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 +; CHECK-NEXT: s_mov_b32 s1, s0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[4:7] +; CHECK-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 +; CHECK-NEXT: s_mov_b32 s1, s0 +; CHECK-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7] +; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a3, v3 +; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; CHECK-NEXT: v_mov_b32_e32 v5, v4 +; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, v4 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17] +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[4:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29] +; CHECK-NEXT: s_nop 5 +; CHECK-NEXT: v_cvt_f16_f32_e32 v23, v14 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 
v[14:17], v[8:9], v[8:9], v[18:21] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_accvgpr_read_b32 v19, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v18, a2 +; CHECK-NEXT: v_mov_b64_e32 v[20:21], 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_accvgpr_read_b32 v17, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v16, a0 +; CHECK-NEXT: v_cvt_f16_f32_e32 v15, v22 +; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19] +; CHECK-NEXT: v_cvt_f16_f32_e32 v12, v0 +; CHECK-NEXT: global_store_short v[20:21], v23, off +; CHECK-NEXT: buffer_wbl2 sc0 sc1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_inv sc0 sc1 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7] +; CHECK-NEXT: global_store_short v[20:21], v15, off +; CHECK-NEXT: buffer_wbl2 sc0 sc1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_inv sc0 sc1 +; CHECK-NEXT: global_store_short v[20:21], v14, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v16 +; CHECK-NEXT: buffer_wbl2 sc0 sc1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_inv sc0 sc1 +; CHECK-NEXT: global_store_short v[20:21], v14, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: buffer_wbl2 sc0 sc1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_inv sc0 sc1 +; CHECK-NEXT: global_store_short v[20:21], v12, off +; CHECK-NEXT: buffer_wbl2 sc0 sc1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_inv sc0 sc1 +; CHECK-NEXT: global_store_short v[20:21], v0, off +; CHECK-NEXT: s_endpgm +entry: + %k0 = call <4 x float> asm sideeffect "; def $0", "=s"() + %i2 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %k0, i32 0, i32 0, i32 0) + %i4 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> splat (half 0xH3C00), <4 x float> %k0, i32 0, i32 0, i32 0) + %i6 = call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> splat (half 0xH7E00), <4 x float> %k0, i32 0, i32 0, i32 0) + %i5 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> splat (float 0x7FF8000000000000), i32 0, i32 0, i32 0) + %k = call <4 x float> asm sideeffect "; def $0", "=v"() + %i1 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %k, i32 0, i32 0, i32 0) + %i7 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> splat (half 0xH3C00), <4 x float> %k, i32 0, i32 0, i32 0) + %i17 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i1, i32 0, i32 0, i32 0) + %i19 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i4, i32 0, i32 0, i32 0) + %c_thread_buf.0 = extractelement <4 x float> %i19, i64 0 + %conv.0 = fptrunc float %c_thread_buf.0 to half + store half %conv.0, ptr addrspace(1) null, align 2 + fence seq_cst + %i22 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i5, i32 0, i32 0, i32 0) + %c_thread_buf.1 = extractelement <4 x float> %i22, i64 0 + %conv1 = fptrunc float %c_thread_buf.1 to half + store half %conv1, ptr addrspace(1) null, align 2 + fence seq_cst + %i23 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i6, i32 0, i32 0, i32 0) + %c_thread_buf.2 = extractelement <4 x float> %i23, i64 0 + %conv2 = fptrunc float %c_thread_buf.2 to half + store half %conv2, ptr addrspace(1) null, align 2 + fence seq_cst + %i25 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i2, i32 0, i32 0, i32 0) + %c_thread_buf.3 
= extractelement <4 x float> %i25, i64 0 + %conv3 = fptrunc float %c_thread_buf.3 to half + store half %conv3, ptr addrspace(1) null, align 2 + fence seq_cst + %i26 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i7, i32 0, i32 0, i32 0) + %i27 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> splat (half 0xH3C00), <4 x half> zeroinitializer, <4 x float> %i26, i32 0, i32 0, i32 0) + %c_thread_buf.4 = extractelement <4 x float> %i27, i64 0 + %conv4 = fptrunc float %c_thread_buf.4 to half + store half %conv4, ptr addrspace(1) null, align 2 + fence seq_cst + %i31 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> splat (half 0xH7E00), <4 x half> zeroinitializer, <4 x float> %i17, i32 0, i32 0, i32 0) + %c_thread_buf.5 = extractelement <4 x float> %i31, i64 0 + %conv5 = fptrunc float %c_thread_buf.5 to half + store half %conv5, ptr addrspace(1) null, align 2 + ret void +} + declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) #2 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) #2 declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #3 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="4,4" } -attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } -attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { mustprogress nofree norecurse nounwind willreturn "amdgpu-waves-per-eu"="8,8" } +attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }