diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index 8b1d4ba68a444..aaecfa1e33f06 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -14,6 +14,10 @@ /// MFMA opcode. /// /// TODO: +/// - Handle rewrites of phis. This must be more careful than normal about the +/// reassignment. We do not want to introduce an AGPR-to-AGPR copy inside of a +/// loop, so it depends on the exact assignment of the copy. +/// /// - Update LiveIntervals incrementally instead of recomputing from scratch /// //===----------------------------------------------------------------------===// @@ -60,6 +64,25 @@ class AMDGPURewriteAGPRCopyMFMAImpl { return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1; } + /// Find AV_* registers assigned to AGPRs (or virtual registers which were + /// already required to be AGPR). + /// + /// \return the physical register that \p VReg is assigned to if it + /// is an AGPR, otherwise MCRegister(). + MCRegister getAssignedAGPR(Register VReg) const { + MCRegister PhysReg = VRM.getPhys(VReg); + if (!PhysReg) + return MCRegister(); + + // If this is an AV register, we have to check if the actual assignment is + // to an AGPR. + const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg); + return TRI.isAGPRClass(AssignedRC) ? PhysReg : MCRegister(); + } + + bool tryReassigningMFMAChain(MachineInstr &MFMA, Register MFMAHintReg, + MCPhysReg PhysRegHint) const; + /// Compute the register class constraints based on the uses of \p Reg, /// excluding MFMA uses which can be rewritten to change the register /// class constraint. This should be nearly identical to @@ -74,6 +97,8 @@ class AMDGPURewriteAGPRCopyMFMAImpl { Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates, SmallSetVector<Register, 4> &RewriteRegs) const; + bool tryFoldCopiesToAGPR(Register VReg, MCRegister AssignedAGPR) const; + bool tryFoldCopiesFromAGPR(Register VReg, MCRegister AssignedAGPR) const; bool run(MachineFunction &MF) const; }; @@ -154,6 +179,87 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( return true; } +bool AMDGPURewriteAGPRCopyMFMAImpl::tryReassigningMFMAChain( + MachineInstr &MFMA, Register MFMAHintReg, MCPhysReg PhysRegHint) const { + // src2 and dst have the same physical class constraint; try to preserve + // the original src2 subclass if one were to exist. + SmallVector<MachineInstr *, 4> RewriteCandidates = {&MFMA}; + SmallSetVector<Register, 4> RewriteRegs; + + // Make sure we reassign the MFMA we found the copy from first. We want + // to ensure dst ends up in the physreg we were originally copying to. + RewriteRegs.insert(MFMAHintReg); + + // We've found av = COPY (MFMA) (or MFMA (v = COPY av)) and need to verify + // that we can trivially rewrite src2 to use the new AGPR. If we can't + // trivially replace it, we're going to induce as many copies as we would have + // emitted in the first place, as well as need to assign another register, and + // need to figure out where to put them. The live range splitting is smarter + // than anything we're doing here, so trust it did something reasonable. + // + // Note recomputeRegClassExceptRewritable will consider the constraints of + // this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
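+  //
+  // Illustrative shape of what this chain walk is trying to achieve (a
+  // sketch, not lifted from an actual test; register classes and opcode
+  // names are representative):
+  //
+  //   %dst:vreg_128_align2 = V_MFMA_..._vgprcd_e64 %a, %b, %src2:vreg_128_align2
+  //   %av:av_128_align2 = COPY %dst      ; %av was assigned an AGPR
+  //
+  // becomes, once dst/src2 (and any chained MFMAs) are recolored to AGPRs and
+  // the opcode is switched to the AGPR-dst form:
+  //
+  //   %dst:areg_128_align2 = V_MFMA_..._e64 %a, %b, %src2:areg_128_align2
+  //
+  // leaving the COPY as an identity copy for VirtRegRewriter to clean up.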
+ if (!recomputeRegClassExceptRewritable(MFMAHintReg, RewriteCandidates, + RewriteRegs)) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg " + << printReg(MFMAHintReg, &TRI) << '\n'); + return false; + } + + // If src2 and dst are different registers, we need to also reassign the + // input to an available AGPR if it is compatible with all other uses. + // + // If we can't reassign it, we'd need to introduce a different copy + // which is likely worse than the copy we'd be saving. + // + // It's likely that the MFMA is used in sequence with other MFMAs; if we + // cannot migrate the full use/def chain of MFMAs, we would need to + // introduce intermediate copies somewhere. So we only make the + // transform if all the interfering MFMAs can also be migrated. Collect + // the set of rewritable MFMAs and check if we can assign an AGPR at + // that point. + // + // If any of the MFMAs aren't reassignable, we give up and roll back to + // the original register assignments. + + using RecoloringStack = + SmallVector<std::pair<LiveInterval *, MCRegister>, 8>; + RecoloringStack TentativeReassignments; + + for (Register RewriteReg : RewriteRegs) { + LiveInterval &LI = LIS.getInterval(RewriteReg); + TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)}); + LRM.unassign(LI); + } + + if (!attemptReassignmentsToAGPR(RewriteRegs, PhysRegHint)) { + // Roll back the register assignments to the original state. + for (auto [LI, OldAssign] : TentativeReassignments) { + if (VRM.hasPhys(LI->reg())) + LRM.unassign(*LI); + LRM.assign(*LI, OldAssign); + } + + return false; + } + + // Fixup the register classes of the virtual registers now that we've + // committed to the reassignments. + for (Register InterferingReg : RewriteRegs) { + const TargetRegisterClass *EquivalentAGPRRegClass = + TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg)); + MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass); + } + + for (MachineInstr *RewriteCandidate : RewriteCandidates) { + int NewMFMAOp = + AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode()); + RewriteCandidate->setDesc(TII.get(NewMFMAOp)); + } + + return true; +} + /// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a /// preference to use \p PhysReg first. Returns false if the reassignments /// cannot be trivially performed. @@ -206,6 +312,77 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::attemptReassignmentsToAGPR( return true; } +/// Identify copies that look like: +/// %vdst:vgpr = V_MFMA_.. %src0:av, %src1:av, %src2:vgpr +/// %agpr = COPY %vgpr +/// +/// Then try to replace the transitive uses of %src2 and %vdst with the AGPR +/// versions of the MFMA. This should cover the common case. +bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesToAGPR( + Register VReg, MCRegister AssignedAGPR) const { + bool MadeChange = false; + for (MachineInstr &UseMI : MRI.def_instructions(VReg)) { + if (!UseMI.isCopy()) + continue; + + Register CopySrcReg = UseMI.getOperand(1).getReg(); + if (!CopySrcReg.isVirtual()) + continue; + + // TODO: Handle loop phis copied to AGPR. e.g. + // + // loop: + // %phi:vgpr = COPY %mfma:vgpr + // %mfma:vgpr = V_MFMA_xxx_vgprcd_e64 %a, %b, %phi + // s_cbranch_vccnz loop + // + // endloop: + // %agpr = mfma + // + // We need to be sure that %phi is assigned to the same physical register as + // %mfma, or else we will just be moving copies into the loop.
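+    //
+    // The simple non-phi shape handled below looks roughly like this
+    // (illustrative only):
+    //
+    //   %mfma:vgpr = V_MFMA_xxx_vgprcd_e64 %a, %b, %src2
+    //   %av:av_*   = COPY %mfma          ; %av is the AGPR-assigned VReg
+    //
+    // We walk from the AGPR-assigned copy destination back to each MFMA that
+    // defines the copy source and hand it to tryReassigningMFMAChain, hinting
+    // the AGPR the copy was assigned so dst ends up in the physreg we were
+    // copying into anyway.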
+ + for (MachineInstr &CopySrcDefMI : MRI.def_instructions(CopySrcReg)) { + if (isRewriteCandidate(CopySrcDefMI) && + tryReassigningMFMAChain( + CopySrcDefMI, CopySrcDefMI.getOperand(0).getReg(), AssignedAGPR)) + MadeChange = true; + } + } + + return MadeChange; +} + +/// Identify copies that look like: +/// %src:vgpr = COPY %src:agpr +/// %vdst:vgpr = V_MFMA_... %src0:av, %src1:av, %src:vgpr +/// +/// Then try to replace the transitive uses of %src2 and %vdst with the AGPR +/// versions of the MFMA. This should cover rarer cases, and will generally be +/// redundant with tryFoldCopiesToAGPR. +bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesFromAGPR( + Register VReg, MCRegister AssignedAGPR) const { + bool MadeChange = false; + for (MachineInstr &UseMI : MRI.use_instructions(VReg)) { + if (!UseMI.isCopy()) + continue; + + Register CopyDstReg = UseMI.getOperand(0).getReg(); + if (!CopyDstReg.isVirtual()) + continue; + + for (MachineInstr &CopyUseMI : MRI.use_instructions(CopyDstReg)) { + if (isRewriteCandidate(CopyUseMI)) { + if (tryReassigningMFMAChain(CopyUseMI, CopyDstReg, + VRM.getPhys(CopyDstReg))) + MadeChange = true; + } + } + } + + return MadeChange; +} + bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // This only applies on subtargets that have a configurable AGPR vs. VGPR // allocation. @@ -222,124 +399,14 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { Register VReg = Register::index2VirtReg(I); - Register PhysReg = VRM.getPhys(VReg); - if (!PhysReg) + MCRegister AssignedAGPR = getAssignedAGPR(VReg); + if (!AssignedAGPR) continue; - // Find AV_* registers assigned to AGPRs. - const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg); - if (!TRI.hasAGPRs(VirtRegRC)) - continue; - - const TargetRegisterClass *AssignedRC = VirtRegRC; - if (TRI.hasVGPRs(VirtRegRC)) { - // If this is an AV register, we have to check if the actual assignment is - // to an AGPR - AssignedRC = TRI.getPhysRegBaseClass(PhysReg); - if (!TRI.isAGPRClass(AssignedRC)) - continue; - } - - LiveInterval &LI = LIS.getInterval(VReg); - - for (VNInfo *VNI : LI.vnis()) { - if (VNI->isPHIDef() || VNI->isUnused()) - continue; - - MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); - if (!DefMI || !DefMI->isCopy()) - continue; - - Register MFMADstReg = DefMI->getOperand(1).getReg(); - if (!MFMADstReg.isVirtual()) - continue; - - LiveInterval &CopySrcLI = LIS.getInterval(MFMADstReg); - LiveQueryResult LRQ = CopySrcLI.Query(VNI->def.getRegSlot()); - MachineInstr *MFMA = LIS.getInstructionFromIndex(LRQ.valueIn()->def); - if (!MFMA || !isRewriteCandidate(*MFMA)) - continue; - - // src2 and dst have the same physical class constraint; try to preserve - // the original src2 subclass if one were to exist. - SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA}; - SmallSetVector<Register, 4> RewriteRegs; - - // Make sure we reassign the MFMA we found the copy from first. We want - // to ensure dst ends up in the physreg we were originally copying to. - RewriteRegs.insert(MFMADstReg); - - // We've found av = COPY (MFMA), and need to verify that we can trivially - // rewrite src2 to use the new AGPR. If we can't trivially replace it, - // we're going to induce as many copies as we would have emitted in the - // first place, as well as need to assign another register, and need to - // figure out where to put them. The live range splitting is smarter than - // anything we're doing here, so trust it did something reasonable.
- // - // Note recomputeRegClassExceptRewritable will consider the constraints of - // this MFMA's src2 as well as the src2/dst of any transitive MFMA users. - if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates, - RewriteRegs)) { - LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg " - << printReg(MFMADstReg, &TRI) << '\n'); - continue; - } - - // If src2 and dst are different registers, we need to also reassign the - // input to an available AGPR if it is compatible with all other uses. - // - // If we can't reassign it, we'd need to introduce a different copy - // which is likely worse than the copy we'd be saving. - // - // It's likely that the MFMA is used in sequence with other MFMAs; if we - // cannot migrate the full use/def chain of MFMAs, we would need to - // introduce intermediate copies somewhere. So we only make the - // transform if all the interfering MFMAs can also be migrated. Collect - // the set of rewritable MFMAs and check if we can assign an AGPR at - // that point. - // - // If any of the MFMAs aren't reassignable, we give up and rollback to - // the original register assignments. - - using RecoloringStack = - SmallVector<std::pair<LiveInterval *, MCRegister>, 8>; - RecoloringStack TentativeReassignments; - - for (Register RewriteReg : RewriteRegs) { - LiveInterval &LI = LIS.getInterval(RewriteReg); - TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)}); - LRM.unassign(LI); - } - - if (!attemptReassignmentsToAGPR(RewriteRegs, PhysReg)) { - // Roll back the register assignments to the original state. - for (auto [LI, OldAssign] : TentativeReassignments) { - if (VRM.hasPhys(LI->reg())) - LRM.unassign(*LI); - LRM.assign(*LI, OldAssign); - } - - continue; - } - - // Fixup the register classes of the virtual registers now that we've - // committed to the reassignments. - for (Register InterferingReg : RewriteRegs) { - const TargetRegisterClass *EquivalentAGPRRegClass = - TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg)); - MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass); - } - - for (MachineInstr *RewriteCandidate : RewriteCandidates) { - int NewMFMAOp = - AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode()); - RewriteCandidate->setDesc(TII.get(NewMFMAOp)); - } - - // We likely left an identity copy behind after assignment; let - // VirtRegRewriter deal with it later.
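+    // Per AGPR-assigned virtual register we try both folding directions
+    // (illustrative shapes only, not taken from the tests below):
+    //
+    //   tryFoldCopiesToAGPR:   %av = COPY %mfma_dst    ; MFMA result copied into
+    //                                                  ; an AGPR-assigned reg
+    //   tryFoldCopiesFromAGPR: %v = COPY %av           ; AGPR-assigned reg copied
+    //                          ... = V_MFMA ..., %v    ; back into an MFMA src2
+    //
+    // Either way the goal is to recolor the src2/dst chain to AGPRs and switch
+    // the MFMAs to their AGPR-dst forms so the copies stop forcing accumulator
+    // values through VGPRs.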
+ if (tryFoldCopiesToAGPR(VReg, AssignedAGPR)) + MadeChange = true; + if (tryFoldCopiesFromAGPR(VReg, AssignedAGPR)) MadeChange = true; - } } return MadeChange; diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir index 6dcfda3117234..1c5e0e362e359 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir @@ -69,9 +69,9 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 %1:av_64_align2 = COPY $vgpr0_vgpr1 @@ -97,8 +97,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -126,10 +126,10 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub1 - ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_64_align2 = COPY 
[[GLOBAL_LOAD_DWORDX2_]].sub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub1 + ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[COPY3]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 %1:av_64_align2 = COPY $vgpr0_vgpr1 @@ -200,62 +200,3 @@ body: | GLOBAL_STORE_DWORDX4 %0, %4, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... - -# Degenerate case. Copy from AGPR to VGPR is dead undef subreg def ---- -name: test_rewrite_mfma_copy_from_agpr_undef_vdst_subreg_use_imm_src2 -tracksRegLiveness: true -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - - ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_undef_vdst_subreg_use_imm_src2 - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: dead [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: SI_RETURN - %0:vreg_64_align2 = COPY $vgpr4_vgpr5 - %1:av_64_align2 = COPY $vgpr0_vgpr1 - %2:av_64_align2 = COPY $vgpr2_vgpr3 - %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) - %4:vreg_128_align2 = COPY %3 - undef %4.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, 0, 0, 0, 0, implicit $mode, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %4, 0, 0, implicit $exec :: (store (s128), addrspace 1) - SI_RETURN -... - -# Degenerate case. Copy from AGPR to VGPR is dead, but same register -# is redefined as whole register. 
---- -name: test_rewrite_mfma_copy_from_agpr_to_vdst_def_imm_src2 -tracksRegLiveness: true -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - - ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_to_vdst_def_imm_src2 - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: dead [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1) - ; CHECK-NEXT: SI_RETURN - %0:vreg_64_align2 = COPY $vgpr4_vgpr5 - %1:av_64_align2 = COPY $vgpr0_vgpr1 - %2:av_64_align2 = COPY $vgpr2_vgpr3 - %3:areg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1) - %4:vreg_64_align2 = COPY %3 - %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, 0, 0, 0, 0, implicit $mode, implicit $exec - GLOBAL_STORE_DWORDX2 %0, %4, 0, 0, implicit $exec :: (store (s64), addrspace 1) - SI_RETURN -... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir index e22775eecca24..72f6cb32d1b66 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir @@ -305,14 +305,14 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub0_sub1:vreg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead %other_use0:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub0_sub1 - ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2_sub3 - ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub1_sub2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub1_sub2 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) + ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit 
$mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub0_sub1:areg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %other_use0:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1 + ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 + ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 81613f69c982b..343a5c8511ee9 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -598,9 +598,11 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_accvgpr_write_b32 a29, v61 ; CHECK-NEXT: v_accvgpr_write_b32 a30, v62 ; CHECK-NEXT: v_accvgpr_write_b32 a31, v63 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a32 ; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000 +; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000 +; CHECK-NEXT: v_accvgpr_read_b32 v32, a32 ; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32 +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31] ; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 @@ -611,9 +613,12 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 ; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000 -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 @@ -637,19 +642,10 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 ; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 ; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 ; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 ; CHECK-NEXT: v_accvgpr_read_b32 v30, a30 ; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 ; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 @@ -678,58 +674,26 @@ define 
amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_chain(p ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v34, 4.0 -; CHECK-NEXT: v_accvgpr_read_b32 v33, a31 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v31, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v30, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v23, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v22, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v21, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v20, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v19, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v18, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v1, v34, v[2:33] +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v34, a[0:31] ; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000 ; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v1, v34, v[2:33] +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v34, a[0:31] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112 -; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96 -; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80 -; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64 -; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; CHECK-NEXT: s_endpgm %src2 = call <32 x float> asm sideeffect "; def $0", "=a"() %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0) @@ -749,15 +713,12 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64(double %arg0, d ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: 
v_and_b32_e32 v8, 0x3ff, v31 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a0 -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v8 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3] ; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %src2 = call double asm sideeffect "; def $0", "=a"() @@ -776,18 +737,15 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a0 -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[10:11] +; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] ; CHECK-NEXT: v_and_b32_e32 v2, 0x3ff, v31 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3] -; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[4:5], v[6:7], v[0:1] +; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1] ; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %src2 = call double asm sideeffect "; def $0", "=a"() @@ -807,32 +765,16 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg( ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v18, 4.0 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 v[2:17], v1, v18, v[2:17] +; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v18, a[0:15] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; CHECK-NEXT: s_endpgm %def = call <32 x float> asm sideeffect "; def $0", "=a"() %src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32> @@ -851,32 +793,32 @@ define 
amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg_ ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v18, 4.0 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a1 +; CHECK-NEXT: v_accvgpr_mov_b32 a17, a16 +; CHECK-NEXT: v_accvgpr_mov_b32 a16, a15 +; CHECK-NEXT: v_accvgpr_mov_b32 a15, a14 +; CHECK-NEXT: v_accvgpr_mov_b32 a14, a13 +; CHECK-NEXT: v_accvgpr_mov_b32 a13, a12 +; CHECK-NEXT: v_accvgpr_mov_b32 a12, a11 +; CHECK-NEXT: v_accvgpr_mov_b32 a11, a10 +; CHECK-NEXT: v_accvgpr_mov_b32 a10, a9 +; CHECK-NEXT: v_accvgpr_mov_b32 a9, a8 +; CHECK-NEXT: v_accvgpr_mov_b32 a8, a7 +; CHECK-NEXT: v_accvgpr_mov_b32 a7, a6 +; CHECK-NEXT: v_accvgpr_mov_b32 a6, a5 +; CHECK-NEXT: v_accvgpr_mov_b32 a5, a4 +; CHECK-NEXT: v_accvgpr_mov_b32 a4, a3 +; CHECK-NEXT: v_accvgpr_mov_b32 a3, a2 +; CHECK-NEXT: v_accvgpr_mov_b32 a2, a1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 v[2:17], v1, v18, v[2:17] +; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v1, v18, a[2:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1] ; CHECK-NEXT: s_endpgm %def = call <32 x float> asm sideeffect "; def $0", "=a"() %src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32>