Use block loads for post-dpas vector computation 4/4

alexbaden · alexbaden · commit a312d98ea774 · 2024-12-13T01:14:41.000Z
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -558,6 +558,12 @@ struct LoadOpConversion
         delinearize(rewriter, loc, warpId, warpsPerCTA, dpasOrder);
 
     if (hasDpasLayout) {
+      // A block load with the DPAS layout but without the DotDpasLayout is
+      // expected to follow the ordering of the DPAS output. For a 2D block
+      // load, the rows are distributed across work items/SIMD lanes and the
+      // column vectors are available for each work item to process. This
+      // matches the DPAS layout, because the DPAS operation output also
+      // distributes rows across work items.
       if (isTransposeRequired) {
         // TODO: this would likely require a shuffle to match the expected
         // ordering coming out of the DPAS layout and requires more