Skip to content

Commit c4417e6

Browse files
[compiler] Fix bufferization uses of bufferization.materialize_in_destination
In a previous commit, I tried to fix a number of bufferization issues dealing with our use of `bufferization.materialize_in_destination` and `scf.if` ops. However, I discovered a better solution, which I implement in this change. The problem is that `bufferization.materialize_in_destination` is not just a tensor-land copy operation. It is meant to indicate that the target of the copy must be the buffer which will be associated with the `dest` SSA value, and it must be bufferized in-place. Bufferization will raise an error if the bufferization does not occur in-place. This is useful for indicating that e.g. the resulting bufferized IR *must* copy source data into a particular Value with important meaning (e.g. function output argument). However, we were using it in a couple places (namely convert all `tensor.cast` to `tensor.empty` + `bufferization.materialize_in_destination` ops) where the "in place" requirement is not necessary. This was causing bufferization failures in edge cases associated with `scf.if`. To fix this, we just need an alternate copy-like operation that is bufferizable and a DestinationStyleOp. Luckily, there is already `linalg.copy`, which we can use as a drop-in replacement. Then, to recover the original behavior, we just convert the `linalg.copy` to `memref.copy` operations. GitOrigin-RevId: 93ed038f47690d33633db23be5e8f70b3d89d119
1 parent a9800c7 commit c4417e6

File tree

12 files changed

+229
-384
lines changed

12 files changed

+229
-384
lines changed

mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/Passes.td

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,11 @@ def ConvertStablehloToTensorRTPass : Pass<"convert-stablehlo-to-tensorrt"> {
6565
"prefer converting to 'tensorrt.einsum' over 'tensorrt.matrix_multiply'">
6666
];
6767
}
68-
#endif // MLIR_TENSORRT_ENABLE_HLO
6968

7069
//===----------------------------------------------------------------------===//
7170
// ChloToStableHloExt
7271
//===----------------------------------------------------------------------===//
7372

74-
#ifdef MLIR_TENSORRT_ENABLE_HLO
7573
def ConvertChloToStableHloExtPass : Pass<"convert-chlo-to-stablehlo-ext"> {
7674
let summary = "Convert specific CHLO operations to stablehlo";
7775
let description = [{
@@ -89,9 +87,8 @@ def ConvertChloToStableHloExtPass : Pass<"convert-chlo-to-stablehlo-ext"> {
8987
"do not convert chlo.topk ops">,
9088
];
9189
}
92-
#endif // MLIR_TENSORRT_ENABLE_HLO
93-
9490

91+
#endif // MLIR_TENSORRT_ENABLE_HLO
9592

9693
//===----------------------------------------------------------------------===//
9794
// HostToEmitC
@@ -145,6 +142,17 @@ def ConvertTensorRTToEmitCPass : Pass<"convert-tensorrt-to-emitc",
145142
let dependentDialects = ["::mlir::emitc::EmitCDialect"];
146143
}
147144

145+
//===----------------------------------------------------------------------===//
146+
// LowerLinalgCopiesPass
147+
//===----------------------------------------------------------------------===//
148+
149+
def LowerLinalgCopiesPass : Pass<"lower-linalg-copies"> {
150+
let summary = "Lower linalg.copy to memref.copy or other operations";
151+
let description = [{
152+
This pass lowers `linalg.copy` to `memref.copy`.
153+
}];
154+
}
155+
148156
//===----------------------------------------------------------------------===//
149157
// ConvertMemRefToCUDAPass
150158
//===----------------------------------------------------------------------===//

mlir-tensorrt/compiler/lib/Conversion/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ add_subdirectory(CUDAToLLVM)
1313
add_subdirectory(HostToEmitC)
1414
add_subdirectory(HostToLLVM)
1515
add_subdirectory(LLVMCommon)
16+
add_subdirectory(LowerLinalgCopies)
1617
add_subdirectory(MemRefToCUDA)
1718
add_subdirectory(PlanToExecutor)
1819
add_subdirectory(PlanToLLVM)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
add_mlir_tensorrt_library(MLIRTensorRTLowerLinalgCopies
2+
LowerLinalgCopies.cpp
3+
4+
LINK_LIBS PUBLIC
5+
MLIRLinalgDialect
6+
MLIRMemRefDialect
7+
MLIRPass
8+
MLIRTransformUtils
9+
)
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
//===- LowerLinalgCopies.cpp ----------------------------------------------===//
2+
//
3+
// SPDX-FileCopyrightText: Copyright 2025 NVIDIA CORPORATION & AFFILIATES.
4+
// All rights reserved.
5+
// SPDX-License-Identifier: Apache-2.0
6+
//
7+
// Licensed under the Apache License, Version 2.0 (the "License");
8+
// you may not use this file except in compliance with the License.
9+
// You may obtain a copy of the License at
10+
//
11+
// http://www.apache.org/licenses/LICENSE-2.0
12+
//
13+
// Unless required by applicable law or agreed to in writing, software
14+
// distributed under the License is distributed on an "AS IS" BASIS,
15+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
// See the License for the specific language governing permissions and
17+
// limitations under the License.
18+
//
19+
//===----------------------------------------------------------------------===//
20+
///
21+
/// Implementation of `lower-linalg-copies` pass.
22+
///
23+
//===----------------------------------------------------------------------===//
24+
#include "mlir-tensorrt/Conversion/Passes.h"
25+
#include "mlir/Dialect/Linalg/IR/Linalg.h"
26+
#include "mlir/Dialect/MemRef/IR/MemRef.h"
27+
#include "mlir/IR/OperationSupport.h"
28+
#include "mlir/Transforms/DialectConversion.h"
29+
#include "mlir/Transforms/WalkPatternRewriteDriver.h"
30+
31+
namespace mlir {
32+
#define GEN_PASS_DEF_LOWERLINALGCOPIESPASS
33+
#include "mlir-tensorrt/Conversion/Passes.h.inc"
34+
} // namespace mlir
35+
36+
using namespace mlir;
37+
38+
namespace {
39+
40+
struct LowerLinalgCopyPattern : public OpRewritePattern<linalg::CopyOp> {
41+
using OpRewritePattern::OpRewritePattern;
42+
LogicalResult matchAndRewrite(linalg::CopyOp op,
43+
PatternRewriter &rewriter) const override {
44+
if (!op.hasPureBufferSemantics())
45+
return rewriter.notifyMatchFailure(op, "expected pure buffer semantics");
46+
rewriter.replaceOpWithNewOp<memref::CopyOp>(op, op.getInputs().front(),
47+
op.getOutputs().front());
48+
return success();
49+
}
50+
};
51+
52+
class LowerLinalgCopiesPass
53+
: public impl::LowerLinalgCopiesPassBase<LowerLinalgCopiesPass> {
54+
using Base::Base;
55+
56+
void runOnOperation() override {
57+
MLIRContext *ctx = &getContext();
58+
RewritePatternSet patterns(ctx);
59+
patterns.insert<LowerLinalgCopyPattern>(ctx);
60+
walkAndApplyPatterns(getOperation(), std::move(patterns));
61+
}
62+
};
63+
64+
} // namespace

mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp

Lines changed: 15 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -63,81 +63,6 @@ using bufferization::OneShotAnalysisState;
6363
using bufferization::func_ext::FuncAnalysisState;
6464
using bufferization::func_ext::FuncOpAnalysisState;
6565

66-
namespace {
67-
68-
/// Simplify a func.return operand produced by
69-
/// `materialize_in_dest(cast(materialize_in_dest(..., %alloc)), %out_arg)` so
70-
/// that only the single `materialize_in_dest` is used directly into the block
71-
/// argument.
72-
struct RemoveRedundantMaterializeInDestPattern
73-
: OpRewritePattern<bufferization::MaterializeInDestinationOp> {
74-
using OpRewritePattern::OpRewritePattern;
75-
LogicalResult matchAndRewrite(bufferization::MaterializeInDestinationOp op,
76-
PatternRewriter &rewriter) const override {
77-
if (!op->hasOneUse() || !isa<func::ReturnOp>(*op->user_begin()))
78-
return failure();
79-
80-
auto dest = dyn_cast<BlockArgument>(op.getDest());
81-
auto castOp = op.getSource().getDefiningOp<tensor::CastOp>();
82-
auto funcOp = op->getParentOfType<func::FuncOp>();
83-
if (!castOp || !dest || !funcOp ||
84-
dest.getOwner() != &funcOp.getBody().front())
85-
return failure();
86-
87-
auto producer =
88-
castOp.getSource()
89-
.getDefiningOp<bufferization::MaterializeInDestinationOp>();
90-
if (!producer || !producer->hasOneUse() ||
91-
!producer.getDest().hasOneUse() ||
92-
!producer.getDest().getDefiningOp<bufferization::AllocTensorOp>())
93-
return failure();
94-
95-
// Replace the returned value with the result of the cast.
96-
Location loc = op->getLoc();
97-
rewriter.replaceOp(op, castOp);
98-
99-
// Create a new cast on the block arg to the type of the producer alloc
100-
// result.
101-
rewriter.setInsertionPoint(producer);
102-
auto blockArgCast = rewriter.create<tensor::CastOp>(
103-
loc, producer.getDest().getType(), dest);
104-
// Update the producer materialization to materialize into the block arg.
105-
rewriter.replaceOp(producer.getDest().getDefiningOp(), blockArgCast);
106-
return success();
107-
}
108-
};
109-
110-
/// Rewrite `tensor.empty` to `bufferization.alloc_tensor` in the `device`
111-
/// memory space.
112-
struct RewriteEmptyTensor : public OpRewritePattern<tensor::EmptyOp> {
113-
using OpRewritePattern::OpRewritePattern;
114-
LogicalResult matchAndRewrite(tensor::EmptyOp op,
115-
PatternRewriter &rewriter) const override {
116-
auto memorySpace =
117-
dyn_cast_or_null<MemorySpaceAttr>(op.getType().getEncoding());
118-
if (!memorySpace)
119-
return failure();
120-
rewriter.replaceOpWithNewOp<bufferization::AllocTensorOp>(
121-
op, op.getType(), op.getDynamicSizes(),
122-
/*copy=*/Value{}, /*size_hint=*/Value{}, memorySpace);
123-
return success();
124-
}
125-
};
126-
127-
/// Drop `bufferization.alloc_tensor` operations that do not have uses.
128-
struct CleanupAllocTensorOps
129-
: public OpRewritePattern<bufferization::AllocTensorOp> {
130-
using OpRewritePattern::OpRewritePattern;
131-
LogicalResult matchAndRewrite(bufferization::AllocTensorOp op,
132-
PatternRewriter &rewriter) const override {
133-
if (!op->use_empty())
134-
return failure();
135-
rewriter.eraseOp(op);
136-
return success();
137-
}
138-
};
139-
} // namespace
140-
14166
/// Creates a DPS argument of type `argType` in the first block of `func` by
14267
/// appending to the end of current arguments. It then updates the function
14368
/// type, adds a `executor.result_arg` argument attribute to the new arg, and
@@ -701,7 +626,12 @@ static void uniqueEmptyTensorUses(RewriterBase &rewriter, ModuleLikeOp op) {
701626
return WalkResult::advance();
702627
if (nestedOp->hasOneUse())
703628
return WalkResult::advance();
629+
unsigned firstUse = true;
704630
for (OpOperand &use : llvm::make_early_inc_range(emptyOp->getUses())) {
631+
if (firstUse) {
632+
firstUse = false;
633+
continue;
634+
}
705635
rewriter.setInsertionPoint(use.getOwner());
706636
auto clonedOp = cast<tensor::EmptyOp>(rewriter.clone(*emptyOp));
707637
use.assign(clonedOp);
@@ -748,22 +678,16 @@ class AllocTensorsPass
748678
return signalPassFailure();
749679
}
750680

751-
// Eliminate any straggling `tensor.empty` operations. Only run this on
752-
// functions in the host module.
753-
{
754-
FrozenRewritePatternSet patterns = [&]() {
755-
RewritePatternSet patterns_(ctx);
756-
patterns_.insert<RewriteEmptyTensor, CleanupAllocTensorOps,
757-
RemoveRedundantMaterializeInDestPattern>(ctx);
758-
return patterns_;
759-
}();
760-
for (FunctionOpInterface func : op.getOps<FunctionOpInterface>()) {
761-
if (failed(applyPatternsGreedily(func, patterns))) {
762-
op->emitError() << "failed to run " << getArgument() << " patterns";
763-
return signalPassFailure();
764-
}
765-
}
766-
}
681+
// Remove leftover empty tensors.
682+
op->walk<WalkOrder::PreOrder>([&](Operation *nestedOp) {
683+
if (ModuleLikeOp(nestedOp) && nestedOp != op)
684+
return WalkResult::skip();
685+
auto emptyOp = dyn_cast<tensor::EmptyOp>(nestedOp);
686+
if (!emptyOp || !emptyOp.use_empty())
687+
return WalkResult::advance();
688+
rewriter.eraseOp(emptyOp);
689+
return WalkResult::skip();
690+
});
767691
}
768692
};
769693
} // namespace

mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ add_mlir_tensorrt_library(MLIRTensorRTPlanTransforms
2828

2929
LINK_LIBS PUBLIC
3030

31+
MLIRBufferizationDialect
3132
MLIRBufferizationPipelines
33+
MLIRBufferizationToMemRef
34+
MLIRBufferizationTransforms
3235
MLIRExecutorGenericClustering
3336
MLIRFuncTransforms
3437
MLIRIR
@@ -37,21 +40,19 @@ add_mlir_tensorrt_library(MLIRTensorRTPlanTransforms
3740
MLIRSCFDialect
3841
MLIRTensorDialect
3942
MLIRTensorRTAnalysis
43+
MLIRTensorRTBufferizationScopeInterface
4044
MLIRTensorRTCUDADialect
4145
MLIRTensorRTDialect
4246
MLIRTensorRTDuplicateFunctionElimination
4347
MLIRTensorRTExecutorDialect
44-
MLIRBufferizationDialect
45-
MLIRBufferizationTransforms
48+
MLIRTensorRTLowerLinalgCopies
4649
MLIRTensorRTMemRefCastElimination
4750
MLIRTensorRTPlanAnalysis
4851
MLIRTensorRTPlanDialect
4952
MLIRTensorRTStableHloExtTransforms
5053
MLIRTensorRTStablehloScalarToArith
5154
MLIRTensorRTStablehloToTensorRT
5255
MLIRTensorRTTensorRTRuntimeDialect
53-
MLIRTensorRTBufferizationScopeInterface
54-
MLIRBufferizationToMemRef
5556
MLIRTransforms
5657
StablehloOps
5758
)

0 commit comments

Comments
 (0)