NVIDIA
diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/StableHloToExecutable.h
Lines changed: 4 additions & 0 deletions b/mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/StableHloToExecutable.h
Lines changed: 4 additions & 0 deletions
diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/IR/PlanOps.td
Lines changed: 103 additions & 24 deletions b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/IR/PlanOps.td
Lines changed: 103 additions & 24 deletions
diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.h
Lines changed: 2 additions & 1 deletion b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.h
Lines changed: 2 additions & 1 deletion
diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td
Lines changed: 14 additions & 1 deletion b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td
Lines changed: 14 additions & 1 deletion
diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/IR/PlanOps.cpp
Lines changed: 88 additions & 28 deletions b/mlir-tensorrt/compiler/lib/Dialect/Plan/IR/PlanOps.cpp
Lines changed: 88 additions & 28 deletions
diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp
Lines changed: 6 additions & 4 deletions b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp
Lines changed: 6 additions & 4 deletions
@@ -128,6 +128,10 @@ struct StableHLOToExecutableOptions : public mlir::OptionsContext {
   /// Whether to disallow host tensors in TensorRT clusters.
   bool disallowHostTensorsInTensorRTClusters = false;
 
+  /// Use non-DPS style calling convention for entrypoint function
+  /// and backend types that support allocating results.
+  bool enableNonDPSReturns = false;
+
   /// Entrypoint function name.
   std::string entrypoint = "main";
 
 
@@ -131,11 +131,41 @@ def Plan_InlineGroupOp : Plan_GroupOpBase<"inline_group", [
 }
 
 //===----------------------------------------------------------------------===//
-// InlineClosedGroupOp
+// Plan_InlineClosedGroupBase
 //===----------------------------------------------------------------------===//
 
-def Plan_InlineClosedGroupOp : Plan_GroupOpBase<"inline_closed_group", [
-  IsolatedFromAbove,
+// Base class for the "closed" inline group ops. It appends `IsolatedFromAbove`
+// to the caller-supplied traits and exposes the input-related helper methods
+// below through `baseInlineClosedExtraClassDeclaration`.
+class Plan_InlineClosedGroupBase<string mnemonic, list<Trait> traits = []> :
+    Plan_GroupOpBase<mnemonic, traits # [IsolatedFromAbove]> {
+
+  code baseInlineClosedExtraClassDeclaration = baseExtraClassDeclaration # [{
+    // Common methods for both DPS and non-DPS versions
+
+    /// Returns true if the `inputIdx`-th input operand has a ranked tensor
+    /// type. Asserts that `inputIdx` is in bounds.
+    bool argHasTensorType(unsigned inputIdx) {
+      assert(inputIdx < getInputs().size() && "input index out-of-bounds");
+      return isa<RankedTensorType>(getInputs()[inputIdx].getType());
+    }
+
+    /// Returns the `inputIdx`-th entry of `input_attrs` as a BoundsAttr.
+    /// Asserts that `inputIdx` is in bounds of the input operands.
+    BoundsAttr getInputBoundsAttr(unsigned inputIdx) {
+      assert(inputIdx < getInputs().size() && "input index out-of-bounds");
+      return cast<BoundsAttr>(getInputAttrs()[inputIdx]);
+    }
+
+    /// Populate the `input_attrs` from an array of BoundsAttrs.
+    void setInputAttrsAttr(ArrayRef<BoundsAttr> boundsAttrs) {
+      setInputAttrsAttr(::mlir::ArrayAttr::get(
+        getOperation()->getContext(),
+        ArrayRef<Attribute>(boundsAttrs.begin(), boundsAttrs.end())
+      ));
+    }
+  }];
+
+  // Default declaration. A derived op that sets its own
+  // `extraClassDeclaration` replaces this value, so it should start from
+  // `baseInlineClosedExtraClassDeclaration` to keep the helpers above.
+  let extraClassDeclaration = baseInlineClosedExtraClassDeclaration;
+}
+
+//===----------------------------------------------------------------------===//
+// Plan_InlineClosedGroupOp
+//===----------------------------------------------------------------------===//
+
+def Plan_InlineClosedGroupOp : Plan_InlineClosedGroupBase<"inline_closed_group", [
   AttrSizedOperandSegments,
   DestinationStyleOpInterface,
   SingleBlockImplicitTerminator<"plan::YieldOp">,
@@ -226,24 +256,12 @@ def Plan_InlineClosedGroupOp : Plan_GroupOpBase<"inline_closed_group", [
                    CArg<"ArrayRef<BoundsAttr>", "{}">:$res_attrs)>
   ];
 
-  let extraClassDeclaration = baseExtraClassDeclaration # [{
+  let extraClassDeclaration = baseInlineClosedExtraClassDeclaration # [{
 
     MutableOperandRange getDpsInitsMutable() {
       return getOutsMutable();
     }
 
-    /// Returns true if the `i-th` input argument has a tensor type.
-    bool argHasTensorType(unsigned inputIdx) {
-      assert(inputIdx < getInputs().size() && "input index out-of-bounds");
-      return isa<RankedTensorType>(getInputs()[inputIdx].getType());
-    }
-
-    /// Returns the i-th input argument's bounds attribute.
-    BoundsAttr getInputBoundsAttr(unsigned inputIdx) {
-      assert(inputIdx < getInputs().size() && "input index out-of-bounds");
-      return cast<BoundsAttr>(getInputAttrs()[inputIdx]);
-    }
-
     ArrayRef<BlockArgument> getRegionOutArgs() {
       return getBody().getArguments().take_back(getOuts().size());
     }
@@ -255,16 +273,77 @@ def Plan_InlineClosedGroupOp : Plan_GroupOpBase<"inline_closed_group", [
         ArrayRef<Attribute>(boundsAttrs.begin(), boundsAttrs.end())
       ));
     }
+  }];
+}
 
-    /// Populate the `input_attrs` from an array of BoundsAttrs.
-    void setInputAttrsAttr(ArrayRef<BoundsAttr> boundsAttrs) {
-      setInputAttrsAttr(::mlir::ArrayAttr::get(
-        getOperation()->getContext(),
-        ArrayRef<Attribute>(boundsAttrs.begin(), boundsAttrs.end())
-      ));
-    }
+//===----------------------------------------------------------------------===//
+// InlineClosedAllocGroupOp
+//===----------------------------------------------------------------------===//
+
+// Non-DPS variant of `plan.inline_closed_group`. `IsolatedFromAbove` is not
+// listed here because `Plan_InlineClosedGroupBase` already appends it to the
+// trait list.
+def Plan_InlineClosedAllocGroupOp : Plan_InlineClosedGroupBase<"inline_closed_alloc_group", [
+  SingleBlockImplicitTerminator<"plan::YieldOp">,
+  DeclareOpInterfaceMethods<RegionBranchOpInterface,
+    ["getEntrySuccessorOperands"]>,
+  DeclareOpInterfaceMethods<OpAsmOpInterface,
+    ["getAsmBlockArgumentNames"]>
+]> {
+  let description = [{
+  The `plan.inline_closed_alloc_group` operation is a variant of the
+  `plan.inline_closed_group` operation that does not use destination-passing style
+  (DPS). It is isolated from above and explicitly captures input operands, but unlike
+  its DPS counterpart, it does not capture destination operands because its results must
+  be lowered to allocation(s). The allocations may or may not be of a size that can only
+  be computed inside of the region.
+
+  This operation takes input operands and their corresponding bounds attributes,
+  and produces results. The `input_attrs` hold bounds attribute information for
+  the input operands. The absence of bounds information is allowed (`none` bounds).
+
+  The `target` attribute specifies the execution target for the group.
+
+  #### Example
+
+  Consider the following simple program containing operations with dynamically shaped operands:
+
+  ```mlir
+  %0 = ... : tensor<?xf32> // A dynamically shaped operand
+  %1 = ... : index         // A dynamic calculation of %0's extent
+
+  %2 = plan.inline_closed_alloc_group target(#plan.cluster_target<tensorrt>)
+    inputs(%0, %1 : tensor<?xf32>, index)
+    in_attrs [#plan.bounds<shape, [10], [20]>, #plan.bounds<none>] -> tensor<?xf32> {
+    %3 = plan.with_shape %0 (%1) : (tensor<?xf32>, index) -> tensor<?xf32>
+    %4 = stablehlo.exponential %3 : tensor<?xf32>
+    yield %4 : tensor<?xf32>
+  }
+  ```
 
   }];
+  let arguments = (ins Variadic<AnyTypeOf<[AnyRankedTensor, AnySignlessIntegerOrIndex]>>:$inputs,
+                       BoundsAttrArray:$input_attrs,
+                       AnyAttr:$target);
+
+  let results = (outs Variadic<AnyTypeOf<[AnyRankedTensor]>>:$results);
+
+  let assemblyFormat = [{
+    `target` `(` $target `)` `\n`
+    `inputs` `(` ( $inputs^ `:` type($inputs) `)` ) : ( `)` ) ?  `\n`
+    `in_attrs` $input_attrs `\n`
+     attr-dict-with-keyword `->` type($results)
+     $body
+  }];
+
+  let hasVerifier = 1;
+
+  let skipDefaultBuilders = 1;
+
+  let builders = [
+    OpBuilder<(ins "TypeRange":$results,
+                   "Attribute":$target,
+                   "ValueRange":$inputs,
+                   CArg<"ArrayRef<BoundsAttr>", "{}">:$input_attrs)>,
+  ];
+
+  let extraClassDeclaration = baseInlineClosedExtraClassDeclaration;
 }
 
 //===----------------------------------------------------------------------===//
@@ -276,7 +355,7 @@ def Plan_YieldOp : Plan_Op<"yield", [
       Terminator,
       ReturnLike,
       ParentOneOf<["plan::InlineGroupOp",
-                   "plan::InlineClosedGroupOp"]>]> {
+                   "plan::InlineClosedGroupOp", "plan::InlineClosedAllocGroupOp"]>]> {
 
   let arguments = (ins Variadic<AnyType>:$results);
 
 
@@ -69,7 +69,8 @@ executorOneShotModuleBufferize(ModuleOp targetOp,
                                const ExecutorBufferizationOptions &options);
 
 /// Build a pipeline (targeting ModuleOp) for bufferization.
-void buildPlanBufferizationPipeline(OpPassManager &pm);
+void buildPlanBufferizationPipeline(
+    OpPassManager &pm, const plan::PlanAllocTensorsPassOptions &options);
 
 /// Build a post-bufferization pipeline that performs optimizations on memrefs.
 void buildPlanBufferOptimizationPipeline(OpPassManager &pm);
 
@@ -248,6 +248,9 @@ def StablehloClusteringPass : Pass<"stablehlo-clustering", "::mlir::ModuleOp"> {
     Option<"entrypoint", "entrypoint", "std::string", "\"\"",
       "the name of the entrypoint function; if empty then the clustering runs"
       " on all functions">,
+    Option<"enableNonDPSReturns",
+      "enable-non-dps-returns", "bool", "false",
+      "allow backend clusters to directly allocate outputs">,      
     Option<"disallowHostTensorsInTensorRTClusters",
       "disallow-host-tensors-in-tensorrt-clusters", "bool", "false",
       "don't cluster host tensors in TensorRT clusters">,
@@ -332,7 +335,10 @@ def CreateClosedRegionsPass : Pass<"plan-create-closed-regions", "::mlir::Module
     Option<"testPreWalkOrder", "test-pre-walk-order", "bool", "false",
       "(used only in testing) specifies to outline regions by walking in "
       " pre-order; used for verifying results are not sensitive "
-      "to traversal order">
+      "to traversal order">,
+    Option<"enableNonDPSReturns", "enable-non-dps-returns", "bool", 
+           /*default=*/"false", 
+           "Allow backend clusters to directly allocate outputs">
   ];
 
   let dependentDialects = [
@@ -428,6 +434,13 @@ def PlanAllocTensorsPass : Pass<"plan-alloc-tensors",
     "::mlir::bufferization::BufferizationDialect",
     "::mlir::plan::PlanDialect"
   ];
+
+  let options = [
+    Option<"enableNonDPSReturns", "enable-non-dps-returns", "bool", 
+           /*default=*/"false", 
+           "Allow backend clusters to directly allocate outputs">
+  ];
+
 }
 
 //===----------------------------------------------------------------------===//
 
@@ -295,7 +295,7 @@ void InlineGroupOp::getSuccessorRegions(
 }
 
 //===----------------------------------------------------------------------===//
-// InlineClosedGroupOp
+// InlineClosedGroupOp and InlineClosedAllocGroupOp Helpers
 //===----------------------------------------------------------------------===//
 
 static LogicalResult
@@ -371,36 +371,38 @@ verifyBoundsAttr(StringRef argOrResult, unsigned idx, Type type,
   return success();
 }
 
-LogicalResult InlineClosedGroupOp::verify() {
-  SmallVector<BoundsAttr> inputAttrs =
-      llvm::to_vector(getInputAttrs().getAsRange<BoundsAttr>());
-  if (inputAttrs.size() != getInputs().size())
-    return emitOpError("expected number of inputs (")
-           << getInputs().size()
-           << " to equal the number of input_attrs BoundsAttrs ("
-           << inputAttrs.size() << ")";
-
-  for (auto [idx, type] : llvm::enumerate(TypeRange(getInputs()))) {
-    BoundsAttr boundsAttr = inputAttrs[idx];
-    if (failed(verifyBoundsAttr("input argument", idx, type, boundsAttr,
-                                [&]() { return emitOpError(); })))
+static LogicalResult verifyBoundsAttrs(Operation *op, ValueRange operands,
+                                       ArrayAttr attrsArray, StringRef attrName,
+                                       StringRef boundName) {
+  SmallVector<BoundsAttr> attrs =
+      llvm::to_vector(attrsArray.getAsRange<BoundsAttr>());
+  if (attrs.size() != operands.size())
+    return op->emitOpError("expected number of ")
+           << attrName << " (" << operands.size() << ") to equal the number of "
+           << boundName << " BoundsAttrs (" << attrs.size() << ")";
+
+  for (auto [idx, type] : llvm::enumerate(TypeRange(operands))) {
+    BoundsAttr boundsAttr = attrs[idx];
+    if (failed(verifyBoundsAttr(attrName, idx, type, boundsAttr,
+                                [&]() { return op->emitOpError(); })))
       return failure();
   }
 
-  SmallVector<BoundsAttr> resAttrs =
-      llvm::to_vector(getResAttrs().getAsRange<BoundsAttr>());
-  if (resAttrs.size() != getNumResults())
-    return emitOpError("expected number of results (")
-           << getNumResults()
-           << ") to equal the number of res_attrs BoundsAttrs ("
-           << resAttrs.size() << ")";
-
-  for (auto [idx, type] : llvm::enumerate(getResultTypes())) {
-    BoundsAttr boundsAttr = resAttrs[idx];
-    if (failed(verifyBoundsAttr("result", idx, type, boundsAttr,
-                                [&]() { return emitOpError(); })))
-      return failure();
-  }
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// InlineClosedGroupOp
+//===----------------------------------------------------------------------===//
+
+/// Verifies the bounds attributes of the op: `input_attrs` is checked against
+/// the input operands and `res_attrs` against the results, both through
+/// `verifyBoundsAttrs`. Fails as soon as either check fails.
+LogicalResult InlineClosedGroupOp::verify() {
+  if (failed(verifyBoundsAttrs(getOperation(), getInputs(), getInputAttrs(),
+                               "inputs", "input_attrs")))
+    return failure();
+
+  // The attribute is spelled `res_attrs` on the op, so the diagnostic must
+  // use that exact name rather than `result_attrs`.
+  if (failed(verifyBoundsAttrs(getOperation(), getResults(), getResAttrs(),
+                               "results", "res_attrs")))
+    return failure();
 
   return success();
 }
@@ -465,6 +467,64 @@ void InlineClosedGroupOp::build(OpBuilder &b, OperationState &state,
   state.addTypes(TypeRange(outs));
 }
 
+//===----------------------------------------------------------------------===//
+// InlineClosedAllocGroupOp
+//===----------------------------------------------------------------------===//
+
+/// Verifier for `plan.inline_closed_alloc_group`: rejects a `res_attrs`
+/// attribute if one is attached, then checks `input_attrs` against the input
+/// operands through `verifyBoundsAttrs`.
+LogicalResult InlineClosedAllocGroupOp::verify() {
+  // This op declares no `res_attrs`; an attribute of that name is an error.
+  if ((*this)->hasAttr("res_attrs"))
+    return emitOpError("must not contain 'res_attrs' attribute");
+
+  return verifyBoundsAttrs(getOperation(), getInputs(), getInputAttrs(),
+                           /*attrName=*/"inputs",
+                           /*boundName=*/"input_attrs");
+}
+
+/// Region control flow for the op: from the parent op, control enters the
+/// body region; from anywhere else, control returns to the op's results.
+void InlineClosedAllocGroupOp::getSuccessorRegions(
+    RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
+  if (!point.isParent()) {
+    // Leaving the body: branch back to the parent operation's results.
+    regions.push_back(RegionSuccessor(getResults()));
+    return;
+  }
+
+  // The predecessor is the InlineClosedAllocGroupOp itself: branch into the
+  // body, targeting all of its block arguments.
+  Region &bodyRegion = getBody();
+  regions.push_back(RegionSuccessor(&bodyRegion, bodyRegion.getArguments()));
+}
+
+/// Returns the operands forwarded on region entry. `inputs` is the op's only
+/// operand group, so this is the full operand list regardless of `point`.
+OperandRange
+InlineClosedAllocGroupOp::getEntrySuccessorOperands(RegionBranchPoint point) {
+  (void)point; // Every entry point receives the same operands.
+  return getInputs();
+}
+
+/// Suggests the name "in" for every block argument of `region` when the op
+/// is printed. Asserts that the region has one block argument per input.
+void InlineClosedAllocGroupOp::getAsmBlockArgumentNames(
+    Region &region, OpAsmSetValueNameFn setNameFn) {
+  const unsigned numArgs = region.getNumArguments();
+  assert(numArgs == getInputs().size() &&
+         "expected one block arg for each input argument");
+  for (unsigned argIdx = 0; argIdx < numArgs; ++argIdx)
+    setNameFn(region.getArgument(argIdx), "in");
+}
+
+/// Populates `state` for a `plan.inline_closed_alloc_group`.
+///
+/// Sets the result types, the input operands, and the `target` and
+/// `input_attrs` properties, then adds the body region with one entry block
+/// whose arguments mirror `inputs` (same types, same locations).
+///
+/// Note: `input_attrs` is stored as given. The verifier requires one
+/// BoundsAttr per input, so an empty array only verifies when `inputs` is
+/// empty as well.
+void InlineClosedAllocGroupOp::build(OpBuilder &b, OperationState &state,
+                                     TypeRange resultTypes, Attribute target,
+                                     ValueRange inputs,
+                                     ArrayRef<BoundsAttr> input_attrs) {
+  state.addTypes(resultTypes);
+  state.addOperands(inputs);
+
+  Properties &props = state.getOrAddProperties<Properties>();
+  props.target = target;
+  SmallVector<Attribute> boundsAsAttrs(input_attrs.begin(), input_attrs.end());
+  props.setInputAttrs(b.getArrayAttr(boundsAsAttrs));
+
+  Region *body = state.addRegion();
+
+  // Each block argument takes the location of the input value it mirrors.
+  SmallVector<Location> argLocs;
+  argLocs.reserve(inputs.size());
+  for (Value input : inputs)
+    argLocs.push_back(input.getLoc());
+
+  Block &entryBlock = body->emplaceBlock();
+  entryBlock.addArguments(TypeRange(inputs), argLocs);
+}
+
 //===----------------------------------------------------------------------===//
 // YieldOp
 //===----------------------------------------------------------------------===//
 
@@ -856,11 +856,13 @@ class AllocTensorsPass
       }
     }
 
-    // First rewrite public functions to conform to DPS style.
     IRRewriter rewriter(ctx);
-    if (failed(rewriteNotPrivateFuncsToDPS(rewriter, op))) {
-      op->emitError("Failed to convert non-private functions to DPS");
-      return signalPassFailure();
+    if (!enableNonDPSReturns) {
+      // First rewrite public functions to conform to DPS style.
+      if (failed(rewriteNotPrivateFuncsToDPS(rewriter, op))) {
+        op->emitError("Failed to convert non-private functions to DPS");
+        return signalPassFailure();
+      }
     }
 
     // Rewrite SCF for and while loop bodies for better bufferization results,
Original file line number	Diff line number	Diff line change
`@@ -856,11 +856,13 @@ class AllocTensorsPass`
`856`	`856`	`}`
`857`	`857`	`}`
`858`	`858`
`859`		`- // First rewrite public functions to conform to DPS style.`
`860`	`859`	`IRRewriter rewriter(ctx);`
`861`		`- if (failed(rewriteNotPrivateFuncsToDPS(rewriter, op))) {`
`862`		`- op->emitError("Failed to convert non-private functions to DPS");`
`863`		`- return signalPassFailure();`
	`860`	`+ if (!enableNonDPSReturns) {`
	`861`	`+ // First rewrite public functions to conform to DPS style.`
	`862`	`+ if (failed(rewriteNotPrivateFuncsToDPS(rewriter, op))) {`
	`863`	`+ op->emitError("Failed to convert non-private functions to DPS");`
	`864`	`+ return signalPassFailure();`
	`865`	`+ }`
`864`	`866`	`}`
`865`	`867`
`866`	`868`	`// Rewrite SCF for and while loop bodies for better bufferization results,`