
Commit 18cda08

Changes to get video pipeline with multiple objects working
1 parent 1176dbd commit 18cda08

10 files changed (+463, -224 lines changed)

mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/MaterializeShapeCalculations.cpp

Lines changed: 8 additions & 6 deletions
@@ -356,34 +356,37 @@ struct SimplifyExtractOfReshape : public OpRewritePattern<tensor::ExtractOp> {
 
   LogicalResult matchAndRewrite(tensor::ExtractOp op,
                                 PatternRewriter &rewriter) const override {
-    SmallVector<Value> operands;
+
     auto reshapeOp = op.getTensor().getDefiningOp<stablehlo::ReshapeOp>();
     if (!reshapeOp)
       return failure();
 
+    // Skip if either shape has dynamic dimensions
+    if (!reshapeOp.getOperand().getType().hasStaticShape())
+      return failure();
+
     std::optional<SmallVector<int64_t>> coords =
         getConstantIntValues(getAsOpFoldResult(op.getIndices()));
     if (!coords)
       return failure();
 
-    // Get lienar coords.
     SmallVector<int64_t> resultBasis =
         mlir::computeSuffixProduct(reshapeOp.getType().getShape());
     SmallVector<int64_t> operandBasis =
         mlir::computeSuffixProduct(reshapeOp.getOperand().getType().getShape());
 
-    int64_t lienarIndex = mlir::linearize(*coords, resultBasis);
+    int64_t linearIndex = mlir::linearize(*coords, resultBasis);
     SmallVector<int64_t> operandCoords =
-        mlir::delinearize(lienarIndex, operandBasis);
+        mlir::delinearize(linearIndex, operandBasis);
 
-    // Find linear offset within in the operand shape.
     rewriter.replaceOpWithNewOp<tensor::ExtractOp>(
        op, reshapeOp.getOperand(),
        llvm::map_to_vector(operandCoords, [&](int64_t c) -> Value {
          return rewriter.create<arith::ConstantIndexOp>(op->getLoc(), c);
        }));
 
     return success();
+
   }
 };
 
@@ -858,7 +861,6 @@ class MaterializeShapeCalculationsPass
     memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns_);
     stablehlo_ext::populateStableHloAbsorbTensorCastPatterns(patterns_);
     stablehlo::populateStablehloCanonicalizeDynamismPatterns(&patterns_, ctx);
-
     // clang-format off
     addCanonicalizationPatterns<
       arith::AndIOp,
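
As context for the SimplifyExtractOfReshape hunk above, here is a minimal before/after sketch of what the pattern does once both shapes are static; the value names and shapes are hypothetical, not taken from the commit:

  // Before: extract element (1, 2) of a 3x4 reshape of a 2x6 tensor.
  %r = stablehlo.reshape %arg0 : (tensor<2x6xf32>) -> tensor<3x4xf32>
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %v = tensor.extract %r[%c1, %c2] : tensor<3x4xf32>

  // After: the linear index 1*4 + 2 = 6 delinearizes to (1, 0) in the 2x6
  // operand shape, so the reshape is bypassed entirely.
  %c1_0 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %v = tensor.extract %arg0[%c1_0, %c0] : tensor<2x6xf32>

The new hasStaticShape() guard keeps the pattern from firing when the operand shape is dynamic, where the suffix-product bases used for linearize/delinearize would not be meaningful.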

mlir-tensorrt/compiler/lib/Dialect/StableHloExt/IR/StableHloReifyTypeInterfaceImpl.cpp

Lines changed: 47 additions & 0 deletions
@@ -280,6 +280,47 @@ class ConvolutionReifyRankedShapedTypeOpInterfaceImpl
   }
 };
 
+class SelectReifyRankedShapedTypeOpInterfaceImpl
+    : public ReifyRankedShapedTypeOpInterface::ExternalModel<
+          SelectReifyRankedShapedTypeOpInterfaceImpl,
+          stablehlo::SelectOp> {
+
+public:
+  LogicalResult
+  reifyResultShapes(Operation *op_, OpBuilder &builder,
+                    ReifiedRankedShapedTypeDims &reifiedReturnShapes) const {
+
+    auto op = cast<stablehlo::SelectOp>(op_);
+    Location loc = op.getLoc();
+
+    // Get result type
+    auto resultType = cast<RankedTensorType>(op.getResult().getType());
+    int64_t rank = resultType.getRank();
+
+    // Collect dimension values
+    SmallVector<OpFoldResult> dims(rank);
+    for (int64_t i = 0; i < rank; ++i) {
+      // For each dimension, if it's static in the result type, use that
+      if (!resultType.isDynamicDim(i)) {
+        dims[i] = builder.getIndexAttr(resultType.getDimSize(i));
+        continue;
+      }
+
+      // For dynamic dimensions, we need to compute the broadcasted size
+      // The operands are: pred, on_true, on_false
+      Value trueVal = builder.createOrFold<tensor::DimOp>(loc, op.getOperand(1), i);
+      Value falseVal = builder.createOrFold<tensor::DimOp>(loc, op.getOperand(2), i);
+
+      // The result dimension should be the max of the two values
+      Value maxDim = builder.create<arith::MaxSIOp>(loc, trueVal, falseVal);
+      dims[i] = maxDim;
+    }
+    reifiedReturnShapes.emplace_back(std::move(dims));
+    return success();
+
+  }
+
+};
 class ReduceWindowReifyRankedShapedTypeOpInterfaceImpl
     : public ReifyRankedShapedTypeOpInterface::ExternalModel<
           ReduceWindowReifyRankedShapedTypeOpInterfaceImpl,
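
The new external model reifies each dynamic result dimension of stablehlo.select as the max of the corresponding on_true/on_false dimensions. A minimal sketch of the IR it would build, with hypothetical values and shapes:

  // A select whose result has a dynamic leading dimension.
  %sel = stablehlo.select %pred, %a, %b : tensor<?x4xi1>, tensor<?x4xf32>

  // Reifying the result shape emits, per dynamic dimension, roughly:
  %c0 = arith.constant 0 : index
  %da = tensor.dim %a, %c0 : tensor<?x4xf32>
  %db = tensor.dim %b, %c0 : tensor<?x4xf32>
  %d0 = arith.maxsi %da, %db : index
  // reified dims for %sel: [%d0, 4]

Static dimensions are returned directly as index attributes, so no IR is created for them.
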
@@ -353,4 +394,10 @@ void stablehlo::registerTypeInferenceExternalModels(DialectRegistry &registry) {
     stablehlo::ReduceWindowOp::attachInterface<
         ReduceWindowReifyRankedShapedTypeOpInterfaceImpl>(*ctx);
   });
+  registry.addExtension(
+      +[](MLIRContext *ctx, stablehlo::StablehloDialect *dialect) {
+        stablehlo::SelectOp::attachInterface<
+            SelectReifyRankedShapedTypeOpInterfaceImpl>(*ctx);
+      });
+
 }

mlir-tensorrt/compiler/lib/Dialect/StableHloExt/Transforms/ConstantFolding.cpp

Lines changed: 154 additions & 0 deletions
@@ -29,6 +29,7 @@
 #include "mlir/Dialect/CommonFolders.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "stablehlo/dialect/StablehloOps.h"
 #include "stablehlo/dialect/TypeInference.h"
@@ -1064,6 +1065,11 @@ struct AbsorbTensorCastProducer : public RewritePattern {
 };
 } // namespace
 
+
+/// Populates patterns that are temporarily reproduced here from upstream
+/// commits we have not yet integrated.
+static void populateFutureUpstreamPatterns(RewritePatternSet &patterns);
+
 void stablehlo_ext::populateStableHloAbsorbTensorCastPatterns(
     RewritePatternSet &patterns) {
   patterns.add<AbsorbTensorCastProducer>(patterns.getContext());
@@ -1108,6 +1114,7 @@ class ConstantFoldingPass
         SqrtOpFolder
         >(ctx);
     // clang-format on
+    populateFutureUpstreamPatterns(patterns);
     populateStableHloAbsorbTensorCastPatterns(patterns);
     stablehlo::populateStablehloCanonicalizationPatterns(ctx, &patterns);
     tensor::EmptyOp::getCanonicalizationPatterns(patterns, ctx);
@@ -1124,3 +1131,150 @@ class ConstantFoldingPass
   }
 };
 } // namespace
+
+//===----------------------------------------------------------------------===//
+/// The patterns below this point are reproduced from
+/// https://github.com/openxla/stablehlo/commit/5d15ab064f165cc6773ef4ba949ac083ae8e1fea,
+/// which is in upstream, but our current pinned StableHlo commit is not there
+/// yet. The patterns can be removed in the next StableHLO upgrade.
+///
+//===----------------------------------------------------------------------===//
+
+///
+/// In cases where a concat is fed into a slice, it
+/// is possible the concat can be simplified or bypassed. This checks which
+/// inputs to the concat are used by the slice, either reducing the number of
+/// concatenated values or entirely removes the concat. Pattern:
+/// slice(concat(X,Y,Z,...),...) -> concat(slice(X),slice(Y),slice(Z))
+struct SimplifySliceOfConcat : public OpRewritePattern<SliceOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(SliceOp slice,
+                                PatternRewriter &rewriter) const override {
+    RankedTensorType resultTy = slice.getType();
+    if (!resultTy.hasStaticShape())
+      return rewriter.notifyMatchFailure(slice, "result shape not static");
+
+    auto concat = slice.getOperand().getDefiningOp<ConcatenateOp>();
+    if (!concat)
+      return rewriter.notifyMatchFailure(slice, "slice input not concat");
+
+    RankedTensorType concatType = concat.getType();
+    uint64_t dimension = concat.getDimension();
+
+    ArrayRef<int64_t> start = slice.getStartIndices();
+    ArrayRef<int64_t> limit = slice.getLimitIndices();
+
+    int64_t sliceStart = start[dimension];
+    int64_t sliceLimit = limit[dimension];
+
+    // We need to determine what inputs from the concat affect the slice, and
+    // how the bounds of the slice need to be updated for the minimally
+    // required inputs.
+    int64_t runningSize = 0;
+    int64_t frontOffset = concatType.getShape()[dimension];
+
+    auto subsetStart = concat.operand_end();
+    auto subsetEnd = concat.operand_end();
+    for (auto it = concat.operand_begin(); it < concat.operand_end(); ++it) {
+      Value input = *it;
+      auto inputTy = cast<RankedTensorType>(input.getType());
+      if (inputTy.isDynamicDim(dimension))
+        return rewriter.notifyMatchFailure(
+            slice, "concat input has dynamic dimension");
+
+      int64_t dimSize = inputTy.getShape()[dimension];
+
+      // If this position is in the slice its the start of the subset and we
+      // need to update the start and limit values.
+      if (runningSize + dimSize > sliceStart &&
+          subsetStart == concat.operand_end()) {
+        subsetStart = it;
+        frontOffset = runningSize;
+      }
+
+      // Determine the last required offset.
+      if (runningSize < sliceLimit) {
+        subsetEnd = it + 1;
+      }
+
+      runningSize += dimSize;
+    }
+
+    auto subsetSize = subsetEnd - subsetStart;
+    // We need all inputs so no optimization.
+    if (subsetSize == concat.getNumOperands())
+      return rewriter.notifyMatchFailure(slice,
+                                         "slice needs all concat inputs");
+
+    // If there's nothing to slice that means the output is an empty tensor and
+    // there is dead code. We do nothing here and rely on other passes to clean
+    // this up.
+    if (subsetSize == 0)
+      return rewriter.notifyMatchFailure(slice, "slice is empty");
+
+    if (subsetSize > 1 && !concat.getResult().hasOneUse())
+      return rewriter.notifyMatchFailure(slice,
+                                         "slice is not the only concat user");
+
+    auto concatRange = OperandRange(subsetStart, subsetEnd);
+    auto newConcat = rewriter.create<ConcatenateOp>(
+        concat.getLoc(), concatRange, concat.getDimension());
+
+    SmallVector<int64_t> newStart(start);
+    SmallVector<int64_t> newLimit(limit);
+    newStart[dimension] -= frontOffset;
+    newLimit[dimension] -= frontOffset;
+
+    rewriter.replaceOpWithNewOp<SliceOp>(
+        slice, newConcat, rewriter.getDenseI64ArrayAttr(newStart),
+        rewriter.getDenseI64ArrayAttr(newLimit), slice.getStrides());
+    return success();
+  }
+};
+
+/// Flatten sequential concatenations as long as the parent concatenation
+/// either has a single use or is <= 32 elements.
+class SimplifyConcatOfConcatPattern
+    : public OpRewritePattern<stablehlo::ConcatenateOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(ConcatenateOp op,
+                                PatternRewriter &rewriter) const override {
+    auto getFlattenedOperands = [&](const Value &val) -> ValueRange {
+      auto definingOp = dyn_cast_or_null<ConcatenateOp>(val.getDefiningOp());
+      if (!definingOp || definingOp.getDimension() != op.getDimension())
+        return val;
+      if (definingOp->hasOneUse())
+        return definingOp.getInputs();
+      if (!definingOp.getType().hasStaticShape())
+        return val;
+      if (definingOp.getType().getNumElements() <= 32)
+        return definingOp.getInputs();
+      return val;
+    };
+
+    bool needToFlatten = false;
+    int operandCount = 0;
+    for (Value val : op.getInputs()) {
+      ValueRange result = getFlattenedOperands(val);
+      if (result.size() != 1 || result[0] != val)
+        needToFlatten = true;
+      operandCount += result.size();
+    }
+    if (!needToFlatten)
+      return rewriter.notifyMatchFailure(op, "no need to flatten");
+
+    llvm::SmallVector<Value, 6> newOperands;
+    newOperands.reserve(operandCount);
+    for (Value operand : op.getInputs())
+      llvm::append_range(newOperands, getFlattenedOperands(operand));
+
+    rewriter.modifyOpInPlace(op, [&] { op->setOperands(newOperands); });
+    return success();
+  }
+};
+
+void populateFutureUpstreamPatterns(RewritePatternSet &patterns) {
+  patterns.add<SimplifySliceOfConcat, SimplifyConcatOfConcatPattern>(
+      patterns.getContext());
+}
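
A minimal before/after sketch of SimplifySliceOfConcat, with hypothetical values and shapes; here only the second concat input contributes to the slice, so the slice is re-based onto it (the single-input concat the pattern still creates folds away separately):

  // Before
  %cat = stablehlo.concatenate %a, %b, dim = 0 : (tensor<2x4xf32>, tensor<3x4xf32>) -> tensor<5x4xf32>
  %s = stablehlo.slice %cat [2:4, 0:4] : (tensor<5x4xf32>) -> tensor<2x4xf32>

  // After: start/limit shifted down by the 2 rows contributed by %a
  %s = stablehlo.slice %b [0:2, 0:4] : (tensor<3x4xf32>) -> tensor<2x4xf32>

SimplifyConcatOfConcatPattern flattens a nested concatenate on the same dimension into its parent, for example:

  // Before
  %inner = stablehlo.concatenate %a, %b, dim = 0 : (tensor<2x4xf32>, tensor<3x4xf32>) -> tensor<5x4xf32>
  %outer = stablehlo.concatenate %inner, %c, dim = 0 : (tensor<5x4xf32>, tensor<4x4xf32>) -> tensor<9x4xf32>

  // After (inner concat has one use, or is small): operands spliced in place
  %outer = stablehlo.concatenate %a, %b, %c, dim = 0 : (tensor<2x4xf32>, tensor<3x4xf32>, tensor<4x4xf32>) -> tensor<9x4xf32>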

mlir-tensorrt/tensorrt/lib/TensorRT/Transforms/BroadcastElimination.cpp

Lines changed: 64 additions & 2 deletions
@@ -120,6 +120,65 @@ struct PushDownBroadcastReduceRankOp : public OpRewritePattern<CollapseRankOp> {
 };
 } // namespace
 
+static Value expandRank(RewriterBase &rewriter, Location loc,
+                        TypedValue<RankedTensorType> input,
+                        ArrayRef<int64_t> reorderedBroadcastDims,
+                        RankedTensorType resultType) {
+  RankedTensorType inputType = input.getType();
+  // For <= 1 dynamic dims, no need to do dynamic reshape.
+  if (input.getType().getNumDynamicDims() <= 1) {
+    SmallVector<int64_t> staticShape(resultType.getRank());
+
+    unsigned inputIdx = 0;
+    for (unsigned i = 0, e = staticShape.size(); i < e; i++) {
+      if (inputIdx < reorderedBroadcastDims.size() &&
+          i == reorderedBroadcastDims[inputIdx]) {
+        staticShape[i] = inputType.getDimSize(inputIdx++);
+        continue;
+      }
+      staticShape[i] = 1;
+    }
+    return rewriter.create<ReshapeOp>(loc, resultType.clone(staticShape),
+                                      input);
+  }
+
+  // Otherwise, we need to do dynamic reshape.
+  auto shape = rewriter.create<tensorrt::ShapeOp>(loc, input);
+  SmallVector<Value> shapeComponents(resultType.getRank());
+  SmallVector<int64_t> staticShape(resultType.getRank());
+  unsigned inputIdx = 0;
+  for (unsigned i = 0, e = shapeComponents.size(); i < e; i++) {
+    if (inputIdx < reorderedBroadcastDims.size() &&
+        i == reorderedBroadcastDims[inputIdx]) {
+      if (!inputType.isDynamicDim(inputIdx)) {
+        staticShape[i] = inputType.getDimSize(inputIdx);
+        shapeComponents[i] = rewriter.create<tensorrt::ConstantOp>(
+            loc, rewriter.getI32TensorAttr(
+                     {static_cast<int32_t>(inputType.getDimSize(inputIdx++))}));
+        continue;
+      }
+      shapeComponents[i] = rewriter.create<tensorrt::SliceOp>(
+          loc, shape,
+          /*offset=*/ArrayRef<int32_t>{static_cast<int32_t>(inputIdx++)},
+          ArrayRef<int32_t>{1}, ArrayRef<int32_t>{1});
+      staticShape[i] = ShapedType::kDynamic;
+      continue;
+    }
+    staticShape[i] = 1;
+    shapeComponents[i] = rewriter.create<tensorrt::ConstantOp>(
+        loc, rewriter.getI32TensorAttr(
+                 {static_cast<int32_t>(inputType.getDimSize(1))}));
+  }
+  auto newShape = rewriter.create<tensorrt::ConcatenationOp>(
+      loc,
+      RankedTensorType::get(static_cast<int64_t>(shapeComponents.size()),
+                            rewriter.getI32Type()),
+      shapeComponents, /*axis=*/0);
+
+  return rewriter.create<ReshapeOp>(loc, resultType.clone(staticShape), input,
+                                    newShape);
+}
+
 namespace {
 /// Create transpose + expand_rank on the input of a `tensorrt.broadcast` so
 /// that the result has the same rank as the `tensorrt.broadcast` result and the
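
A worked sketch of the shape bookkeeping in the expandRank helper above, under hypothetical shapes. For an input of type tensor<?x4xf32> expanded toward a rank-4 broadcast with reorderedBroadcastDims = [0, 2], only one dimension is dynamic, so the static path applies and a plain reshape is emitted with unit dims in the non-broadcast positions:

  // staticShape is assembled as [?, 1, 4, 1]:
  // positions 0 and 2 come from the input dims, the rest become 1.
  tensor<?x4xf32>  -->  reshape  -->  tensor<?x1x4x1xf32>

With two or more dynamic input dimensions, the shape is instead built at runtime: a tensorrt.shape of the input, a one-element tensorrt.slice per dynamic dimension, i32 constants for the remaining positions, all concatenated into the shape operand of a dynamic reshape.
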
@@ -157,8 +216,9 @@ struct SimplifyBroadcast : public OpRewritePattern<BroadcastOp> {
       }
       expandedShape[i] = 1;
     }
-    Value expanded = rewriter.create<ExpandRankOp>(
-        loc, resultType.clone(expandedShape), transposeOp);
+
+    Value expanded = expandRank(rewriter, loc, transposeOp,
+                                reorderedBroadcastDims, resultType);
     rewriter.replaceOpWithNewOp<BroadcastOp>(
         op, op.getType(), expanded, op.getShape(),
         llvm::to_vector(llvm::seq<int64_t>(0, resultType.getRank())));
@@ -341,6 +401,8 @@ class BroadcastEliminationPass
     patterns.add<SimplifyBroadcast, ElementwiseAbsorbBroadcast,
                  PushDownBroadcastReduceRankOp, SelectAbsorbBroadcast,
                  MatMulAbsorbBroadcast>(&getContext());
+    tensorrt::ReshapeOp::getCanonicalizationPatterns(patterns,
+                                                     patterns.getContext());
     if (failed(applyPatternsAndFoldGreedily(getOperation(),
                                             std::move(patterns)))) {
       emitError(getOperation()->getLoc())
