@@ -175,97 +175,86 @@ def LegalizeInt8Pass : Pass<"tensorrt-legalize-int8", "func::FuncOp"> {
175175 let dependentDialects = ["::mlir::tensorrt::TensorRTDialect"];
176176}
177177
178- //===----------------------------------------------------------------------===//
179- // TransposeEliminationPass
180- //===----------------------------------------------------------------------===//
181- def TransposeEliminationPass : Pass<"tensorrt-transpose-elimination"> {
182- let summary = "try to eliminate tensorrt.transpose operations";
183-
184- let description = [{
185-
186- It is well-known that excessive number of transpose ops (either
187- "tensorrt.transpose" or "tensorrt.shuffle" operations with identity reshape)
188- can cause performance issues with TensorRT. This commonly occurs when the
189- input source being converted represents convolutions in "NHWC" format vs.
190- TensorRT's preferred "NCHW" format. In the conversion of these types of
191- convolutions, a number of transpose operations must be inserted. These
192- transpose operations can prevent fusions. For example, a transpose operation
193- between a convolution and a pointwise addition can prevent convolution-bias
194- fusion.
195-
196- This pass tries to eliminate transpose operations by applying the following
197- patterns in a greedy manner:
198-
199- 1) rotating `tensorrt.transpose` "forward" certain computational operations,
200- especially `tensorrt.element_wise` ops. This means that the transpose will
201- be applied to the result of the elementwise operation as well as the other
202- branch of the operation. To avoid an infinite ping-pong application of this
203- pattern certain heuristics are applied to determine whether or not this is
204- beneficial. For example:
205-
206- ```
207- func.func @transpose_pushdown_switch(%arg0: tensor<2x2xf32>, %arg1: tensor<1x2xf32>)
208- -> tensor<2x2xf32> {
209- %1 = tensorrt.transpose {
210- permutation = affine_map<(d0, d1)->(d1, d0)>
211- } %arg0 : tensor<2x2xf32> to tensor<2x2xf32>
212- %2 = tensorrt.element_wise <kSUM> (
213- %1, %arg1: tensor<2x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
214- return %2 : tensor<2x2xf32>
215- }
216- ```
217-
218- becomes
219-
220- ```
221- func.func @transpose_pushdown_switch(%arg0: tensor<2x2xf32>,
222- %arg1: tensor<1x2xf32>) -> tensor<2x2xf32> {
223- %0 = tensorrt.transpose {permutation = #map}
224- %arg1 : tensor<1x2xf32> to tensor<2x1xf32>
225- %1 = tensorrt.element_wise <kSUM>
226- (%arg0, %0 : tensor<2x2xf32>, tensor<2x1xf32>) -> tensor<2x2xf32>
227- %2 = tensorrt.transpose {permutation = #map} %1 : tensor<2x2xf32> to tensor<2x2xf32>
228- return %2 : tensor<2x2xf32>
229- }
230- ```
231-
232- In this case, moving the transpose to the other branch results in lower
233- memory cost on the inputs, but higher total memory cost (because a transpose
234- on the result is also added). However, we always prefer to push transpose
235- operations as far forward as possible in this transformation.
236-
237- 2) Const-folding transpose operations. Often, it is undesirable to let
238- weights be transposed at runtime. Instead, weights should be pre-transformed
239- to put them into a form that is suitable for TensorRT convolutions.
240- Therefore, we apply global transpose-const folding. This can be quite
241- expensive for large weights but is important to reduce runtime transpose
242- costs.
243-
244- }];
245- }
246-
247- //===----------------------------------------------------------------------===//
248- // ReshapeEliminationPass
249- //===----------------------------------------------------------------------===//
250- def ReshapeEliminationPass : Pass<"tensorrt-reshape-elimination"> {
251- let summary = "try to eliminate tensorrt.reshape operations";
252-
253- let description = [{
254- Reshape elimination pass captures pattern with un-necessary reshape and
255- simplifies it by eliminating reshape operations.
256- }];
257- }
258-
259178//===----------------------------------------------------------------------===//
260179// TransposeReshapeEliminationPass
261180//===----------------------------------------------------------------------===//
262181def TransposeReshapeEliminationPass : Pass<"tensorrt-transpose-reshape-elimination"> {
263182 let summary = "try to eliminate tensorrt.transpose, tensorrt.reshape, and tensorrt.shuffle operations";
264183
265184 let description = [{
266- Push tensorrt.transpose and tensorrt.reshape operations around to attempt to eleminate them
267- and merge them with other ops such as matrix multiply. The intention is to improve
268- pattern matching and fusion inside of TensorRT.
185+ It is well-known that an excessive number of transpose or reshape ops (either
186+ "tensorrt.transpose", "tensorrt.reshape" or "tensorrt.shuffle")
187+ can cause performance issues with TensorRT. For example, this commonly occurs
188+ when the input source being converted represents convolutions in "NHWC" format
189+ vs. TensorRT's preferred "NCHW" format. In the conversion of these types of
190+ convolutions, a number of transpose operations must be inserted. These
191+ transpose operations can prevent fusions. For example, a transpose operation
192+ between a convolution and a pointwise addition can prevent convolution-bias
193+ fusion. Fusions can also be blocked if a reshape is placed between a
194+ matrix multiplication and an activation.
195+
196+ The pass tries to eliminate transposes and reshapes by pushing the transposes
197+ and reshapes around to combine them into identity transposes and reshapes which
198+ can be eliminated from the program. To accomplish this, this pass uses several
199+ rewrite rules that push transposes and reshapes from the output of an Operation
200+ to the operation's input, and vice-versa. The rewrites currently included in this
201+ pass handle common cases, though they do not currently cover every possible scenario---one
202+ may wish to extend this pass in the future as needed.
203+
204+ The process is as follows:
205+ 1) Normalize transpose, reshape, shuffle and matrix multiply into a common set of ops.
206+ - shuffle(x) -> reshape(transpose(reshape(x)))
207+ - matrix_multiply(x, y) -> einsum("ij,jk->ik", x, y)
208+ - expand_rank(x) -> reshape(x)
209+ - collapse_rank(x) -> reshape(x)
210+ 2) Push down reshape and transpose, eliminating when possible. E.g. op(transpose(x)) -> transpose(op(x))
211+ - einsum(transpose(x), ...) -> einsum(x, ...)
212+ - einsum(...) -> transpose(einsum(...)) Pull transposes out of einsum (to try to match matrix multiply pattern)
213+ - einsum(reshape(x), y, ...) -> transpose(reshape(einsum(x, reshape(transpose(y)), ...)))
214+ - unary(transpose(x)) -> transpose(unary(x))
215+ - activation(transpose(x)) -> transpose(activation(x))
216+ - identity_op(transpose(x)) -> transpose(identity_op(x))
217+ - activation(reshape(x)) -> reshape(activation(x))
218+ - unary(reshape(x)) -> reshape(unary(x))
219+ - identity_op(reshape(x)) -> reshape(identity_op(x))
220+ - reshape(transpose(x)) -> transpose(reshape(x)) if possible put reshape before transpose
221+ - dequantize(quantize(transpose(x))) -> transpose(dequantize(quantize(x))) if the scale is 0-dim
222+ - dequantize(quantize(reshape(x))) -> reshape(dequantize(quantize(x))) if the scale is 0-dim
223+ - reshape(reshape(x)) -> reshape(x)
224+ - transpose(transpose(x)) -> transpose(x)
225+ - reshape(x) -> x if reshape is identity
226+ - transpose(x) -> x if transpose is identity
227+ - elementwise(reshape(a), b) -> reshape(elementwise(a, reshape(b))) conditioned on heuristic
228+ - elementwise(transpose(a), b) -> transpose(elementwise(a, transpose(b)))
229+ 3) Push up reshape and transpose, eliminating when possible. E.g. transpose(op(x)) -> op(transpose(x))
230+ - transpose(einsum(...)) -> einsum(...)
231+ - einsum(...) -> einsum(transpose(x), ...) Pull transposes out of einsum (to try to match matrix multiply pattern)
232+ - reshape(einsum(...)) -> einsum(reshape(transpose(x)), ...)
233+ - transpose(activation(x)) -> activation(transpose(x))
234+ - transpose(unary(x)) -> unary(transpose(x))
235+ - transpose(identity_op(x)) -> identity_op(transpose(x))
236+ - reshape(activation(x)) -> activation(reshape(x))
237+ - reshape(unary(x)) -> unary(reshape(x))
238+ - reshape(identity_op(x)) -> identity_op(reshape(x))
239+ - reshape(reshape(x)) -> reshape(x)
240+ - transpose(transpose(x)) -> transpose(x)
241+ - reshape(x) -> x if reshape is identity
242+ - transpose(x) -> x if transpose is identity
243+ - transpose(reshape(x)) -> reshape(transpose(x)) if possible put transpose before reshape
244+ - transpose(dequantize(quantize(x))) -> dequantize(quantize(transpose(x))) if the scale is 0-dim
245+ - reshape(dequantize(quantize(x))) -> dequantize(quantize(reshape(x))) if the scale is 0-dim
246+ - reshape(elementwise(a, b)) -> elementwise(reshape(a), reshape(b))
247+ - transpose(elementwise(a, b)) -> elementwise(transpose(a), transpose(b))
248+ 4) Final clean up. Fuse leftover transposes and reshapes with other ops.
249+ - einsum("ij,jk->ik", x, y) -> matrix_multiply(x, y) if einsum matches a matrix multiply pattern
250+ - matrix_multiply(transpose(x), y) -> matrix_multiply(x, y) merge transpose if possible
251+ - transpose(einsum(...)) -> einsum(...)
252+ - einsum(transpose(x), ...) -> einsum(...)
253+ - einsum(collapse_rank(x), ...) -> einsum(...)
254+ - expand_rank(einsum(...)) -> einsum(...)
255+
256+ To avoid an infinite ping-pong application of these patterns, heuristics are
257+ applied to determine when a pattern is beneficial.
269258 }];
270259}
271260
0 commit comments