diff --git a/mlir-tensorrt/.gitignore b/mlir-tensorrt/.gitignore index e410819ad..8f46753fb 100644 --- a/mlir-tensorrt/.gitignore +++ b/mlir-tensorrt/.gitignore @@ -3,6 +3,7 @@ *.log **/llvm-project/** **/llvm-project/ +CMakeUserPresets.json # Docs build artifacts /public/ diff --git a/mlir-tensorrt/CMakeLists.txt b/mlir-tensorrt/CMakeLists.txt index 0a396a862..b00019933 100644 --- a/mlir-tensorrt/CMakeLists.txt +++ b/mlir-tensorrt/CMakeLists.txt @@ -90,6 +90,35 @@ if(PROJECT_IS_TOP_LEVEL) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) endif() +# ------------------------------------------------- +# Option validation +# ------------------------------------------------- +# Write out a header file containing convenience macros for each flag. +function(mtrt_write_feature_flags_header) + set(feature_flags_header + "${CMAKE_CURRENT_BINARY_DIR}/include/mlir-tensorrt/Features.h") + + # Generate the header at configure time + file(WRITE "${feature_flags_header}" [[ + // Auto-generated feature macros, do not edit. 
+ #ifndef MLIR_TENSORRT_FEATURES_H + #define MLIR_TENSORRT_FEATURES_H + + ]]) + + foreach(FEATURE IN LISTS MLIR_TRT_FEATURE_FLAGS) + file(APPEND "${feature_flags_header}" "#ifdef ${FEATURE}\n") + file(APPEND "${feature_flags_header}" "#define IF_${FEATURE}(code) do { code } while (0)\n") + file(APPEND "${feature_flags_header}" "#else\n") + file(APPEND "${feature_flags_header}" "#define IF_${FEATURE}(code) do {} while (0)\n") + file(APPEND "${feature_flags_header}" "#endif // ${FEATURE}\n\n") + endforeach() + file(APPEND "${feature_flags_header}" "#endif // MLIR_TENSORRT_FEATURES_H\n") +endfunction() + +mtrt_write_feature_flags_header() +include_directories("${CMAKE_CURRENT_BINARY_DIR}/include") + # ------------------------------------------------- # Setup LLVM/MLIR # ------------------------------------------------- diff --git a/mlir-tensorrt/build_tools/cmake/Targets.cmake b/mlir-tensorrt/build_tools/cmake/Targets.cmake index 9e361c783..4a16e0e91 100644 --- a/mlir-tensorrt/build_tools/cmake/Targets.cmake +++ b/mlir-tensorrt/build_tools/cmake/Targets.cmake @@ -44,6 +44,16 @@ function(add_mlir_tensorrt_public_c_api_library target) endif() endfunction() +# -------------------------------------------------------------- +# Adds an upstream MLIR library target to the +# MLIR_TENSORRT_LIBS global property list to capture it as an +# implicit dependency for all final tools and compiler +# end-user products. +# -------------------------------------------------------------- +function(add_mlir_tensorrt_compiler_dependency target) + set_property(GLOBAL APPEND PROPERTY MLIR_TENSORRT_LIBS ${target}) +endfunction() + # ------------------------------------------------------------------------------ # A wrapper around `add_mlir_dialect_library` that also appends the dialect # library to the global `MLIR_TENSORRT_DIALECT_LIBS` list property. 
@@ -99,11 +109,11 @@ function(add_mlir_tensorrt_backend_library target) BASE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) set(LLVM_TARGET_DEFINITIONS "${SRC_TD}") - + string(REPLACE ".td" "Attrs.h.inc" h_inc_file ${BIN_TD}) string(REPLACE ".td" "Attrs.cpp.inc" cpp_inc_file ${BIN_TD}) mlir_tablegen("${h_inc_file}" -gen-attrdef-decls) - mlir_tablegen("${cpp_inc_file}" -gen-attrdef-defs) + mlir_tablegen("${cpp_inc_file}" -gen-attrdef-defs) add_public_tablegen_target(${target}IncGen) diff --git a/mlir-tensorrt/compiler/CMakeLists.txt b/mlir-tensorrt/compiler/CMakeLists.txt index d4ede3388..e039c9b8a 100644 --- a/mlir-tensorrt/compiler/CMakeLists.txt +++ b/mlir-tensorrt/compiler/CMakeLists.txt @@ -9,6 +9,29 @@ set(MLIR_TENSORRT_COMPILER_INCLUDE_DIRS include_directories("${CMAKE_CURRENT_BINARY_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/include") +# We use the MLIR_TENSORRT_LIBS global property to aggregate the list of all +# declared compiler libraries. This is helpful for simplifying the link +# dependency declarations for tools that must link "the world" like +# `mlir-tensorrt-opt`. + +# Because MLIR has a level of indirection that lets implementation for +# interfaces be provided by separate implementation code +# ("PromisedInterfaces/ExternalModels") which is registered at runtime, it is +# difficult to capture all dependencies for dialects we require in the compiler +# purely through target dependency properties. To see what we require from +# upstream, look at the file `mlir-tensorrt/InitAllDialects.h`. Therefore, we +# manually enumerate some dependencies here, mainly for providing the functions +# registering interface external models. 
+add_mlir_tensorrt_compiler_dependency(MLIRArithTransforms) +add_mlir_tensorrt_compiler_dependency(MLIRArithValueBoundsOpInterfaceImpl) +add_mlir_tensorrt_compiler_dependency(MLIRAsyncDialect) +add_mlir_tensorrt_compiler_dependency(MLIRBufferizationTransforms) +add_mlir_tensorrt_compiler_dependency(MLIRControlFlowTransforms) +add_mlir_tensorrt_compiler_dependency(MLIRNVVMTarget) +add_mlir_tensorrt_compiler_dependency(MLIRPtrDialect) +add_mlir_tensorrt_compiler_dependency(MLIRTargetLLVM) +add_mlir_tensorrt_compiler_dependency(MLIRTensorTransformOps) + add_subdirectory(include) add_subdirectory(lib) add_subdirectory(test) diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/OptionsProviders.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/OptionsProviders.h index 264963dcf..72e828b90 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/OptionsProviders.h +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/OptionsProviders.h @@ -113,6 +113,20 @@ struct DebugOptions : public OptionsProvider { "tree rooted at this directory. 
Use in conjunction with " "mlir-print-ir-* flags")}; + //===----------------------------------------------------------------------===// + // Printing Flags + //===----------------------------------------------------------------------===// + + Option elideElementsAttrIfLarger{ + this->ctx, "mlir-elide-elementsattrs-if-larger", + llvm::cl::desc("Elide ElementsAttrs with \"...\" that have " + "more elements than the given upper limit")}; + + Option elideResourceStringsIfLarger{ + this->ctx, "mlir-elide-resource-strings-if-larger", + llvm::cl::desc( + "Elide printing value of resources if string is too long in chars.")}; + //===--------------------------------------------------------------------===// // Pass Statistics //===--------------------------------------------------------------------===// diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/CMakeLists.txt b/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/CMakeLists.txt index 816db9846..3c5812249 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/CMakeLists.txt +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/CMakeLists.txt @@ -6,9 +6,6 @@ set(_TABLEGEN_ARGS ) if(MLIR_TRT_ENABLE_HLO) list(APPEND _TABLEGEN_ARGS -DMLIR_TENSORRT_ENABLE_HLO) endif() -if(MLIR_TRT_ENABLE_EXECUTOR) - list(APPEND _TABLEGEN_ARGS -DMLIR_TENSORRT_ENABLE_EXECUTOR) -endif() mlir_tablegen(Passes.h.inc -gen-pass-decls -name MLIRTensorRTConversion ${_TABLEGEN_ARGS}) add_public_tablegen_target(MLIRTensorRTConversionPassIncGen) diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/Passes.td b/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/Passes.td index 19905e537..b7d3d014c 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/Passes.td +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/Passes.td @@ -125,8 +125,6 @@ def ConvertTensorRTToEmitCPass : Pass<"convert-tensorrt-to-emitc", let dependentDialects = ["::mlir::emitc::EmitCDialect"]; } 
-#ifdef MLIR_TENSORRT_ENABLE_EXECUTOR - //===----------------------------------------------------------------------===// // ConvertMemRefToCUDAPass //===----------------------------------------------------------------------===// @@ -312,9 +310,6 @@ def ConvertTensorRTRuntimeToExecutorPass : Pass<"convert-tensorrt-runtime-to-exe let options = ConvertToExecutorOptions; } - -#endif // MLIR_TENSORRT_ENABLE_EXECUTOR - #ifdef MLIR_TENSORRT_ENABLE_HLO //===----------------------------------------------------------------------===// // ConvertStablehloToScfPass diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/CMakeLists.txt b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/CMakeLists.txt index 494c94a4a..ab5c79a76 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/CMakeLists.txt +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/CMakeLists.txt @@ -1,10 +1,7 @@ if(MLIR_TRT_ENABLE_HLO) add_subdirectory(StablehloExt) - add_subdirectory(Plan) endif() +add_subdirectory(CUDA) +add_subdirectory(Plan) add_subdirectory(TensorRTRuntime) - -if(MLIR_TRT_ENABLE_EXECUTOR) - add_subdirectory(CUDA) -endif() diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td index a1334f0fd..cf22de6c3 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td @@ -428,6 +428,27 @@ def PostClusteringValidationPass : Pass<"post-clustering-validation", "func::Fun }]; } +//===----------------------------------------------------------------------===// +// PlanAssignMemorySpacesPass +//===----------------------------------------------------------------------===// + +def PlanAssignMemorySpacesPass : Pass<"plan-assign-memory-spaces", + "::mlir::ModuleOp"> { + let summary = "assigns memory spaces encodings to tensor types"; + + let 
description = [{ + This pass applies a type conversion that adds a '#plan.memory_space' + attribute to all tensor types in the top-level module that do not already + have an encoding. + }]; + + let dependentDialects = [ + "::mlir::plan::PlanDialect", + "::mlir::bufferization::BufferizationDialect", + "::mlir::tensor::TensorDialect" + ]; +} + //===----------------------------------------------------------------------===// // PlanAllocTensorsPass //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllDialects.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllDialects.h new file mode 100644 index 000000000..51850b4ba --- /dev/null +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllDialects.h @@ -0,0 +1,199 @@ +//===- InitAllDialects.h ----------------------------------------*- C++ -*-===// +// +// SPDX-FileCopyrightText: Copyright 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// Registration methods for MLIR dialects. 
+/// +//===----------------------------------------------------------------------===// +#ifndef MLIR_TENSORRT_INIT_ALL_DIALECTS +#define MLIR_TENSORRT_INIT_ALL_DIALECTS + +#include "mlir-executor/Executor/IR/Executor.h" +#include "mlir-tensorrt-dialect/TensorRT/IR/TensorRTDialect.h" +#include "mlir-tensorrt-dialect/TensorRT/Target/TensorRTEncodingImpl.h" +#include "mlir-tensorrt/Backends/Host/HostBackend.h" +#include "mlir-tensorrt/Backends/TensorRT/TensorRTBackend.h" +#include "mlir-tensorrt/Dialect/CUDA/IR/CUDADialect.h" +#include "mlir-tensorrt/Dialect/CUDA/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir-tensorrt/Dialect/Plan/IR/Plan.h" +#include "mlir-tensorrt/Dialect/StablehloExt/IR/StableHloExt.h" +#include "mlir-tensorrt/Dialect/TensorRTRuntime/IR/TensorRTRuntime.h" +#include "mlir-tensorrt/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir-tensorrt/Features.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/Arith/Transforms/BufferDeallocationOpInterfaceImpl.h" +#include "mlir/Dialect/Arith/Transforms/BufferViewFlowOpInterfaceImpl.h" +#include "mlir/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Async/IR/Async.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/Dialect/ControlFlow/Transforms/BufferDeallocationOpInterfaceImpl.h" +#include "mlir/Dialect/ControlFlow/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/DLTI/DLTI.h" +#include "mlir/Dialect/EmitC/IR/EmitC.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include 
"mlir/Dialect/LLVMIR/NVVMDialect.h" +#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/Linalg/Transforms/AllInterfaces.h" +#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Linalg/Transforms/RuntimeOpVerification.h" +#include "mlir/Dialect/Linalg/Transforms/SubsetInsertionOpInterfaceImpl.h" +#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/MemRef/IR/MemRefMemorySlot.h" +#include "mlir/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/MemRef/Transforms/AllocationOpInterfaceImpl.h" +#include "mlir/Dialect/MemRef/Transforms/BufferViewFlowOpInterfaceImpl.h" +#include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h" +#include "mlir/Dialect/PDL/IR/PDL.h" +#include "mlir/Dialect/PDLInterp/IR/PDLInterp.h" +#include "mlir/Dialect/Ptr/IR/PtrDialect.h" +#include "mlir/Dialect/Quant/IR/Quant.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.h" +#include "mlir/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Shape/IR/Shape.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.h" +#include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h" +#include "mlir/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h" +#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/UB/IR/UBOps.h" +#include 
"mlir/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Vector/Transforms/SubsetOpInterfaceImpl.h" +#include "mlir/Interfaces/CastInterfaces.h" +#include "mlir/Target/LLVM/NVVM/Target.h" + +#ifdef MLIR_TRT_ENABLE_HLO +#include "stablehlo/dialect/ChloOps.h" +#include "stablehlo/dialect/StablehloOps.h" +#include "stablehlo/dialect/VhloOps.h" +#endif + +namespace mlirtrt::compiler { + +inline void registerAllDialects(mlir::DialectRegistry ®istry) { + // clang-format off + registry.insert< + mlir::affine::AffineDialect, + mlir::arith::ArithDialect, + mlir::async::AsyncDialect, + mlir::bufferization::BufferizationDialect, + mlir::cf::ControlFlowDialect, + mlir::complex::ComplexDialect, + mlir::cuda::CUDADialect, + mlir::DLTIDialect, + mlir::emitc::EmitCDialect, + mlir::executor::ExecutorDialect, + mlir::func::FuncDialect, + mlir::gpu::GPUDialect, + mlir::index::IndexDialect, + mlir::linalg::LinalgDialect, + mlir::LLVM::LLVMDialect, + mlir::math::MathDialect, + mlir::memref::MemRefDialect, + mlir::NVVM::NVVMDialect, + mlir::pdl_interp::PDLInterpDialect, + mlir::pdl::PDLDialect, + mlir::plan::PlanDialect, + mlir::ptr::PtrDialect, + mlir::quant::QuantDialect, + mlir::scf::SCFDialect, + mlir::shape::ShapeDialect, + mlir::tensor::TensorDialect, + mlir::tensorrt::TensorRTDialect, + mlir::transform::TransformDialect, + mlir::trtrt::TensorRTRuntimeDialect, + mlir::ub::UBDialect, + mlir::vector::VectorDialect + >(); + // clang-format on + + IF_MLIR_TRT_ENABLE_HLO({ + registry.insert(); + registry.insert(); + registry.insert(); + }); + + // Register all external models. 
+ mlir::affine::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::arith::registerBufferDeallocationOpInterfaceExternalModels(registry); + mlir::arith::registerBufferizableOpInterfaceExternalModels(registry); + mlir::arith::registerBufferViewFlowOpInterfaceExternalModels(registry); + mlir::arith::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::bufferization::func_ext::registerBufferizableOpInterfaceExternalModels( + registry); + mlir::builtin::registerCastOpInterfaceExternalModels(registry); + mlir::cf::registerBufferDeallocationOpInterfaceExternalModels(registry); + mlir::cf::registerBufferizableOpInterfaceExternalModels(registry); + mlir::cuda::registerBufferizableOpInterfaceExternalModels(registry); + mlir::linalg::registerBufferizableOpInterfaceExternalModels(registry); + mlir::linalg::registerRuntimeVerifiableOpInterfaceExternalModels(registry); + mlir::linalg::registerSubsetOpInterfaceExternalModels(registry); + mlir::linalg::registerTilingInterfaceExternalModels(registry); + mlir::linalg::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::LLVM::registerInlinerInterface(registry); + mlir::memref::registerAllocationOpInterfaceExternalModels(registry); + mlir::memref::registerBufferViewFlowOpInterfaceExternalModels(registry); + mlir::memref::registerMemorySlotExternalModels(registry); + mlir::memref::registerRuntimeVerifiableOpInterfaceExternalModels(registry); + mlir::memref::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::NVVM::registerInlinerInterface(registry); + mlir::NVVM::registerNVVMTargetInterfaceExternalModels(registry); + mlir::scf::registerBufferDeallocationOpInterfaceExternalModels(registry); + mlir::scf::registerBufferizableOpInterfaceExternalModels(registry); + mlir::scf::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::tensor::registerBufferizableOpInterfaceExternalModels(registry); + mlir::tensor::registerFindPayloadReplacementOpInterfaceExternalModels( + registry); + 
mlir::tensor::registerInferTypeOpInterfaceExternalModels(registry); + mlir::tensor::registerSubsetOpInterfaceExternalModels(registry); + mlir::tensor::registerTilingInterfaceExternalModels(registry); + mlir::tensor::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::tensorrt::registerTensorKindOpInterfaceExternalModels(registry); + mlir::tensorrt::registerTensorRTEncodingOpInterfaceExternalModels(registry); + mlir::trtrt::registerBufferizableOpInterfaceExternalModels(registry); + mlir::vector::registerBufferizableOpInterfaceExternalModels(registry); + mlir::vector::registerSubsetOpInterfaceExternalModels(registry); + mlir::vector::registerValueBoundsOpInterfaceExternalModels(registry); + + IF_MLIR_TRT_ENABLE_HLO({ + mlir::stablehlo::registerTensorKindOpInterfaceExternalModels(registry); + mlir::stablehlo::registerTypeInferenceExternalModels(registry); + }); +} + +} // namespace mlirtrt::compiler + +#endif // MLIR_TENSORRT_INIT_ALL_DIALECTS diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/InitLLVMExtensions.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllExtensions.h similarity index 81% rename from mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/InitLLVMExtensions.h rename to mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllExtensions.h index e1fec31a0..8668ca659 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/InitLLVMExtensions.h +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllExtensions.h @@ -18,12 +18,14 @@ // //===----------------------------------------------------------------------===// /// -/// Registration methods for ConvertToLLVMPatternInterface dialect extensions. +/// Registration methods for dialect extensions. 
/// //===----------------------------------------------------------------------===// #ifndef MLIR_TENSORRT_REGISTRATION_INITLLVMEXTENSIONS #define MLIR_TENSORRT_REGISTRATION_INITLLVMEXTENSIONS +#include "mlir-tensorrt/Backends/Host/HostBackend.h" +#include "mlir-tensorrt/Backends/TensorRT/TensorRTBackend.h" #include "mlir-tensorrt/Conversion/CUDAToLLVM/CUDAToLLVM.h" #include "mlir-tensorrt/Conversion/PlanToLLVM/PlanToLLVM.h" #include "mlir-tensorrt/Conversion/TensorRTRuntimeToLLVM/TensorRTRuntimeToLLVM.h" @@ -37,28 +39,33 @@ #include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h" #include "mlir/Conversion/UBToLLVM/UBToLLVM.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" +#include "mlir/Dialect/Func/Extensions/InlinerExtension.h" -namespace mlirtrt { +namespace mlirtrt::compiler { -/// Register all ConvertToLLVMPatternInterface dialect extensions. -inline void registerConvertToLLVMExtensions(mlir::DialectRegistry ®istry) { - // Upstream interfaces. +inline void registerAllExtensions(mlir::DialectRegistry ®istry) { + // Register all conversion to LLVM interfaces. mlir::arith::registerConvertArithToLLVMInterface(registry); - mlir::registerConvertComplexToLLVMInterface(registry); mlir::cf::registerConvertControlFlowToLLVMInterface(registry); + mlir::index::registerConvertIndexToLLVMInterface(registry); + mlir::registerConvertComplexToLLVMInterface(registry); + mlir::registerConvertCUDAToLLVMPatternInterface(registry); mlir::registerConvertFuncToLLVMInterface(registry); mlir::registerConvertMathToLLVMInterface(registry); mlir::registerConvertMemRefToLLVMInterface(registry); mlir::registerConvertNVVMToLLVMInterface(registry); + mlir::registerConvertPlanToLLVMPatternInterface(registry); + mlir::registerConvertTensorRTRuntimeToLLVMPatternInterface(registry); mlir::ub::registerConvertUBToLLVMInterface(registry); - mlir::index::registerConvertIndexToLLVMInterface(registry); mlir::vector::registerConvertVectorToLLVMInterface(registry); - // MLIR-TRT interfaces. 
- mlir::registerConvertPlanToLLVMPatternInterface(registry); - mlir::registerConvertTensorRTRuntimeToLLVMPatternInterface(registry); - mlir::registerConvertCUDAToLLVMPatternInterface(registry); + // Inliner extensions. + mlir::func::registerInlinerExtension(registry); + + // Plan Extensions. + mlir::plan::registerHostBackend(registry); + mlir::plan::registerTensorRTBackend(registry); } -} // namespace mlirtrt +} // namespace mlirtrt::compiler #endif // MLIR_TENSORRT_REGISTRATION_INITLLVMEXTENSIONS diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtPasses.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllPasses.h similarity index 68% rename from mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtPasses.h rename to mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllPasses.h index a984073b1..8f2d088d4 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtPasses.h +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllPasses.h @@ -22,17 +22,20 @@ #ifndef REGISTRATION_REGISTERMLIRTENSORRTPASSES_H #define REGISTRATION_REGISTERMLIRTENSORRTPASSES_H +#include "mlir-executor/InitAllPasses.h" #include "mlir-tensorrt-dialect/TensorRT/Transforms/Passes.h" #include "mlir-tensorrt/Conversion/Passes.h" +#include "mlir-tensorrt/Dialect/Plan/Transforms/Passes.h" +#include "mlir-tensorrt/Features.h" #include "mlir-tensorrt/Transforms/Passes.h" #include "mlir/Conversion/Passes.h" +#include "mlir/Dialect/Bufferization/Transforms/Passes.h" #include "mlir/Dialect/EmitC/Transforms/Passes.h" #include "mlir/Transforms/Passes.h" #ifdef MLIR_TRT_ENABLE_HLO #include "mlir-tensorrt/Compiler/StablehloToExecutable/Passes.h" #include "mlir-tensorrt/Compiler/StablehloToExecutable/StablehloToExecutable.h" -#include "mlir-tensorrt/Dialect/Plan/Transforms/Passes.h" #include "mlir-tensorrt/Dialect/StablehloExt/Transforms/Passes.h" #include "stablehlo/transforms/Passes.h" #include 
"stablehlo/transforms/optimization/Passes.h" @@ -42,47 +45,36 @@ #include "mlir-tensorrt/Compiler/TensorRTToExecutable/Passes.h" #endif // MLIR_TRT_TARGET_TENSORRT -#ifdef MLIR_TRT_ENABLE_EXECUTOR -#include "mlir-executor/InitAllPasses.h" -#include "mlir/Dialect/Bufferization/Transforms/Passes.h" -#endif // MLIR_TRT_ENABLE_EXECUTOR - -namespace mlir { -namespace tensorrt { +namespace mlirtrt::compiler { /// Register passes declared within this repo. -inline void registerAllMlirTensorRtPasses() { - registerMLIRTensorRTConversionPasses(); - registerTensorRTPasses(); - registerMLIRTensorRTGenericTransformsPasses(); - mlir::registerTransformsPasses(); - mlir::registerConvertPDLToPDLInterp(); +inline void registerAllPasses() { mlir::emitc::registerEmitCPasses(); + mlir::plan::registerPlanDialectPipelines(); + mlir::plan::registerPlanPasses(); mlir::registerConvertAffineToStandard(); + mlir::registerConvertPDLToPDLInterp(); + mlir::registerMLIRTensorRTConversionPasses(); + mlir::registerMLIRTensorRTGenericTransformsPasses(); + mlir::registerTransformsPasses(); + mlir::tensorrt::registerTensorRTPasses(); + mlir::registerConvertCUDAToExecutorPass(); + mlir::bufferization::registerBufferizationPasses(); + mlir::executor::registerAllPasses(); -#ifdef MLIR_TRT_ENABLE_HLO - mlirtrt::compiler::registerStablehloToExecutablePasses(); - mlirtrt::compiler::registerStablehloToExecutablePipelines(); - mlirtrt::compiler::registerStableHloInputPipelines(); - stablehlo_ext::registerStableHloExtPasses(); - stablehlo::registerPasses(); - stablehlo::registerOptimizationPasses(); - plan::registerPlanPasses(); - plan::registerPlanDialectPipelines(); -#endif // MLIR_TRT_ENABLE_HLO - -#ifdef MLIR_TRT_TARGET_TENSORRT - mlirtrt::compiler::registerTensorRTToExecutablePipelines(); -#endif // MLIR_TRT_TARGET_TENSORRT + IF_MLIR_TRT_ENABLE_HLO({ + mlirtrt::compiler::registerStablehloToExecutablePasses(); + mlirtrt::compiler::registerStablehloToExecutablePipelines(); + 
mlirtrt::compiler::registerStableHloInputPipelines(); + mlir::stablehlo_ext::registerStableHloExtPasses(); + mlir::stablehlo::registerPasses(); + mlir::stablehlo::registerOptimizationPasses(); + }); -#ifdef MLIR_TRT_ENABLE_EXECUTOR - registerConvertCUDAToExecutorPass(); - bufferization::registerBufferizationPasses(); - executor::registerAllPasses(); -#endif // MLIR_TRT_ENABLE_EXECUTOR + IF_MLIR_TRT_TARGET_TENSORRT( + { mlirtrt::compiler::registerTensorRTToExecutablePipelines(); }); } -} // namespace tensorrt -} // namespace mlir +} // namespace mlirtrt::compiler #endif // REGISTRATION_REGISTERMLIRTENSORRTPASSES_H diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtCoreDialects.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtCoreDialects.h deleted file mode 100644 index 6cbaa10ea..000000000 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtCoreDialects.h +++ /dev/null @@ -1,77 +0,0 @@ -//===- RegisterMlirTensorRtCoreDialects.h -----------------------*- C++ -*-===// -// -// SPDX-FileCopyrightText: Copyright 2024 NVIDIA CORPORATION & AFFILIATES. -// All rights reserved. -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -/// -/// Registration methods for the core dialects defined by this project. 
-/// -//===----------------------------------------------------------------------===// -#include "mlir-tensorrt-dialect/Interface/TensorKindOpInterface.h" -#include "mlir-tensorrt-dialect/TensorRT/IR/TensorRTDialect.h" -#include "mlir-tensorrt-dialect/TensorRT/Target/TensorRTEncodingImpl.h" -#include "mlir-tensorrt/Backends/Host/HostBackend.h" -#include "mlir-tensorrt/Backends/TensorRT/TensorRTBackend.h" -#include "mlir/Dialect/Func/Extensions/InlinerExtension.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#ifdef MLIR_TRT_ENABLE_HLO -#include "mlir-tensorrt/Dialect/Plan/IR/Plan.h" -#endif // MLIR_TRT_ENABLE_HLO -#ifdef MLIR_TRT_ENABLE_EXECUTOR -#include "mlir-executor/Executor/IR/Executor.h" -#include "mlir-tensorrt/Dialect/CUDA/IR/CUDADialect.h" -#include "mlir-tensorrt/Dialect/CUDA/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir-tensorrt/Dialect/TensorRTRuntime/IR/TensorRTRuntime.h" -#include "mlir-tensorrt/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/DLTI/DLTI.h" -#endif // MLIR_TRT_ENABLE_EXECUTOR - -namespace mlir { -class DialectRegistry; - -/// Register core MLIR-TensorRT project dialects (dialects defined by this -/// project and any of their immediate dependencies. 
-inline void registerCoreMlirTensorRtDialects(DialectRegistry ®istry) { - registry.insert(); - tensorrt::registerTensorRTEncodingOpInterfaceExternalModels(registry); - tensorrt::registerTensorKindOpInterfaceExternalModels(registry); - func::registerInlinerExtension(registry); - -#ifdef MLIR_TRT_ENABLE_EXECUTOR - registry.insert(); -#endif // MLIR_TRT_ENABLE_EXECUTOR - -#ifdef MLIR_TRT_ENABLE_HLO - registry.insert(); - mlir::plan::registerHostBackend(registry); - mlir::plan::registerTensorRTBackend(registry); -#endif // MLIR_TRT_ENABLE_HLO -} - -inline void -registerMlirTensorRtBufferizationInterfaces(DialectRegistry ®istry) { -#ifdef MLIR_TRT_ENABLE_EXECUTOR - trtrt::registerBufferizableOpInterfaceExternalModels(registry); - cuda::registerBufferizableOpInterfaceExternalModels(registry); -#endif // MLIR_TRT_ENABLE_EXECUTOR -} - -inline void registerMlirTensorRtTransformExtensions(DialectRegistry ®istry) { -} - -} // namespace mlir diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtDialects.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtDialects.h deleted file mode 100644 index 1647a9f5d..000000000 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtDialects.h +++ /dev/null @@ -1,119 +0,0 @@ -//===- RegisterMlirTensorRtDialects.h ---------------------------*- C++ -*-===// -// -// SPDX-FileCopyrightText: Copyright 2024-2025 NVIDIA CORPORATION & AFFILIATES. -// All rights reserved. -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -// Register all dialects required by parts of this project, including dialects -// required by transformations or that are accepted by inputs. -//===----------------------------------------------------------------------===// -#ifndef MLIR_TENSORRT_REGISTRATION_REGISTERMLIRTENSORRTDIALECTS_H -#define MLIR_TENSORRT_REGISTRATION_REGISTERMLIRTENSORRTDIALECTS_H - -#include "mlir-tensorrt/Registration/RegisterMlirTensorRtCoreDialects.h" -#include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/EmitC/IR/EmitC.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" -#include "mlir/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.h" -#include "mlir/Dialect/PDL/IR/PDL.h" -#include "mlir/Dialect/Quant/IR/Quant.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Shape/IR/Shape.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" - -#ifdef MLIR_TRT_ENABLE_HLO -#include "mlir-tensorrt/Dialect/StablehloExt/IR/StableHloExt.h" -#include "stablehlo/dialect/ChloOps.h" -#include "stablehlo/dialect/StablehloOps.h" -#include "stablehlo/dialect/VhloOps.h" -#endif - -#ifdef MLIR_TRT_ENABLE_EXECUTOR -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/Transforms/BufferDeallocationOpInterfaceImpl.h" -#include 
"mlir/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Linalg/Transforms/SubsetInsertionOpInterfaceImpl.h" -#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h" -#include "mlir/Dialect/Math/IR/Math.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/Transforms/AllocationOpInterfaceImpl.h" -#include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h" -#include "mlir/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.h" -#include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h" -#include "mlir/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.h" -#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.h" -#endif - -namespace mlir { - -inline void registerAllMlirTensorRtExecutorDialects(DialectRegistry ®istry) { - // Registration for executor dialect and all upstream dialects that can appear - // in the host IR. 
- registry.insert(); - affine::registerValueBoundsOpInterfaceExternalModels(registry); - arith::registerBufferDeallocationOpInterfaceExternalModels(registry); - arith::registerBufferizableOpInterfaceExternalModels(registry); - bufferization::func_ext::registerBufferizableOpInterfaceExternalModels( - registry); - linalg::registerBufferizableOpInterfaceExternalModels(registry); - memref::registerAllocationOpInterfaceExternalModels(registry); - scf::registerBufferizableOpInterfaceExternalModels(registry); - tensor::registerBufferizableOpInterfaceExternalModels(registry); - tensor::registerInferTypeOpInterfaceExternalModels(registry); - tensor::registerSubsetOpInterfaceExternalModels(registry); - tensor::registerTilingInterfaceExternalModels(registry); - LLVM::registerInlinerInterface(registry); -} - -inline void registerAllMlirTensorRtDialects(DialectRegistry ®istry) { - registerCoreMlirTensorRtDialects(registry); - registerMlirTensorRtBufferizationInterfaces(registry); - registerMlirTensorRtTransformExtensions(registry); - - // Register other dialects declared in upstream or in dependencies. Only - // register dialects if absolutely necessary (i.e. they appear in the input - // IR). 
- registry.insert(); - -#ifdef MLIR_TRT_ENABLE_HLO - registry.insert(); - stablehlo::registerTensorKindOpInterfaceExternalModels(registry); - stablehlo::registerTypeInferenceExternalModels(registry); -#endif // MLIR_TRT_ENABLE_HLO - -#ifdef MLIR_TRT_ENABLE_EXECUTOR - registerAllMlirTensorRtExecutorDialects(registry); - tensor::registerValueBoundsOpInterfaceExternalModels(registry); -#endif // MLIR_TRT_ENABLE_EXECUTOR -} - -} // namespace mlir - -#endif // MLIR_TENSORRT_REGISTRATION_REGISTERMLIRTENSORRTDIALECTS_H diff --git a/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/CMakeLists.txt b/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/CMakeLists.txt index 3035fcc58..c03ffe79b 100644 --- a/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/CMakeLists.txt +++ b/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/CMakeLists.txt @@ -1,3 +1,5 @@ +get_property(MLIR_TENSORRT_LIBS GLOBAL PROPERTY MLIR_TENSORRT_LIBS) + add_mlir_tensorrt_public_c_api_library(MLIRTensorRTCAPIRegisterAllDialects RegisterAllDialects.cpp @@ -17,4 +19,5 @@ add_mlir_tensorrt_public_c_api_library(MLIRTensorRTCAPIRegisterAllDialects MLIRTensorRTTensorRTRuntimeTransforms MLIRTensorRTTensorRTToEmitC MLIRTransformDialect + ${MLIR_TENSORRT_LIBS} ) diff --git a/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/RegisterAllDialects.cpp b/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/RegisterAllDialects.cpp index 6db864b23..9608ca4e0 100644 --- a/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/RegisterAllDialects.cpp +++ b/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/RegisterAllDialects.cpp @@ -25,19 +25,17 @@ #include "mlir-tensorrt-c/Compiler/Registration/RegisterAllDialects.h" #include "mlir-tensorrt/Compiler/StablehloToExecutable/StablehloToExecutable.h" #include "mlir-tensorrt/Compiler/TensorRTToExecutable/TensorRTToExecutable.h" -#include "mlir-tensorrt/Registration/InitLLVMExtensions.h" -#include "mlir-tensorrt/Registration/RegisterMlirTensorRtDialects.h" 
-#include "mlir-tensorrt/Registration/RegisterMlirTensorRtPasses.h" +#include "mlir-tensorrt/InitAllDialects.h" +#include "mlir-tensorrt/InitAllExtensions.h" +#include "mlir-tensorrt/InitAllPasses.h" #include "mlir/CAPI/IR.h" void mtrtCompilerRegisterDialects(MlirDialectRegistry registry) { - mlir::registerAllMlirTensorRtDialects(*unwrap(registry)); - mlirtrt::registerConvertToLLVMExtensions(*unwrap(registry)); + mlirtrt::compiler::registerAllDialects(*unwrap(registry)); + mlirtrt::compiler::registerAllExtensions(*unwrap(registry)); } -void mtrtCompilerRegisterPasses() { - mlir::tensorrt::registerAllMlirTensorRtPasses(); -} +void mtrtCompilerRegisterPasses() { mlirtrt::compiler::registerAllPasses(); } void mtrtCompilerRegisterTasks() { mlirtrt::compiler::registerStableHloToExecutableTask(); diff --git a/mlir-tensorrt/compiler/lib/Compiler/OptionsProviders.cpp b/mlir-tensorrt/compiler/lib/Compiler/OptionsProviders.cpp index 734367408..497970259 100644 --- a/mlir-tensorrt/compiler/lib/Compiler/OptionsProviders.cpp +++ b/mlir-tensorrt/compiler/lib/Compiler/OptionsProviders.cpp @@ -84,16 +84,23 @@ void DebugOptions::applyToPassManager(PassManager &pm) const { if (!shouldPrintBeforePass && !shouldPrintAfterPass) return; + OpPrintingFlags printFlags{}; + if (this->elideElementsAttrIfLarger > 0) + printFlags.elideLargeElementsAttrs(this->elideElementsAttrIfLarger); + if (this->elideResourceStringsIfLarger > 0) + printFlags.elideLargeResourceString(this->elideResourceStringsIfLarger); + // Otherwise, add the IR printing instrumentation. 
if (!printTreeDir.empty()) { pm.enableIRPrintingToFileTree(shouldPrintBeforePass, shouldPrintAfterPass, printModuleScope, printAfterChange, - printAfterFailure, printTreeDir); + printAfterFailure, printTreeDir, printFlags); return; } + pm.enableIRPrinting(shouldPrintBeforePass, shouldPrintAfterPass, printModuleScope, printAfterChange, printAfterFailure, - llvm::errs()); + llvm::errs(), printFlags); } //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/compiler/lib/Conversion/CMakeLists.txt b/mlir-tensorrt/compiler/lib/Conversion/CMakeLists.txt index 62185af6c..8c741f648 100644 --- a/mlir-tensorrt/compiler/lib/Conversion/CMakeLists.txt +++ b/mlir-tensorrt/compiler/lib/Conversion/CMakeLists.txt @@ -7,18 +7,15 @@ if(MLIR_TRT_ENABLE_HLO) add_subdirectory(ChloToStablehloExt) endif() -if(MLIR_TRT_ENABLE_EXECUTOR) - add_subdirectory(MemRefToCUDA) - add_subdirectory(TensorRTToTensorRTRuntime) - add_subdirectory(CUDAToExecutor) - add_subdirectory(PlanToExecutor) - add_subdirectory(TensorRTRuntimeToExecutor) -endif() - +add_subdirectory(CUDAToExecutor) add_subdirectory(CUDAToLLVM) add_subdirectory(HostToEmitC) add_subdirectory(HostToLLVM) add_subdirectory(LLVMCommon) +add_subdirectory(MemRefToCUDA) +add_subdirectory(PlanToExecutor) add_subdirectory(PlanToLLVM) +add_subdirectory(TensorRTRuntimeToExecutor) add_subdirectory(TensorRTRuntimeToLLVM) add_subdirectory(TensorRTToEmitC) +add_subdirectory(TensorRTToTensorRTRuntime) diff --git a/mlir-tensorrt/compiler/lib/Dialect/CMakeLists.txt b/mlir-tensorrt/compiler/lib/Dialect/CMakeLists.txt index c17ac47a5..8f12204e7 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/CMakeLists.txt +++ b/mlir-tensorrt/compiler/lib/Dialect/CMakeLists.txt @@ -1,10 +1,7 @@ +add_subdirectory(CUDA) +add_subdirectory(Plan) add_subdirectory(TensorRTRuntime) -if(MLIR_TRT_ENABLE_EXECUTOR) - add_subdirectory(CUDA) -endif() - if(MLIR_TRT_ENABLE_HLO) - add_subdirectory(Plan) add_subdirectory(StablehloExt) 
endif() diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp index 3675b1534..96098513c 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp +++ b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp @@ -206,23 +206,20 @@ struct RewriteFromElements : public OpRewritePattern { assert(lattice && !lattice->getValue().isUninitialized()); TensorKindInfo placementInfo = lattice->getValue(); - MemorySpace originalMemorySpaceConstraint = MemorySpace::host_pinned; + std::optional originalMemorySpace{}; if (auto constraint = - dyn_cast_or_null(op.getType().getEncoding())) { - // A pre-specified 'device' constraint is not allowed. - if (constraint.getValue() != MemorySpace::host && - constraint.getValue() != MemorySpace::host_pinned) - return failure(); - originalMemorySpaceConstraint = constraint.getValue(); - } + dyn_cast_or_null(op.getType().getEncoding())) + originalMemorySpace = constraint.getValue(); // Create a host allocation and insert the elements. 
+ MemorySpace memorySpace = MemorySpace::host_pinned; Value hostReplacement = createTensorFromElements( - rewriter, op.getLoc(), op.getType(), op.getElements(), - originalMemorySpaceConstraint); + rewriter, op.getLoc(), op.getType(), op.getElements(), memorySpace); Value hostReplacementCasted = rewriter.create(loc, originalType, hostReplacement); - if (placementInfo.isHostOnly()) { + bool canOptimizeHostReplacement = + !originalMemorySpace || (*originalMemorySpace == memorySpace); + if (placementInfo.isHostOnly() && canOptimizeHostReplacement) { rewriter.replaceOp(op, hostReplacementCasted); return success(); } @@ -242,14 +239,17 @@ struct RewriteFromElements : public OpRewritePattern { .getResult(); devReplacement = rewriter.create(loc, originalType, devReplacement); - rewriter.replaceOpUsesWithIf( - op, hostReplacementCasted, [&](OpOperand &use) { - return TensorKindAnalysis::getStaticOperandTensorKind(use) == - TensorKind::Host; - }); + + if (canOptimizeHostReplacement) + rewriter.replaceOpUsesWithIf( + op, hostReplacementCasted, [&](OpOperand &use) { + return TensorKindAnalysis::getStaticOperandTensorKind(use) == + TensorKind::Host; + }); rewriter.replaceOpUsesWithIf(op, devReplacement, [&](OpOperand &use) { - return TensorKindAnalysis::getStaticOperandTensorKind(use) != - TensorKind::Host; + return !canOptimizeHostReplacement || + TensorKindAnalysis::getStaticOperandTensorKind(use) != + TensorKind::Host; }); return success(); } @@ -810,6 +810,7 @@ static LogicalResult rewriteFuncToDestinationPassingStyle( // value. 
bufferization::TraversalConfig config; config.followEquivalentOnly = true; + config.followInPlaceOnly = true; config.alwaysIncludeLeaves = true; SetVector equivalentValues = state.findValueInReverseUseDefChain( &v, /*condition=*/ @@ -962,6 +963,10 @@ static LogicalResult enforceFunctionCallingStylePolicy( return failure(); for (func::FuncOp func : orderedFuncOps) { + LLVM_DEBUG(DBGS() << "encountered func " << func.getName() << "\n"); + if (func.isDeclaration()) + continue; + // All functions should be single-block at this point. if (func.getBlocks().size() != 1) return failure(); diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AssignMemorySpaces.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AssignMemorySpaces.cpp new file mode 100644 index 000000000..b20e39c77 --- /dev/null +++ b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AssignMemorySpaces.cpp @@ -0,0 +1,399 @@ +//===- AssignMemorySpaces.cpp ---------------------------------------------===// +// +// SPDX-FileCopyrightText: Copyright 2024-2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// Implementation of the `plan-assign-memory-spaces` pass. 
+/// +//===----------------------------------------------------------------------===// +#include "mlir-tensorrt-dialect/Analysis/TensorKindAnalysis.h" +#include "mlir-tensorrt/Dialect/Plan/IR/Plan.h" +#include "mlir-tensorrt/Dialect/Plan/Transforms/Passes.h" +#include "mlir-tensorrt/Utils/ModuleUtils.h" +#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h" +#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Patterns.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/DialectResourceBlobManager.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "plan-assign-memory-spaces" + +namespace mlir::plan { +#define GEN_PASS_DEF_PLANASSIGNMEMORYSPACESPASS +#include "mlir-tensorrt/Dialect/Plan/Transforms/Passes.h.inc" +} // namespace mlir::plan + +using namespace mlir; +using namespace mlir::plan; + +namespace { + +// Generic pattern that rewrites any op by rewriting its operands and result +// types. Regions are also rewritten. 
+class GenericConvertSpace : public ConversionPattern { +public: + GenericConvertSpace(TypeConverter &typeConverter, MLIRContext *context) + : ConversionPattern(typeConverter, MatchAnyOpTypeTag{}, 0, context) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + SmallVector resultTypes; + if (failed(typeConverter->convertTypes(op->getResultTypes(), resultTypes))) + return failure(); + + auto *newOp = Operation::create( + op->getLoc(), op->getName(), resultTypes, operands, op->getAttrs(), + op->getPropertiesStorage(), op->getSuccessors(), op->getNumRegions()); + for (auto regions : llvm::zip(op->getRegions(), newOp->getRegions())) { + Region &before = std::get<0>(regions); + Region &parent = std::get<1>(regions); + rewriter.inlineRegionBefore(before, parent, parent.end()); + if (failed(rewriter.convertRegionTypes(&parent, *typeConverter))) + return failure(); + } + rewriter.insert(newOp); + rewriter.replaceOp(op, newOp->getResults()); + return success(); + } +}; + +// A pattern that converts the type of the attribute used as an operand for +// arith.constant +class ConvertConstantPattern : public OpConversionPattern { +public: + ConvertConstantPattern(TypeConverter &typeConverter, MLIRContext *context) + : OpConversionPattern(typeConverter, context) {} + + LogicalResult + matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto newType = dyn_cast_if_present( + typeConverter->convertType(op.getType())); + if (!newType) + return failure(); + + ElementsAttr newAttr{}; + if (auto elementsAttr = dyn_cast(op.getValue())) + newAttr = elementsAttr.reshape(newType); + if (auto resourceAttr = + dyn_cast(op.getValue())) { + DenseResourceElementsHandle handle = resourceAttr.getRawHandle(); + newAttr = DenseResourceElementsAttr::get(newType, handle); + } + if (!newAttr) + return failure(); + rewriter.replaceOpWithNewOp(op, newAttr); + return 
success(); + } +}; +} // namespace + +/// Return true if the op is likely in a compute region, like the region of +/// `stablehlo.reduce` or `linalg.generic`. +static bool inComputeRegion(Operation *op) { + Operation *parent = op->getParentOp(); + while (parent) { + if (isa(parent)) + return false; + if (!isa(parent)) + return true; + parent = parent->getParentOp(); + } + return false; +} + +namespace { +/// Use an explicit 'host_pinned' staging tensor to materialie the +/// 'from_elements' before creating explicitly moving it to the 'device' space. +/// Other optimization patterns below help avoid the host-device transfer when +/// possible. +struct FixUpFromElements : public OpRewritePattern { + FixUpFromElements(MLIRContext *ctx, const DataFlowSolver &solver, + PatternBenefit benefit = 1) + : OpRewritePattern(ctx, benefit), solver(solver) {} + + LogicalResult matchAndRewrite(tensor::FromElementsOp op, + PatternRewriter &rewriter) const override { + auto space = dyn_cast_or_null(op.getType().getEncoding()); + if (!space) + return failure(); + if (space.getValue() != plan::MemorySpace::device) + return failure(); + + const TensorKindLattice *lattice = + solver.lookupState(op.getResult()); + if (!lattice || lattice->getValue().isUninitialized() || + !lattice->getValue().isHostVisible()) + return failure(); + + RankedTensorType originalType = op.getType(); + RankedTensorType newType = RankedTensorType::get( + originalType.getShape(), originalType.getElementType(), + MemorySpaceAttr::get(originalType.getContext(), + plan::MemorySpace::host_pinned)); + auto newOp = rewriter.create(op.getLoc(), newType, + op.getElements()); + Value deviceTensor = rewriter.create( + op.getLoc(), originalType.getShape(), originalType.getElementType(), + originalType.getEncoding()); + Value rematDevReplacement = + rewriter + .create( + op.getLoc(), originalType, newOp.getResult(), deviceTensor) + .getResult(); + rewriter.replaceOp(op, rematDevReplacement); + return success(); + } + + 
const DataFlowSolver &solver; +}; + +static bool isHostVisible(TypedValue v) { + auto space = dyn_cast_or_null(v.getType().getEncoding()); + if (!space) + return false; + switch (space.getValue()) { + case plan::MemorySpace::host: + case plan::MemorySpace::host_pinned: + case plan::MemorySpace::unified: + return true; + default: + return false; + } +} + +/// For any 'shape' parameter of a 'tensor.reshape', get the shape by skipping +/// past any unnecessary explicit host-device transfers. +struct ReshapeAbsorbDeviceCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(tensor::ReshapeOp op, + PatternRewriter &rewriter) const override { + if (isHostVisible(op.getShape())) + return failure(); + auto matOp = + op.getShape() + .getDefiningOp(); + if (!matOp) + return failure(); + auto source = dyn_cast>(matOp.getSource()); + if (!source || !isHostVisible(source)) + return failure(); + rewriter.modifyOpInPlace(op, + [&]() { op.getShapeMutable().assign(source); }); + return success(); + } +}; + +/// Rewrite `memref.load` that acts on device memory to first copy the buffer to +/// the host and load from the host buffer. 
+struct TensorDeviceExtractRewriter + : public OpRewritePattern { + + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::ExtractOp op, + PatternRewriter &rewriter) const override { + auto source = op.getTensor(); + if (isHostVisible(source)) + return failure(); + + if (inComputeRegion(op)) + return failure(); + + rewriter.setInsertionPointAfterValue(source); + Value hostTensor = rewriter.create( + op.getLoc(), + RankedTensorType::get( + source.getType().getShape(), source.getType().getElementType(), + plan::MemorySpaceAttr::get(op->getContext(), + plan::MemorySpace::host_pinned)), + source); + + rewriter.replaceUsesWithIf(op.getTensor(), hostTensor, [&](OpOperand &use) { + return isa(use.getOwner()); + }); + + return success(); + } +}; + +/// Remap relevant analysis state of type T from `original` to `replacement`. +template +static void remapLatticeState(DataFlowSolver &solver, Value original, + Value replacement) { + if constexpr (!std::is_same_v) { + if (const T *lattice = solver.lookupState(original)) { + T *latticeReplacement = solver.getOrCreateState(replacement); + latticeReplacement->getValue() = lattice->getValue(); + } + } else { + // do nothing for liveness analysis for the moment except create the state + if (const auto *oldState = + solver.lookupState(original)) { + dataflow::Executable *newState = solver.getOrCreateState(replacement); + // Set to live if old state is live. We ignore change status. + if (oldState->isLive()) + (void)newState->setToLive(); + } + } +} + +/// A rewrite listener that transfers replacements to updates to the solver +/// state. 
+class SolverStateListener : public RewriterBase::Listener { +public: + SolverStateListener(DataFlowSolver &solver) + : RewriterBase::Listener(), solver(solver) {} + +private: + void notifyOperationReplaced(Operation *op, + ValueRange replacements) override { + for (auto [original, replacement] : + llvm::zip_equal(op->getResults(), replacements)) { + remapLatticeState(solver, original, replacement); + remapLatticeState>( + solver, original, replacement); + remapLatticeState(solver, original, replacement); + } + solver.eraseState(solver.getProgramPointAfter(op)); + } + void notifyOperationReplaced(Operation *op, Operation *replacement) override { + notifyOperationReplaced(op, replacement->getResults()); + } + + void notifyOperationErased(Operation *op) override { + solver.eraseState(solver.getProgramPointAfter(op)); + for (Value res : op->getResults()) + solver.eraseState(res); + } + + DataFlowSolver &solver; +}; + +} // namespace + +namespace { +struct AssignMemorySpacesPass + : public plan::impl::PlanAssignMemorySpacesPassBase< + AssignMemorySpacesPass> { + void runOnOperation() override { + + MLIRContext *context = &getContext(); + ConversionTarget target(*context); + + TypeConverter converter; + converter.addConversion( + [&](Type type) -> std::optional { return type; }); + + // The default tensor type converter just adds the 'device' memory type + // info. + auto deviceEncoding = + plan::MemorySpaceAttr::get(context, plan::MemorySpace::device); + converter.addConversion([&](RankedTensorType type) -> std::optional { + if (type.getEncoding()) + return type; + return RankedTensorType::get(type.getShape(), type.getElementType(), + deviceEncoding); + }); + + // Ops are legal if they are in a nested module or if their operand and + // result types are legal. 
+ target.markUnknownOpDynamicallyLegal([&](Operation *op) { + if (op->getParentWithTrait() != getOperation()) + return true; + return converter.isLegal(op->getOperandTypes()) && + converter.isLegal(op->getResultTypes()); + }); + target.addDynamicallyLegalOp([&](func::FuncOp op) { + if (op->getParentWithTrait() != getOperation()) + return true; + return converter.isSignatureLegal(op.getFunctionType()); + }); + target.markOpRecursivelyLegal( + [&](func::FuncOp op) -> std::optional { + if (op->getParentWithTrait() != getOperation()) + return true; + return false; + }); + target.addDynamicallyLegalOp([&](arith::ConstantOp op) { + if (op->getParentWithTrait() != getOperation()) + return true; + return converter.isLegal(op.getType()) && + converter.isLegal(op.getValue().getType()); + }); + + RewritePatternSet patterns(&getContext()); + patterns.add(converter, + context); + + // FuncOp is special as it has type encoding via attributes. + populateFunctionOpInterfaceTypeConversionPattern(patterns, + converter); + scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, + target); + + auto module = getOperation(); + if (failed(applyFullConversion(module, target, std::move(patterns)))) { + emitError(module.getLoc(), "failed to assign memory spaces"); + return signalPassFailure(); + } + + // Perform some minor optimizations involving tensor.from_elements. 
+ { + SymbolTableCollection symbolTables; + DataFlowSolver solver(DataFlowConfig().setInterprocedural(false)); + solver.load(); + solver.load(); + solver.load(symbolTables); + + if (failed(solver.initializeAndRun(getOperation()))) { + emitError(getOperation().getLoc()) + << "failed to run TensorKindAnalysis"; + return signalPassFailure(); + } + + SolverStateListener solverAwareListener(solver); + GreedyRewriteConfig config; + config.listener = &solverAwareListener; + FrozenRewritePatternSet patterns = [&]() { + RewritePatternSet patterns_(&getContext()); + patterns_.insert(&getContext(), solver); + patterns_.insert(&getContext()); + patterns_.insert(&getContext()); + return patterns_; + }(); + for (FunctionOpInterface func : + getOperation().getOps()) { + if (failed(applyPatternsGreedily(func, patterns))) { + emitError(func.getLoc()) << "failed to run " << getArgument(); + return signalPassFailure(); + } + } + } + } +}; +} // namespace diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt index a2953e40d..be4d4ffc3 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt +++ b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_tensorrt_library(MLIRTensorRTPlanTransforms AllocTensors.cpp + AssignMemorySpaces.cpp Bufferize.cpp Clustering.cpp CreateClosedRegions.cpp diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/ModuleBufferization/ModuleBufferization.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/ModuleBufferization/ModuleBufferization.cpp index 34ff09d1e..2b19ca948 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/ModuleBufferization/ModuleBufferization.cpp +++ b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/ModuleBufferization/ModuleBufferization.cpp @@ -29,6 +29,7 @@ #include "mlir/Dialect/Bufferization/Transforms/Transforms.h" #include 
"mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/DialectResourceBlobManager.h" #include "mlir/IR/SymbolTable.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" @@ -327,6 +328,36 @@ static LogicalResult insertTensorCopiesInModule( return insertTensorCopiesWithinModuleScope(module, state); } +/// The memref.global operation rejects encodings on the type of the +/// ElementsAttr. Drop them here. +/// TODO: fix upstream bufferization to handle this. +static void fixupMemrefGlobalInitialValueTypes(ModuleLikeOp moduleOp) { + for (memref::GlobalOp global : moduleOp.getOps()) { + ElementsAttr initialValue = + llvm::dyn_cast_or_null(global.getInitialValueAttr()); + if (!initialValue) + continue; + // Drop the encoding if present. + if (auto tensorType = dyn_cast(initialValue.getType())) { + if (auto encoding = tensorType.getEncoding()) { + tensorType = RankedTensorType::get(tensorType.getShape(), + tensorType.getElementType()); + if (auto elementsAttr = dyn_cast(initialValue)) { + initialValue = elementsAttr.reshape(tensorType); + global.setInitialValueAttr(initialValue); + continue; + } + if (auto resourceAttr = + dyn_cast(initialValue)) { + DenseResourceElementsHandle handle = resourceAttr.getRawHandle(); + initialValue = DenseResourceElementsAttr::get(tensorType, handle); + global.setInitialValueAttr(initialValue); + continue; + } + } + } + } +} static LogicalResult bufferizeOneModule(ModuleLikeOp moduleOp, const bufferization::OneShotBufferizationOptions &options, @@ -371,6 +402,10 @@ bufferizeOneModule(ModuleLikeOp moduleOp, return success(); if (failed(bufferizeOneModuleLikeOp(moduleOp, options, statistics))) return failure(); + + // Fixup any globals which have incorect encodings on the initial value type. 
+ fixupMemrefGlobalInitialValueTypes(moduleOp); + return success(); } diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/Passes.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/Passes.cpp index 6efa7f6d7..418833088 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/Passes.cpp +++ b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/Passes.cpp @@ -62,6 +62,7 @@ static void buildPlanOneShotBufferizePipelinePipeline( OpPassManager &pm, const plan::PlanAllocTensorsPassOptions &opts) { pm.addPass(createInlinerPass()); pm.addPass(bufferization::createEmptyTensorEliminationPass()); + pm.addPass(plan::createPlanAssignMemorySpacesPass()); pm.addPass(plan::createPlanAllocTensorsPass(opts)); pm.addPass(plan::createPlanModuleBufferizePass()); pm.addPass(mlir::createMemRefCastEliminationPass()); diff --git a/mlir-tensorrt/compiler/lib/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir-tensorrt/compiler/lib/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.cpp index 4adb541ee..c1a2d10d7 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir-tensorrt/compiler/lib/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.cpp @@ -85,7 +85,7 @@ struct EnqueueOpInterface bool bufferizesToElementwiseAccess(Operation *op, const bufferization::AnalysisState &state, ArrayRef opOperands) const { - return true; + return false; } /// Bufferize the `trtrt.enqueue` operation. 
diff --git a/mlir-tensorrt/compiler/test/CMakeLists.txt b/mlir-tensorrt/compiler/test/CMakeLists.txt index 4f3c7294d..b0ec5e0ce 100644 --- a/mlir-tensorrt/compiler/test/CMakeLists.txt +++ b/mlir-tensorrt/compiler/test/CMakeLists.txt @@ -22,6 +22,7 @@ configure_lit_site_cfg( set(MLIR_TENSORRT_TEST_DEPENDS_ FileCheck count not mlir-tensorrt-opt + mlir-tensorrt-runner mlir-tensorrt-translate ${MLIR_TENSORRT_TEST_LIBS} ) @@ -32,10 +33,6 @@ if(MLIR_TRT_TARGET_TENSORRT AND TensorRTTestPlugins) endif() -if(MLIR_TRT_TARGET_LUA) - list(APPEND MLIR_TENSORRT_TEST_DEPENDS_ mlir-tensorrt-runner) -endif() - if(MLIR_TRT_ENABLE_PYTHON) list(APPEND MLIR_TENSORRT_TEST_DEPENDS_ diff --git a/mlir-tensorrt/compiler/test/Dialect/Plan/assign-memory-spaces.mlir b/mlir-tensorrt/compiler/test/Dialect/Plan/assign-memory-spaces.mlir new file mode 100644 index 000000000..f87590198 --- /dev/null +++ b/mlir-tensorrt/compiler/test/Dialect/Plan/assign-memory-spaces.mlir @@ -0,0 +1,73 @@ +// RUN: mlir-tensorrt-opt %s -split-input-file --plan-assign-memory-spaces -canonicalize | FileCheck %s + + +func.func private @cond() -> i1 + +// CHECK-LABEL: func.func @scf_while_loop_2 +// CHECK: scf.while {{.*}}tensor<1xf32, #plan.memory_space>) -> tensor<1xf32, #plan.memory_space> +func.func @scf_while_loop_2(%arg0: f32) -> f32 { + %c0 = arith.constant 0 : index + %1 = tensor.from_elements %arg0 : tensor<1xf32> + %2 = scf.while (%arg1 = %1) : (tensor<1xf32>) -> tensor<1xf32> { + %cond = func.call @cond() : () -> i1 + %e = tensor.extract %arg1[%c0] : tensor<1xf32> + %f = arith.addf %e, %e : f32 + %3 = tensor.from_elements %f : tensor<1xf32> + scf.condition(%cond) %3 : tensor<1xf32> + } do { + ^bb0(%arg1: tensor<1xf32>): + %extract = tensor.extract %arg1[%c0] : tensor<1xf32> + %3 = arith.addf %extract, %extract : f32 + %4 = tensor.from_elements %3 : tensor<1xf32> + scf.yield %4 : tensor<1xf32> + } + %3 = tensor.extract %2[%c0] : tensor<1xf32> + return %3 : f32 +} + +// ----- + +// CHECK-LABEL: func.func 
@arith_constant +// CHECK: arith.constant {{.*}} : tensor<2xf32, #plan.memory_space> +// CHECK: arith.constant {{.*}} : tensor<2xf32, #plan.memory_space> +func.func @arith_constant() -> (tensor<2xf32>, tensor<2xf32>) { + %0 = arith.constant dense<[0.1, 0.2]> : tensor<2xf32> + %1 = arith.constant dense_resource<__elided__> : tensor<2xf32> + return %0, %1 : tensor<2xf32>, tensor<2xf32> +} + +// ----- + +// CHECK-LABEL: module @nested_module +// CHECK-NOT: #plan.memory_space +module @outer { +module @nested_module { + func.func @nested_func() -> tensor<2xf32> { + %0 = arith.constant dense<[0.1, 0.2]> : tensor<2xf32> + return %0 : tensor<2xf32> + } +} +} + +// ----- + +// CHECK-LABEL: func.func @existing_constraint_1 +// CHECK: tensor.extract {{.*}} +func.func @existing_constraint_1(%arg0: tensor<2xf32, #plan.memory_space>) -> f32 { + %c0 = arith.constant 0 : index + %0 = tensor.extract %arg0[%c0] : tensor<2xf32, #plan.memory_space> + return %0 : f32 +} + +// ----- + +// CHECK-LABEL: func.func @existing_constraint_2 +// CHECK-NOT: tensor.cast +// CHECK: tensor.extract {{.*}} +func.func @existing_constraint_2(%arg0: tensor<2xf32, #plan.memory_space>) -> f32 { + %c0 = arith.constant 0 : index + %1 = tensor.cast %arg0 : tensor<2xf32, #plan.memory_space> to tensor<2xf32> + %0 = tensor.extract %1[%c0] : tensor<2xf32> + return %0 : f32 +} + diff --git a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-alloc-tensors.mlir b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-alloc-tensors.mlir index f3d137e88..e70fc0011 100644 --- a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-alloc-tensors.mlir +++ b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-alloc-tensors.mlir @@ -280,11 +280,11 @@ func.func @test_dps_chain_repeat(%arg0: tensor<10xf32>) -> (tensor<10xf32>, tens // CHECK-LABEL: @test_dps_chain_repeat // CHECK-SAME: (%[[arg0:.+]]: tensor<10xf32>, %[[arg1:.+]]: tensor<10xf32> {plan.result_arg}, %[[arg2:.+]]: tensor<10xf32> {plan.result_arg}, %[[arg3:.+]]: tensor<10xf32> 
{plan.result_arg}) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK-NOT: bufferization.alloc_tensor() -// CHECK: %[[v0:.+]] = linalg.generic {{.*}} ins(%[[arg0]] : tensor<10xf32>) outs(%[[arg1]] : tensor<10xf32>) +// CHECK: %[[v0:.+]] = linalg.generic {{.*}} ins(%[[arg0]] : tensor<10xf32>) outs(%[[arg2]] : tensor<10xf32>) // CHECK: %[[v1:.+]] = linalg.generic {{.*}} ins(%[[arg0]] : tensor<10xf32>) outs(%[[v0]] : tensor<10xf32>) -// CHECK: %[[v2:.+]] = bufferization.materialize_in_destination %[[v0]] in %[[arg2]] : +// CHECK: %[[v2:.+]] = bufferization.materialize_in_destination %[[v1]] in %[[arg1]] : // CHECK-NEXT: %[[v3:.+]] = bufferization.materialize_in_destination %[[v0]] in %[[arg3]] : -// CHECK-NEXT: return %[[v1]], %[[v2]], %[[v3]] : tensor<10xf32>, tensor<10xf32>, tensor<10xf32> +// CHECK-NEXT: return %[[v2]], %[[v0]], %[[v3]] : tensor<10xf32>, tensor<10xf32>, tensor<10xf32> // CHECK-ALLOC-LABEL: @test_dps_chain_repeat // CHECK-ALLOC-SAME: (%[[arg0:.+]]: tensor<10xf32>) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) @@ -767,7 +767,7 @@ func.func @test_dps_complex_reshape_collapse_equivalent( // CHECK-LABEL: func.func @test_dps_complex_reshape_collapse_equivalent // CHECK-SAME: (%[[arg0:.+]]: tensor<2x3xcomplex, #plan.memory_space>, %[[arg1:.+]]: tensor<2x3xcomplex, #plan.memory_space>, %[[arg2:.+]]: tensor<6xcomplex, #plan.memory_space> {plan.result_arg}) // CHECK-DAG: %[[expanded:.+]] = tensor.expand_shape %[[arg2]] {{\[}}[0, 1]] output_shape [2, 3] : -// CHECK-DAG: %[[mapped:.+]] = linalg.map { complex.add } ins(%[[arg0]], %[[arg1]] : +// CHECK-DAG: %[[mapped:.+]] = linalg.map { complex.add } ins(%[[arg0]], %[[arg1]] : // CHECK-DAG: %[[collapsed:.+]] = tensor.collapse_shape %[[mapped]] // CHECK-DAG: return %[[collapsed]] @@ -847,8 +847,8 @@ func.func @test_dps_bitcast_not_equivalent( // CHECK-LABEL: func.func @test_dps_bitcast_not_equivalent // CHECK-SAME: (%[[arg0:.+]]: tensor<2xi32, #plan.memory_space>, %[[arg1:.+]]: tensor<2xi32, 
#plan.memory_space>, %[[arg2:.+]]: tensor<2xf32, #plan.memory_space> {plan.result_arg}) -// CHECK-DAG: %[[v0:.+]] = bufferization.alloc_tensor() +// CHECK-DAG: %[[v0:.+]] = bufferization.alloc_tensor() // CHECK-DAG: %[[mapped:.+]] = linalg.map {{.*}} ins(%[[arg0]], %[[arg1]] : {{.*}}) outs(%[[v0]] : {{.*}}) -// CHECK-DAG: %[[v1:.+]] = tensor.bitcast %[[mapped]] +// CHECK-DAG: %[[v1:.+]] = tensor.bitcast %[[mapped]] // CHECK-DAG: %[[v2:.+]] = bufferization.materialize_in_destination %[[v1]] in %[[arg2]] // CHECK-DAG: return %[[v2]] diff --git a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize-pipeline.mlir b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize-pipeline.mlir index c206b6c71..baf0a8c8f 100644 --- a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize-pipeline.mlir +++ b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize-pipeline.mlir @@ -83,3 +83,47 @@ func.func @small_host_and_device_tensor_constant(%arg0: tensor) -> (ten // CHECK: memref.dealloc %[[alloc]] : memref<4xindex, #plan.memory_space> // CHECK: memref.dealloc %[[alloc_0]] : memref<4xindex, #plan.memory_space> // CHECK: return + +// ----- + +func.func private @cond() -> i1 + +// The test case illustrates a while loop that for whatever reason may not +// have been "detensorized" earlier in the pipeline. The TensorKindAnalysis +// will show that all tensors are "host-only", but currently bufferization +// does not deduce this via its memory space inference logic. Therefore, the +// loop will be bufferized so that the buffers are in the device +// space at branch points, which means lots of copies are inserted. Before +// adding the 'plan-assign-memory-spaces' pass, we would get a failure here +// due to mixed types of init arg and yielded value inferred by bufferization. +// In the future, we can optimize this case by adding support for rewriting +// the encoding attribute of loop-carried tensors to be host for this case. 
+ +func.func @while_loop_host_tensor_carried(%arg0: f32) -> f32 { + %c0 = arith.constant 0 : index + %1 = tensor.from_elements %arg0 : tensor<1xf32> + %2 = scf.while (%arg1 = %1) : (tensor<1xf32>) -> tensor<1xf32> { + %cond = func.call @cond() : () -> i1 + %e = tensor.extract %arg1[%c0] : tensor<1xf32> + %f = arith.addf %e, %e : f32 + %3 = tensor.from_elements %f : tensor<1xf32> + scf.condition(%cond) %3 : tensor<1xf32> + } do { + ^bb0(%arg1: tensor<1xf32>): + %extract = tensor.extract %arg1[%c0] : tensor<1xf32> + %3 = arith.addf %extract, %extract : f32 + %4 = tensor.from_elements %3 : tensor<1xf32> + scf.yield %4 : tensor<1xf32> + } + %3 = tensor.extract %2[%c0] : tensor<1xf32> + return %3 : f32 +} + +// CHECK-LABEL: func.func @while_loop_host_tensor_carried +// CHECK: scf.while : () -> () +// CHECK-COUNT-2: memref.copy +// CHECK: scf.condition +// CHECK-COUNT-2: memref.copy +// CHECK: scf.yield +// CHECK-COUNT-1: memref.copy +// CHECK-NOT: memref.copy diff --git a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize.mlir b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize.mlir index c44b7a68b..f623b7cba 100644 --- a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize.mlir +++ b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize.mlir @@ -43,28 +43,39 @@ func.func @main(%arg0: tensor<10xf32>) -> tensor<1xf32> { // CHECK: memref.global "private" constant @__constant_1xf32 // CHECK: memref.global "private" constant @__constant_1xi32 -// CHECK-LABEL: @main -// CHECK-SAME: (%[[arg0:.+]]: memref<10xf32, #plan.memory_space>) -> memref<1xf32, #plan.memory_space> -// CHECK: %[[v0:.+]] = memref.get_global @__constant_1xi32 -// CHECK: %[[v1:.+]] = memref.get_global @__constant_1xf32 -// CHECK: %[[v2:.+]] = cuda.stream.create : !cuda.stream -// CHECK: %[[alloc:.+]] = memref.alloc() -// CHECK: %[[v3:.+]] = trtrt.get_function @trt_while_loop_region -// CHECK: %[[v4:.+]] = trtrt.get_function @trt_while_loop_region_0 -// CHECK: %[[alloc_0:.+]] = memref.alloc() 
-// CHECK: memref.copy %[[v0]], %[[alloc_0]] -// CHECK: %[[alloc_1:.+]] = memref.alloc() -// CHECK: memref.copy %[[v1]], %[[alloc_1]] -// CHECK: %[[v5:.+]]:2 = scf.while (%[[arg1:.+]] = %[[alloc_0]], %[[arg2:.+]] = %[[alloc_1]]) -// CHECK: trtrt.enqueue %[[v3]] stream(%[[v2]]) (%[[arg1]]) outs(%[[alloc]]) -// CHECK: %[[alloc_2:.+]] = memref.alloc() {{.*}} : memref> -// CHECK: memref.copy %[[alloc]], %[[alloc_2]] : memref> to memref> -// CHECK: %[[v6:.+]] = memref.load %[[alloc_2]][] : memref> -// CHECK: scf.condition(%[[v6]]) %[[arg1]], %[[arg2]] +// CHECK-LABEL: func.func @main +// CHECK-SAME: (%[[arg0:.+]]: memref<10xf32, #plan.memory_space>) +// CHECK-DAG: %[[v0:.+]] = memref.get_global @__constant_1xi32 : memref<1xi32, #plan.memory_space> +// CHECK-DAG: %[[v1:.+]] = memref.get_global @__constant_1xf32 : memref<1xf32, #plan.memory_space> +// CHECK-DAG: %[[v2:.+]] = cuda.stream.create : !cuda.stream +// CHECK-DAG: %[[alloc:.+]] = memref.alloc() {alignment = 16 : i64} : memref> +// CHECK-DAG: %[[v3:.+]] = trtrt.get_function @trt_while_loop_region : !trtrt.context +// CHECK-DAG: %[[v4:.+]] = trtrt.get_function @trt_while_loop_region_0 : !trtrt.context +// CHECK: %[[alloc_0:.+]] = memref.alloc() +// CHECK-DAG: memref.copy %[[v0]], %[[alloc_0]] +// CHECK-DAG: %[[alloc_1:.+]] = memref.alloc() +// CHECK-DAG: memref.copy %[[v1]], %[[alloc_1]] +// CHECK: %[[v5:.+]]:2 = scf.while (%[[arg1:.+]] = %[[alloc_0]], %[[arg2:.+]] = %[[alloc_1]]) : ({{.*}}) -> +// CHECK-DAG: trtrt.enqueue %[[v3]] stream(%[[v2]]) (%[[arg1]]) outs(%[[alloc]]) +// CHECK-DAG: %[[alloc_2:.+]] = memref.alloc() +// CHECK-DAG: memref.copy %[[alloc]], %[[alloc_2]] +// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[v6:.+]] = memref.load %[[alloc_2]][] +// CHECK-DAG: %[[alloc_3:.+]] = memref.alloc() +// CHECK-DAG: memref.copy %[[arg1]], %[[alloc_3]] +// CHECK-DAG: %[[alloc_4:.+]] = memref.alloc() +// CHECK-DAG: memref.copy %[[alloc_3]], %[[alloc_4]] +// CHECK-DAG: %[[alloc_5:.+]] = 
memref.alloc() +// CHECK-DAG: memref.copy %[[arg2]], %[[alloc_5]] +// CHECK-DAG: %[[alloc_6:.+]] = memref.alloc() {alignment = 16 : i64} : memref<1xf32, #plan.memory_space> +// CHECK-DAG: memref.copy %[[alloc_5]], %[[alloc_6]] : +// CHECK: scf.condition(%[[v6]]) %[[alloc_4]], %[[alloc_6]] : // CHECK: } do { -// CHECK: ^bb0(%[[arg1:.+]]: memref<1xi32, #plan.memory_space>, %[[arg2:.+]]: memref<1xf32, #plan.memory_space>): -// CHECK: trtrt.enqueue %[[v4]] stream(%[[v2]]) (%[[arg0]], %[[arg1]], %[[arg2]]) outs(%[[arg1]], %[[arg2]]) -// CHECK: scf.yield %[[arg1]], %[[arg2]] +// CHECK: ^bb0(%[[arg1:.+]]: memref<1xi32, #plan.memory_space>, %[[arg2:.+]]: memref< +// CHECK-DAG: %[[alloc_2:.+]] = memref.alloc() +// CHECK-DAG: %[[alloc_3:.+]] = memref.alloc() +// CHECK-DAG: trtrt.enqueue %[[v4]] stream(%[[v2]]) (%[[arg0]], %[[arg1]], %[[arg2]]) outs(%[[alloc_2]], %[[alloc_3]]) : +// CHECK: scf.yield %[[alloc_2]], %[[alloc_3]] : // CHECK: cuda.stream.sync %[[v2]] // CHECK: return %[[v5]]#1 diff --git a/mlir-tensorrt/compiler/test/Dialect/TensorRTRuntime/one-shot-bufferize.mlir b/mlir-tensorrt/compiler/test/Dialect/TensorRTRuntime/one-shot-bufferize.mlir index d5e579d48..13f783fa5 100644 --- a/mlir-tensorrt/compiler/test/Dialect/TensorRTRuntime/one-shot-bufferize.mlir +++ b/mlir-tensorrt/compiler/test/Dialect/TensorRTRuntime/one-shot-bufferize.mlir @@ -15,6 +15,22 @@ func.func @enqueue_simple( // ----- +func.func @enqueue_alias( + %ctx: !trtrt.context, %stream: !cuda.stream, + %arg0: tensor<1x3x256x256xf32>) -> tensor<1x3x256x256xf32> { + %0 = tensor.empty() : tensor<1x3x256x256xf32> + %3 = trtrt.enqueue %ctx stream(%stream) (%arg0) outs(%arg0) : (tensor<1x3x256x256xf32>) -> tensor<1x3x256x256xf32> + return %3 : tensor<1x3x256x256xf32> +} + +// CHECK-LABEL: func.func @enqueue_alias +// CHECK-SAME: (%[[arg0:.+]]: !trtrt.context, %[[arg1:.+]]: !cuda.stream, %[[arg2:.+]]: memref< +// CHECK: %[[alloc:.+]] = memref.alloc() +// CHECK: trtrt.enqueue %[[arg0]] stream(%[[arg1]]) 
(%[[arg2]]) outs(%[[alloc]]) +// CHECK: return %[[alloc]] : + +// ----- + func.func @enqueue_host_tensors_space_check( %ctx: !trtrt.context, %stream: !cuda.stream, %arg0: tensor<4xi32>, %arg1: tensor<128xf32>) -> tensor<128xf32> { diff --git a/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/lit.local.cfg b/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/lit.local.cfg index 79a2f2478..6d34f316c 100644 --- a/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/lit.local.cfg +++ b/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/lit.local.cfg @@ -1,7 +1,4 @@ if not config.target_tensorrt: config.unsupported = True -if not config.target_lua: - config.unsupported = True if not "host-has-at-least-1-gpus" in config.available_features: config.unsupported = True -config.parallelism_group = "non-collective" diff --git a/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/lit.local.cfg b/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/lit.local.cfg deleted file mode 100644 index 8e5a835a9..000000000 --- a/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -if (not config.target_lua or - not config.target_lua): - config.unsupported = True diff --git a/mlir-tensorrt/compiler/test/Target/Lua/lit.local.cfg b/mlir-tensorrt/compiler/test/Target/Lua/lit.local.cfg deleted file mode 100644 index feaf6adf8..000000000 --- a/mlir-tensorrt/compiler/test/Target/Lua/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not config.target_lua: - config.unsupported = True diff --git a/mlir-tensorrt/compiler/test/lit.cfg.py b/mlir-tensorrt/compiler/test/lit.cfg.py index f94a170c1..56504d3c9 100644 --- a/mlir-tensorrt/compiler/test/lit.cfg.py +++ b/mlir-tensorrt/compiler/test/lit.cfg.py @@ -6,12 +6,13 @@ import sys from pathlib import Path -import lit.formats -import lit.util +from lit.LitConfig import LitConfig from lit.llvm 
import llvm_config from lit.llvm.subst import ToolSubst -from lit.LitConfig import LitConfig from lit.TestingConfig import TestingConfig +import lit.formats +import lit.util +import psutil config: TestingConfig = config # type: ignore lit_config: LitConfig = lit_config # type: ignore @@ -46,10 +47,21 @@ def load_gpu_tools_module(): gpu_tools = load_gpu_tools_module() -def estimate_paralllelism(mem_required: float) -> int: +def estimate_paralllelism( + gb_gpu_mem_required: float, gb_sys_mem_required: float +) -> int: try: + parallelism = 2 with gpu_tools.nvml_context() as devices: - return gpu_tools.estimate_parallelism_from_memory(devices, mem_required) + parallelism = gpu_tools.estimate_parallelism_from_memory( + devices, gb_gpu_mem_required + ) + return int( + min( + parallelism, + (psutil.virtual_memory().available / (1024**3)) // gb_sys_mem_required, + ) + ) except: return 2 @@ -69,11 +81,17 @@ def estimate_paralllelism(mem_required: float) -> int: ) config.substitutions.append(("%trt_lib_dir", config.tensorrt_lib_dir)) -# Setup the parallelism groups. -lit_config.parallelism_groups["non-collective"] = estimate_paralllelism(2.0) -lit_config.parallelism_groups["collective"] = 1 -lit_config.parallelism_groups["models"] = estimate_paralllelism(8.0) -lit_config.parallelism_group = None +# Setup the parallelism groups. Note that just instantiating the TRT builder +# requires ~2.5 GB of system memory, so we use 3.0 as a baseline limit. 
+lit_config.parallelism_groups["default"] = estimate_paralllelism( + 2.0, gb_sys_mem_required=3.0 +) +lit_config.parallelism_groups["models"] = estimate_paralllelism( + 8.0, gb_sys_mem_required=4.0 +) +lit_config.parallelism_groups["heavy"] = 1 + +lit_config.parallelism_group = "default" print(f"Parallelism Groups: {lit_config.parallelism_groups}", file=sys.stderr) diff --git a/mlir-tensorrt/compiler/test/lit.site.cfg.py.in b/mlir-tensorrt/compiler/test/lit.site.cfg.py.in index d8a048b00..c8fdafcd9 100644 --- a/mlir-tensorrt/compiler/test/lit.site.cfg.py.in +++ b/mlir-tensorrt/compiler/test/lit.site.cfg.py.in @@ -24,10 +24,7 @@ config.python_executable = "@Python3_EXECUTABLE@" config.enable_hlo = @MLIR_TRT_ENABLE_HLO@ config.target_tensorrt = @MLIR_TRT_TARGET_TENSORRT@ config.mlir_tensorrt_compile_time_version = "@MLIR_TRT_TENSORRT_VERSION@" -config.target_lua = @MLIR_TRT_TARGET_LUA@ config.enable_nccl = @MLIR_TRT_ENABLE_NCCL@ -config.target_lua = @MLIR_TRT_TARGET_LUA@ -config.enable_executor = @MLIR_TRT_ENABLE_EXECUTOR@ config.enable_asan = @ENABLE_ASAN@ config.tensorrt_lib_dir = "@MLIR_TRT_TENSORRT_LIB_DIR@" diff --git a/mlir-tensorrt/compiler/test/python/IntegrationTests/lit.local.cfg b/mlir-tensorrt/compiler/test/python/IntegrationTests/lit.local.cfg index 9e414568f..8e80f44d9 100644 --- a/mlir-tensorrt/compiler/test/python/IntegrationTests/lit.local.cfg +++ b/mlir-tensorrt/compiler/test/python/IntegrationTests/lit.local.cfg @@ -2,3 +2,5 @@ if not config.enable_bindings_python: config.unsupported = True if not "host-has-at-least-1-gpus" in config.available_features: config.unsupported = True + +config.parallelism_group = "heavy" diff --git a/mlir-tensorrt/compiler/test/python/mlir_tensorrt_compiler/compiler_api/test_compiler_debug_dump.py b/mlir-tensorrt/compiler/test/python/mlir_tensorrt_compiler/compiler_api/test_compiler_debug_dump.py index 17bda29d4..26fa48c8b 100644 --- 
a/mlir-tensorrt/compiler/test/python/mlir_tensorrt_compiler/compiler_api/test_compiler_debug_dump.py +++ b/mlir-tensorrt/compiler/test/python/mlir_tensorrt_compiler/compiler_api/test_compiler_debug_dump.py @@ -40,6 +40,8 @@ def compile_asm(ASM): f"--mlir-print-ir-tree-dir={mlir_tree_path.name}", f"--tensorrt-layer-info-dir={trt_path.name}", f"--tensorrt-engines-dir={trt_path.name}", + "--mlir-elide-elementsattrs-if-larger=1024", + "--mlir-elide-resource-strings-if-larger=1024", ], ) task.run(m.operation) diff --git a/mlir-tensorrt/compiler/tools/CMakeLists.txt b/mlir-tensorrt/compiler/tools/CMakeLists.txt index def21c3c3..fe43ec083 100644 --- a/mlir-tensorrt/compiler/tools/CMakeLists.txt +++ b/mlir-tensorrt/compiler/tools/CMakeLists.txt @@ -16,6 +16,4 @@ set(LLVM_LINK_COMPONENTS add_subdirectory(mlir-tensorrt-opt) add_subdirectory(mlir-tensorrt-translate) add_subdirectory(mlir-tensorrt-lsp-server) -if(MLIR_TRT_TARGET_LUA) - add_subdirectory(mlir-tensorrt-runner) -endif() +add_subdirectory(mlir-tensorrt-runner) diff --git a/mlir-tensorrt/compiler/tools/mlir-tensorrt-opt/mlir-tensorrt-opt.cpp b/mlir-tensorrt/compiler/tools/mlir-tensorrt-opt/mlir-tensorrt-opt.cpp index 63597bf24..3fb3a9205 100644 --- a/mlir-tensorrt/compiler/tools/mlir-tensorrt-opt/mlir-tensorrt-opt.cpp +++ b/mlir-tensorrt/compiler/tools/mlir-tensorrt-opt/mlir-tensorrt-opt.cpp @@ -23,9 +23,9 @@ //===----------------------------------------------------------------------===// #include "mlir-tensorrt-dialect/Target/Passes.h" #include "mlir-tensorrt-dialect/Target/TranslateToTensorRT.h" -#include "mlir-tensorrt/Registration/InitLLVMExtensions.h" -#include "mlir-tensorrt/Registration/RegisterMlirTensorRtDialects.h" -#include "mlir-tensorrt/Registration/RegisterMlirTensorRtPasses.h" +#include "mlir-tensorrt/InitAllDialects.h" +#include "mlir-tensorrt/InitAllExtensions.h" +#include "mlir-tensorrt/InitAllPasses.h" #include "mlir/Tools/mlir-opt/MlirOptMain.h" using namespace mlir; @@ -50,16 +50,15 @@ static 
void registerTestPasses() { int main(int argc, char **argv) { mlir::DialectRegistry registry; - mlir::registerAllMlirTensorRtDialects(registry); - - mlirtrt::registerConvertToLLVMExtensions(registry); + mlirtrt::compiler::registerAllDialects(registry); + mlirtrt::compiler::registerAllExtensions(registry); mlir::registerTestTensorRTShapeInferencePass(); #ifdef MLIR_TRT_TARGET_TENSORRT mlir::tensorrt::registerTensorRTTranslationCLOpts(); mlir::tensorrt::registerTensorRTTranslationPasses(); #endif - mlir::tensorrt::registerAllMlirTensorRtPasses(); + mlirtrt::compiler::registerAllPasses(); #ifdef MLIR_TRT_ENABLE_TESTING registerTestPasses(); #endif diff --git a/mlir-tensorrt/compiler/tools/mlir-tensorrt-translate/mlir-tensorrt-translate.cpp b/mlir-tensorrt/compiler/tools/mlir-tensorrt-translate/mlir-tensorrt-translate.cpp index 12cb5c6f0..a4621a0cc 100644 --- a/mlir-tensorrt/compiler/tools/mlir-tensorrt-translate/mlir-tensorrt-translate.cpp +++ b/mlir-tensorrt/compiler/tools/mlir-tensorrt-translate/mlir-tensorrt-translate.cpp @@ -21,6 +21,9 @@ // This file is the entry point for the `mlir-tensorrt-translate` tool. 
// //===----------------------------------------------------------------------===// +#include "mlir-executor/Target/Lua/TranslateToLua.h" +#include "mlir-executor/Target/Lua/TranslateToRuntimeExecutable.h" +#include "mlir-tensorrt/Features.h" #include "mlir/InitAllTranslations.h" #include "mlir/Tools/mlir-translate/MlirTranslateMain.h" @@ -28,23 +31,15 @@ #include "mlir-tensorrt-dialect/Target/TranslateToTensorRT.h" #endif // MLIR_TRT_TARGET_TENSORRT -#ifdef MLIR_TRT_TARGET_LUA -#include "mlir-executor/Target/Lua/TranslateToLua.h" -#include "mlir-executor/Target/Lua/TranslateToRuntimeExecutable.h" -#endif // MLIR_TRT_TARGET_LUA - int main(int argc, char **argv) { mlir::registerToCppTranslation(); - -#ifdef MLIR_TRT_TARGET_TENSORRT - mlir::tensorrt::registerTensorRTTranslationCLOpts(); - mlir::registerToTensorRTTranslation(); -#endif // MLIR_TRT_TARGET_TENSORRT - -#ifdef MLIR_TRT_TARGET_LUA mlir::registerToLuaTranslation(); mlir::registerToRuntimeExecutableTranslation(); -#endif // MLIR_TRT_TARGET_LUA + + IF_MLIR_TRT_TARGET_TENSORRT({ + mlir::tensorrt::registerTensorRTTranslationCLOpts(); + mlir::registerToTensorRTTranslation(); + }); return failed(mlir::mlirTranslateMain(argc, argv, "MLIR-TensorRT Translation Tool")) diff --git a/mlir-tensorrt/executor/CMakeLists.txt b/mlir-tensorrt/executor/CMakeLists.txt index 664ab5199..7a934de5f 100644 --- a/mlir-tensorrt/executor/CMakeLists.txt +++ b/mlir-tensorrt/executor/CMakeLists.txt @@ -15,7 +15,6 @@ mlir_executor_option(MLIR_EXECUTOR_ENABLE_NCCL "Enable use of NCCL in the runtim mlir_executor_option(MLIR_EXECUTOR_ENABLE_MPI "Enable use of MPI in the runtime" ON) mlir_executor_option(MLIR_EXECUTOR_ENABLE_TENSORRT "Enable TensorRT runtime module" ON) mlir_executor_option(MLIR_EXECUTOR_ENABLE_CUDA "Enable CUDA runtime module" ON) -mlir_executor_option(MLIR_EXECUTOR_TARGET_LUA "Enable Lua translations and runtime backend" ON) mlir_executor_option(MLIR_EXECUTOR_ENABLE_GPU_INTEGRATION_TESTS "Enable integration tests that 
require GPU" ON) #------------------------------------------------------------------------------- diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/CMakeLists.txt b/mlir-tensorrt/executor/lib/Runtime/Backend/CMakeLists.txt index b8ba504cd..16c80d4c2 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/CMakeLists.txt +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/CMakeLists.txt @@ -1,5 +1,3 @@ add_subdirectory(Common) add_subdirectory(C) -if(MLIR_EXECUTOR_TARGET_LUA) - add_subdirectory(Lua) -endif() +add_subdirectory(Lua) diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp index dfe6d9360..420e0846a 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp @@ -61,6 +61,7 @@ using namespace mlirtrt::runtime; static constexpr uint64_t kMinConstantBufferByteAlignment = 8; +#ifndef MLIR_EXECUTOR_ENABLE_NCCL /// If the runtime is not built with MLIR_EXECUTOR_ENABLE_NCCL, then this /// function registers default implementations for the required SPMD functions, /// reflecting that the executable is expected to run against a single fixed @@ -76,6 +77,7 @@ static void registerDefaultDeviceDependentMethods(lua_State *state, return deviceIdx; }; } +#endif // MLIR_EXECUTOR_ENABLE_NCCL namespace mlirtrt::runtime { void registerLuaCoreRuntimeExtension(); diff --git a/mlir-tensorrt/executor/lib/Target/Lua/TranslateToLua.cpp b/mlir-tensorrt/executor/lib/Target/Lua/TranslateToLua.cpp index 072b0414f..2918cf11b 100644 --- a/mlir-tensorrt/executor/lib/Target/Lua/TranslateToLua.cpp +++ b/mlir-tensorrt/executor/lib/Target/Lua/TranslateToLua.cpp @@ -842,9 +842,9 @@ LogicalResult LuaEmitter::emitBlock(Block &block, bool isEntryBlock) { // block if they are used outside of the block. 
for (Operation &op : otherBlock) { for (Value result : op.getResults()) { - bool usedOutside = - llvm::any_of(result.getUsers(), [&](Operation *userOp) { - return userOp->getBlock() != &otherBlock; + bool usedOutside = llvm::any_of( + result.getUsers(), [otherBlock = &otherBlock](Operation *userOp) { + return userOp->getBlock() != otherBlock; }); if (usedOutside) { getStream() << "local " << createLocalVariableName(result) diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/CMakeLists.txt b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/CMakeLists.txt index 441579e1b..60510e5d1 100644 --- a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/CMakeLists.txt +++ b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/CMakeLists.txt @@ -1,9 +1,6 @@ include_directories(${MLIR_INCLUDE_DIRS}) set(_TABLEGEN_ARGS -gen-pass-decls -name TensorRTTranslation) set(LLVM_TARGET_DEFINITIONS Passes.td) -if(MLIR_TRT_TARGET_LUA) - list(APPEND _TABLEGEN_ARGS -DMLIR_TRT_TARGET_LUA) -endif() mlir_tablegen(Passes.h.inc ${_TABLEGEN_ARGS}) add_public_tablegen_target(MLIRTensorRTTranslationPassIncGen) diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/TranslateToTensorRT.h b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/TranslateToTensorRT.h index 2d3b1e758..5c41798b5 100644 --- a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/TranslateToTensorRT.h +++ b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/TranslateToTensorRT.h @@ -41,21 +41,6 @@ class Operation; class Pass; namespace tensorrt { -/// A simple logger that implements TensorRT's logging interface. Errors and -/// warnings are reported stderr. If the 'verbose' flags is active, then all -/// messages are printed to stderr. 
-class Logger : public nvinfer1::ILogger { -public: - explicit Logger(bool verbose = false) : verbose(verbose) {} - -protected: - void log(Severity severity, const char *msg) noexcept override; - - /// Print only 'error' and 'warning' messages if false, otehrwise print all - /// messages. - bool verbose; -}; - /// A llvm::cl::opt parser for turning strings like "1024gb" into a number of /// bytes. Allowed suffixes are strings like 'gb', 'GiB', 'kb', 'mb', 'b' (case /// insensitive, we interpret both 'b|B' as meaning "byte"). This example comes @@ -128,10 +113,8 @@ struct TensorRTTranslationOptions { class TensorRTBuilderContext { private: TensorRTBuilderContext(TensorRTVersion version, int32_t cudaDevice, - std::unique_ptr logger, std::unique_ptr builder) - : version(version), cudaDevice(cudaDevice), logger(std::move(logger)), - builder(std::move(builder)) {} + : version(version), cudaDevice(cudaDevice), builder(std::move(builder)) {} public: /// Create a TensorRTBuilderContext from a log configuration and CUDA device @@ -157,14 +140,10 @@ class TensorRTBuilderContext { /// Return which CUDA device the builder is associated with. int32_t getCudaDeviceNumber() const { return cudaDevice; } - /// Return a handle to the logger. - const std::unique_ptr &getLogger() const { return logger; } - private: TensorRTVersion version; /// The CUDA device that this builder is associated with. 
int32_t cudaDevice; - std::unique_ptr logger; std::unique_ptr builder; }; diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Utils/NvInferAdaptor.h b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Utils/NvInferAdaptor.h index 987bca0b1..0b8539f0e 100644 --- a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Utils/NvInferAdaptor.h +++ b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Utils/NvInferAdaptor.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -181,6 +182,7 @@ Weights trtSetWeights(WeightsMap &weightsMap, const char *name, const std::vector &w) { weightsMap[name] = std::vector(w.size() * sizeof(T)); std::vector &data = weightsMap[name]; + std::memcpy(data.data(), w.data(), w.size()); DataType dt = DataType::kFLOAT; if (std::is_same::value) { dt = DataType::kFLOAT; diff --git a/mlir-tensorrt/tensorrt/lib/Analysis/TensorKindAnalysis.cpp b/mlir-tensorrt/tensorrt/lib/Analysis/TensorKindAnalysis.cpp index 288f7e573..2a8419c6c 100644 --- a/mlir-tensorrt/tensorrt/lib/Analysis/TensorKindAnalysis.cpp +++ b/mlir-tensorrt/tensorrt/lib/Analysis/TensorKindAnalysis.cpp @@ -151,6 +151,11 @@ LogicalResult TensorKindAnalysis::visitOperation( return success(); } + if (auto tensorInsertOp = dyn_cast(op)) { + setInferredType(tensorInsertOp.getDestMutable(), TensorKind::Host); + return success(); + } + if (auto bufferizeOp = dyn_cast(op)) { // It has no tensor operands, nothing to do. 
if (!bufferizeOp.getCopy() || !bufferizeOp.getMemorySpace()) { @@ -158,7 +163,6 @@ LogicalResult TensorKindAnalysis::visitOperation( } if (auto memSpace = dyn_cast_or_null( bufferizeOp.getMemorySpaceAttr())) { - if (memSpace.getTensorKind().isHostOnly()) { setInferredType(bufferizeOp.getCopyMutable()[0], TensorKind::Device); return success(); diff --git a/mlir-tensorrt/tensorrt/lib/Target/TranslateToTensorRT.cpp b/mlir-tensorrt/tensorrt/lib/Target/TranslateToTensorRT.cpp index 3a07fdd69..404bfe7eb 100644 --- a/mlir-tensorrt/tensorrt/lib/Target/TranslateToTensorRT.cpp +++ b/mlir-tensorrt/tensorrt/lib/Target/TranslateToTensorRT.cpp @@ -49,6 +49,7 @@ #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/Threading.h" #include "llvm/Support/ToolOutputFile.h" +#include #define DEBUG_TYPE "translate-to-tensorrt" #define DBGS() llvm::dbgs() << "[" DEBUG_TYPE "] " @@ -63,6 +64,61 @@ namespace tensorrt { using namespace mlir; using namespace mlir::tensorrt; +//===----------------------------------------------------------------------===// +// Global TensorRT Logger +//===----------------------------------------------------------------------===// + +namespace { +/// A simple logger that implements TensorRT's logging interface. Errors and +/// warnings are reported stderr. If the 'verbose' flags is active, then all +/// messages are printed to stderr. +class Logger : public nvinfer1::ILogger { +public: + static Logger &getInstance(bool verbose) { + static Logger instance; + instance.setVerbose(verbose); + return instance; + } + + void setVerbose(bool verbose) { + std::scoped_lock guard(lock); + this->verbose = verbose; + } + +protected: + Logger() = default; + Logger(const Logger &) = delete; + Logger &operator=(const Logger &) = delete; + + void log(Severity severity, const char *msg) noexcept override; + + /// Print only 'error' and 'warning' messages if false, otehrwise print all + /// messages. 
+ bool verbose; + + std::mutex lock; +}; +} // namespace + +void Logger::log(Severity severity, const char *msg) noexcept { + if (verbose) { + std::scoped_lock g(lock); + llvm::errs() << msg << "\n"; + return; + } + + if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR || + severity == Severity::kWARNING) { + std::scoped_lock g(lock); + llvm::errs() << msg << "\n"; + return; + } +} + +//===----------------------------------------------------------------------===// +// ByteSizeParser +//===----------------------------------------------------------------------===// + bool ByteSizeParser::parse(llvm::cl::Option &option, StringRef argName, StringRef arg, std::optional &val) { val = std::nullopt; @@ -243,23 +299,6 @@ TensorRTTranslationOptions TensorRTTranslationOptions::fromCLFlags() { return options; } -//===----------------------------------------------------------------------===// -// Logger -//===----------------------------------------------------------------------===// - -void tensorrt::Logger::log(Severity severity, const char *msg) noexcept { - if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR) { - llvm::errs() << msg << "\n"; - return; - } - if (severity == Severity::kWARNING) { - llvm::errs() << msg << "\n"; - return; - } - if (verbose) - llvm::errs() << msg << "\n"; -} - //===----------------------------------------------------------------------===// // TensorRTBuilderContext //===----------------------------------------------------------------------===// @@ -292,17 +331,13 @@ TensorRTBuilderContext::create(bool verbose, int32_t cudaDevice) { if (status != cudaSuccess) return failure(); - auto logger = std::make_unique(verbose); - if (!logger) - return failure(); - auto builder = std::unique_ptr( - nvinfer1::createInferBuilder(*logger)); + nvinfer1::createInferBuilder(Logger::getInstance(verbose))); if (!builder) return failure(); - return std::shared_ptr(new TensorRTBuilderContext( - version, cudaDevice, 
std::move(logger), std::move(builder))); + return std::shared_ptr( + new TensorRTBuilderContext(version, cudaDevice, std::move(builder))); } //===----------------------------------------------------------------------===// @@ -830,7 +865,8 @@ class TranslateToTensorRTEnginePass if (!translationOptions->saveTensorRTLayerInfoDirectory.empty()) { std::unique_ptr runtime{ - nvinfer1::createInferRuntime(*builderContext->getLogger())}; + nvinfer1::createInferRuntime( + Logger::getInstance(translationOptions->enableVerboseLogs))}; std::unique_ptr cudaEngine{ runtime->deserializeCudaEngine(serializedEngine->data(), serializedEngine->size())}; diff --git a/mlir-tensorrt/tensorrt/test/lit.cfg.py b/mlir-tensorrt/tensorrt/test/lit.cfg.py index a98a6b41e..32e80a712 100644 --- a/mlir-tensorrt/tensorrt/test/lit.cfg.py +++ b/mlir-tensorrt/tensorrt/test/lit.cfg.py @@ -5,9 +5,9 @@ import lit.formats import lit.util - from lit.llvm import llvm_config from lit.llvm.subst import ToolSubst +import psutil # Configuration file for the 'lit' test runner. @@ -114,14 +114,27 @@ def all_gpus_have_fp8_support() -> bool: config.available_features.add("no-asan") -def estimate_parallelism(mem_required: float) -> int: +def estimate_paralllelism( + gb_gpu_mem_required: float, gb_sys_mem_required: float +) -> int: try: + parallelism = 2 with gpu_tools.nvml_context() as devices: - return gpu_tools.estimate_parallelism_from_memory(devices, mem_required) + parallelism = gpu_tools.estimate_parallelism_from_memory( + devices, gb_gpu_mem_required + ) + return int( + min( + parallelism, + (psutil.virtual_memory().available / (1024**3)) // gb_sys_mem_required, + ) + ) except: - return 1 + return 2 # Setup the parallelism groups. -lit_config.parallelism_groups["translation-tests"] = estimate_parallelism(8.0) +lit_config.parallelism_groups["translation-tests"] = estimate_paralllelism( + 8.0, gb_sys_mem_required=3.0 +) lit_config.parallelism_group = None