diff --git a/mlir-tensorrt/.gitignore b/mlir-tensorrt/.gitignore index e410819ad..8f46753fb 100644 --- a/mlir-tensorrt/.gitignore +++ b/mlir-tensorrt/.gitignore @@ -3,6 +3,7 @@ *.log **/llvm-project/** **/llvm-project/ +CMakeUserPresets.json # Docs build artifacts /public/ diff --git a/mlir-tensorrt/CMakeLists.txt b/mlir-tensorrt/CMakeLists.txt index 0a396a862..b00019933 100644 --- a/mlir-tensorrt/CMakeLists.txt +++ b/mlir-tensorrt/CMakeLists.txt @@ -90,6 +90,35 @@ if(PROJECT_IS_TOP_LEVEL) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) endif() +# ------------------------------------------------- +# Option validation +# ------------------------------------------------- +# Write out a header file containing convenience macros for each flag. +function(mtrt_write_feature_flags_header) + set(feature_flags_header + "${CMAKE_CURRENT_BINARY_DIR}/include/mlir-tensorrt/Features.h") + + # Generate the header at configure time + file(WRITE "${feature_flags_header}" [[ + // Auto-generated feature macros, do not edit. 
+ #ifndef MLIR_TENSORRT_FEATURES_H + #define MLIR_TENSORRT_FEATURES_H + + ]]) + + foreach(FEATURE IN LISTS MLIR_TRT_FEATURE_FLAGS) + file(APPEND "${feature_flags_header}" "#ifdef ${FEATURE}\n") + file(APPEND "${feature_flags_header}" "#define IF_${FEATURE}(code) do { code } while (0)\n") + file(APPEND "${feature_flags_header}" "#else\n") + file(APPEND "${feature_flags_header}" "#define IF_${FEATURE}(code) do {} while (0)\n") + file(APPEND "${feature_flags_header}" "#endif // ${FEATURE}\n\n") + endforeach() + file(APPEND "${feature_flags_header}" "#endif // MLIR_TENSORRT_FEATURES_H\n") +endfunction() + +mtrt_write_feature_flags_header() +include_directories("${CMAKE_CURRENT_BINARY_DIR}/include") + # ------------------------------------------------- # Setup LLVM/MLIR # ------------------------------------------------- diff --git a/mlir-tensorrt/build_tools/cmake/Targets.cmake b/mlir-tensorrt/build_tools/cmake/Targets.cmake index 9e361c783..4a16e0e91 100644 --- a/mlir-tensorrt/build_tools/cmake/Targets.cmake +++ b/mlir-tensorrt/build_tools/cmake/Targets.cmake @@ -44,6 +44,16 @@ function(add_mlir_tensorrt_public_c_api_library target) endif() endfunction() +# -------------------------------------------------------------- +# Adds an upstream MLIR library target to the +# MLIR_TENSORRT_LIBS global property list to capture it as an +# implicit dependency for all final tools and compiler +# end-user products. +# -------------------------------------------------------------- +function(add_mlir_tensorrt_compiler_dependency target) + set_property(GLOBAL APPEND PROPERTY MLIR_TENSORRT_LIBS ${target}) +endfunction() + # ------------------------------------------------------------------------------ # A wrapper around `add_mlir_dialect_library` that also appends the dialect # library to the global `MLIR_TENSORRT_DIALECT_LIBS` list property. 
@@ -99,11 +109,11 @@ function(add_mlir_tensorrt_backend_library target) BASE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) set(LLVM_TARGET_DEFINITIONS "${SRC_TD}") - + string(REPLACE ".td" "Attrs.h.inc" h_inc_file ${BIN_TD}) string(REPLACE ".td" "Attrs.cpp.inc" cpp_inc_file ${BIN_TD}) mlir_tablegen("${h_inc_file}" -gen-attrdef-decls) - mlir_tablegen("${cpp_inc_file}" -gen-attrdef-defs) + mlir_tablegen("${cpp_inc_file}" -gen-attrdef-defs) add_public_tablegen_target(${target}IncGen) diff --git a/mlir-tensorrt/compiler/CMakeLists.txt b/mlir-tensorrt/compiler/CMakeLists.txt index d4ede3388..e039c9b8a 100644 --- a/mlir-tensorrt/compiler/CMakeLists.txt +++ b/mlir-tensorrt/compiler/CMakeLists.txt @@ -9,6 +9,29 @@ set(MLIR_TENSORRT_COMPILER_INCLUDE_DIRS include_directories("${CMAKE_CURRENT_BINARY_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/include") +# We use the MLIR_TENSORRT_LIBS global property to aggregate the list of all +# declared compiler libraries. This is helpful for simplifying the link +# dependency declarations for tools that must link "the world" like +# `mlir-tensorrt-opt`. + +# Because MLIR has a level of indirection that lets implementation for +# interfaces be provided by separate implementation code +# ("PromisedInterfaces/ExternalModels") which is registered at runtime, it is +# difficult to capture all dependencies for dialects we require in the compiler +# purely through target dependency properties. To see what we require from +# upstream, look at the file `mlir-tensorrt/InitAllDialects.h`. Therefore, we +# manually enumerate some dependencies here, mainly for providing the functions +# registering interface external models. 
+add_mlir_tensorrt_compiler_dependency(MLIRArithTransforms) +add_mlir_tensorrt_compiler_dependency(MLIRArithValueBoundsOpInterfaceImpl) +add_mlir_tensorrt_compiler_dependency(MLIRAsyncDialect) +add_mlir_tensorrt_compiler_dependency(MLIRBufferizationTransforms) +add_mlir_tensorrt_compiler_dependency(MLIRControlFlowTransforms) +add_mlir_tensorrt_compiler_dependency(MLIRNVVMTarget) +add_mlir_tensorrt_compiler_dependency(MLIRPtrDialect) +add_mlir_tensorrt_compiler_dependency(MLIRTargetLLVM) +add_mlir_tensorrt_compiler_dependency(MLIRTensorTransformOps) + add_subdirectory(include) add_subdirectory(lib) add_subdirectory(test) diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/OptionsProviders.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/OptionsProviders.h index 264963dcf..72e828b90 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/OptionsProviders.h +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/OptionsProviders.h @@ -113,6 +113,20 @@ struct DebugOptions : public OptionsProvider { "tree rooted at this directory. 
Use in conjunction with " "mlir-print-ir-* flags")}; + //===----------------------------------------------------------------------===// + // Printing Flags + //===----------------------------------------------------------------------===// + + Option elideElementsAttrIfLarger{ + this->ctx, "mlir-elide-elementsattrs-if-larger", + llvm::cl::desc("Elide ElementsAttrs with \"...\" that have " + "more elements than the given upper limit")}; + + Option elideResourceStringsIfLarger{ + this->ctx, "mlir-elide-resource-strings-if-larger", + llvm::cl::desc( + "Elide printing value of resources if string is too long in chars.")}; + //===--------------------------------------------------------------------===// // Pass Statistics //===--------------------------------------------------------------------===// diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/CMakeLists.txt b/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/CMakeLists.txt index 816db9846..3c5812249 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/CMakeLists.txt +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/CMakeLists.txt @@ -6,9 +6,6 @@ set(_TABLEGEN_ARGS ) if(MLIR_TRT_ENABLE_HLO) list(APPEND _TABLEGEN_ARGS -DMLIR_TENSORRT_ENABLE_HLO) endif() -if(MLIR_TRT_ENABLE_EXECUTOR) - list(APPEND _TABLEGEN_ARGS -DMLIR_TENSORRT_ENABLE_EXECUTOR) -endif() mlir_tablegen(Passes.h.inc -gen-pass-decls -name MLIRTensorRTConversion ${_TABLEGEN_ARGS}) add_public_tablegen_target(MLIRTensorRTConversionPassIncGen) diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/Passes.td b/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/Passes.td index 19905e537..b7d3d014c 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/Passes.td +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/Passes.td @@ -125,8 +125,6 @@ def ConvertTensorRTToEmitCPass : Pass<"convert-tensorrt-to-emitc", let dependentDialects = ["::mlir::emitc::EmitCDialect"]; } 
-#ifdef MLIR_TENSORRT_ENABLE_EXECUTOR - //===----------------------------------------------------------------------===// // ConvertMemRefToCUDAPass //===----------------------------------------------------------------------===// @@ -312,9 +310,6 @@ def ConvertTensorRTRuntimeToExecutorPass : Pass<"convert-tensorrt-runtime-to-exe let options = ConvertToExecutorOptions; } - -#endif // MLIR_TENSORRT_ENABLE_EXECUTOR - #ifdef MLIR_TENSORRT_ENABLE_HLO //===----------------------------------------------------------------------===// // ConvertStablehloToScfPass diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/CMakeLists.txt b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/CMakeLists.txt index 494c94a4a..ab5c79a76 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/CMakeLists.txt +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/CMakeLists.txt @@ -1,10 +1,7 @@ if(MLIR_TRT_ENABLE_HLO) add_subdirectory(StablehloExt) - add_subdirectory(Plan) endif() +add_subdirectory(CUDA) +add_subdirectory(Plan) add_subdirectory(TensorRTRuntime) - -if(MLIR_TRT_ENABLE_EXECUTOR) - add_subdirectory(CUDA) -endif() diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td index a1334f0fd..cf22de6c3 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td @@ -428,6 +428,27 @@ def PostClusteringValidationPass : Pass<"post-clustering-validation", "func::Fun }]; } +//===----------------------------------------------------------------------===// +// PlanAssignMemorySpacesPass +//===----------------------------------------------------------------------===// + +def PlanAssignMemorySpacesPass : Pass<"plan-assign-memory-spaces", + "::mlir::ModuleOp"> { + let summary = "assigns memory spaces encodings to tensor types"; + + let 
description = [{ + This pass applies a type conversion that adds a '#plan.memory_space' + attribute to all tensor types in the top-level module that do not already + have an encoding. + }]; + + let dependentDialects = [ + "::mlir::plan::PlanDialect", + "::mlir::bufferization::BufferizationDialect", + "::mlir::tensor::TensorDialect" + ]; +} + //===----------------------------------------------------------------------===// // PlanAllocTensorsPass //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllDialects.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllDialects.h new file mode 100644 index 000000000..51850b4ba --- /dev/null +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllDialects.h @@ -0,0 +1,199 @@ +//===- InitAllDialects.h ----------------------------------------*- C++ -*-===// +// +// SPDX-FileCopyrightText: Copyright 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// Registration methods for MLIR dialects. 
+/// +//===----------------------------------------------------------------------===// +#ifndef MLIR_TENSORRT_INIT_ALL_DIALECTS +#define MLIR_TENSORRT_INIT_ALL_DIALECTS + +#include "mlir-executor/Executor/IR/Executor.h" +#include "mlir-tensorrt-dialect/TensorRT/IR/TensorRTDialect.h" +#include "mlir-tensorrt-dialect/TensorRT/Target/TensorRTEncodingImpl.h" +#include "mlir-tensorrt/Backends/Host/HostBackend.h" +#include "mlir-tensorrt/Backends/TensorRT/TensorRTBackend.h" +#include "mlir-tensorrt/Dialect/CUDA/IR/CUDADialect.h" +#include "mlir-tensorrt/Dialect/CUDA/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir-tensorrt/Dialect/Plan/IR/Plan.h" +#include "mlir-tensorrt/Dialect/StablehloExt/IR/StableHloExt.h" +#include "mlir-tensorrt/Dialect/TensorRTRuntime/IR/TensorRTRuntime.h" +#include "mlir-tensorrt/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir-tensorrt/Features.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/Arith/Transforms/BufferDeallocationOpInterfaceImpl.h" +#include "mlir/Dialect/Arith/Transforms/BufferViewFlowOpInterfaceImpl.h" +#include "mlir/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Async/IR/Async.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/Dialect/ControlFlow/Transforms/BufferDeallocationOpInterfaceImpl.h" +#include "mlir/Dialect/ControlFlow/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/DLTI/DLTI.h" +#include "mlir/Dialect/EmitC/IR/EmitC.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include 
"mlir/Dialect/LLVMIR/NVVMDialect.h" +#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/Linalg/Transforms/AllInterfaces.h" +#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Linalg/Transforms/RuntimeOpVerification.h" +#include "mlir/Dialect/Linalg/Transforms/SubsetInsertionOpInterfaceImpl.h" +#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/MemRef/IR/MemRefMemorySlot.h" +#include "mlir/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/MemRef/Transforms/AllocationOpInterfaceImpl.h" +#include "mlir/Dialect/MemRef/Transforms/BufferViewFlowOpInterfaceImpl.h" +#include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h" +#include "mlir/Dialect/PDL/IR/PDL.h" +#include "mlir/Dialect/PDLInterp/IR/PDLInterp.h" +#include "mlir/Dialect/Ptr/IR/PtrDialect.h" +#include "mlir/Dialect/Quant/IR/Quant.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.h" +#include "mlir/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Shape/IR/Shape.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.h" +#include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h" +#include "mlir/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h" +#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/UB/IR/UBOps.h" +#include 
"mlir/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Vector/Transforms/SubsetOpInterfaceImpl.h" +#include "mlir/Interfaces/CastInterfaces.h" +#include "mlir/Target/LLVM/NVVM/Target.h" + +#ifdef MLIR_TRT_ENABLE_HLO +#include "stablehlo/dialect/ChloOps.h" +#include "stablehlo/dialect/StablehloOps.h" +#include "stablehlo/dialect/VhloOps.h" +#endif + +namespace mlirtrt::compiler { + +inline void registerAllDialects(mlir::DialectRegistry ®istry) { + // clang-format off + registry.insert< + mlir::affine::AffineDialect, + mlir::arith::ArithDialect, + mlir::async::AsyncDialect, + mlir::bufferization::BufferizationDialect, + mlir::cf::ControlFlowDialect, + mlir::complex::ComplexDialect, + mlir::cuda::CUDADialect, + mlir::DLTIDialect, + mlir::emitc::EmitCDialect, + mlir::executor::ExecutorDialect, + mlir::func::FuncDialect, + mlir::gpu::GPUDialect, + mlir::index::IndexDialect, + mlir::linalg::LinalgDialect, + mlir::LLVM::LLVMDialect, + mlir::math::MathDialect, + mlir::memref::MemRefDialect, + mlir::NVVM::NVVMDialect, + mlir::pdl_interp::PDLInterpDialect, + mlir::pdl::PDLDialect, + mlir::plan::PlanDialect, + mlir::ptr::PtrDialect, + mlir::quant::QuantDialect, + mlir::scf::SCFDialect, + mlir::shape::ShapeDialect, + mlir::tensor::TensorDialect, + mlir::tensorrt::TensorRTDialect, + mlir::transform::TransformDialect, + mlir::trtrt::TensorRTRuntimeDialect, + mlir::ub::UBDialect, + mlir::vector::VectorDialect + >(); + // clang-format on + + IF_MLIR_TRT_ENABLE_HLO({ + registry.insert(); + registry.insert(); + registry.insert(); + }); + + // Register all external models. 
+ mlir::affine::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::arith::registerBufferDeallocationOpInterfaceExternalModels(registry); + mlir::arith::registerBufferizableOpInterfaceExternalModels(registry); + mlir::arith::registerBufferViewFlowOpInterfaceExternalModels(registry); + mlir::arith::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::bufferization::func_ext::registerBufferizableOpInterfaceExternalModels( + registry); + mlir::builtin::registerCastOpInterfaceExternalModels(registry); + mlir::cf::registerBufferDeallocationOpInterfaceExternalModels(registry); + mlir::cf::registerBufferizableOpInterfaceExternalModels(registry); + mlir::cuda::registerBufferizableOpInterfaceExternalModels(registry); + mlir::linalg::registerBufferizableOpInterfaceExternalModels(registry); + mlir::linalg::registerRuntimeVerifiableOpInterfaceExternalModels(registry); + mlir::linalg::registerSubsetOpInterfaceExternalModels(registry); + mlir::linalg::registerTilingInterfaceExternalModels(registry); + mlir::linalg::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::LLVM::registerInlinerInterface(registry); + mlir::memref::registerAllocationOpInterfaceExternalModels(registry); + mlir::memref::registerBufferViewFlowOpInterfaceExternalModels(registry); + mlir::memref::registerMemorySlotExternalModels(registry); + mlir::memref::registerRuntimeVerifiableOpInterfaceExternalModels(registry); + mlir::memref::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::NVVM::registerInlinerInterface(registry); + mlir::NVVM::registerNVVMTargetInterfaceExternalModels(registry); + mlir::scf::registerBufferDeallocationOpInterfaceExternalModels(registry); + mlir::scf::registerBufferizableOpInterfaceExternalModels(registry); + mlir::scf::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::tensor::registerBufferizableOpInterfaceExternalModels(registry); + mlir::tensor::registerFindPayloadReplacementOpInterfaceExternalModels( + registry); + 
mlir::tensor::registerInferTypeOpInterfaceExternalModels(registry); + mlir::tensor::registerSubsetOpInterfaceExternalModels(registry); + mlir::tensor::registerTilingInterfaceExternalModels(registry); + mlir::tensor::registerValueBoundsOpInterfaceExternalModels(registry); + mlir::tensorrt::registerTensorKindOpInterfaceExternalModels(registry); + mlir::tensorrt::registerTensorRTEncodingOpInterfaceExternalModels(registry); + mlir::trtrt::registerBufferizableOpInterfaceExternalModels(registry); + mlir::vector::registerBufferizableOpInterfaceExternalModels(registry); + mlir::vector::registerSubsetOpInterfaceExternalModels(registry); + mlir::vector::registerValueBoundsOpInterfaceExternalModels(registry); + + IF_MLIR_TRT_ENABLE_HLO({ + mlir::stablehlo::registerTensorKindOpInterfaceExternalModels(registry); + mlir::stablehlo::registerTypeInferenceExternalModels(registry); + }); +} + +} // namespace mlirtrt::compiler + +#endif // MLIR_TENSORRT_INIT_ALL_DIALECTS diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/InitLLVMExtensions.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllExtensions.h similarity index 81% rename from mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/InitLLVMExtensions.h rename to mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllExtensions.h index e1fec31a0..8668ca659 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/InitLLVMExtensions.h +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllExtensions.h @@ -18,12 +18,14 @@ // //===----------------------------------------------------------------------===// /// -/// Registration methods for ConvertToLLVMPatternInterface dialect extensions. +/// Registration methods for dialect extensions. 
/// //===----------------------------------------------------------------------===// #ifndef MLIR_TENSORRT_REGISTRATION_INITLLVMEXTENSIONS #define MLIR_TENSORRT_REGISTRATION_INITLLVMEXTENSIONS +#include "mlir-tensorrt/Backends/Host/HostBackend.h" +#include "mlir-tensorrt/Backends/TensorRT/TensorRTBackend.h" #include "mlir-tensorrt/Conversion/CUDAToLLVM/CUDAToLLVM.h" #include "mlir-tensorrt/Conversion/PlanToLLVM/PlanToLLVM.h" #include "mlir-tensorrt/Conversion/TensorRTRuntimeToLLVM/TensorRTRuntimeToLLVM.h" @@ -37,28 +39,33 @@ #include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h" #include "mlir/Conversion/UBToLLVM/UBToLLVM.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" +#include "mlir/Dialect/Func/Extensions/InlinerExtension.h" -namespace mlirtrt { +namespace mlirtrt::compiler { -/// Register all ConvertToLLVMPatternInterface dialect extensions. -inline void registerConvertToLLVMExtensions(mlir::DialectRegistry ®istry) { - // Upstream interfaces. +inline void registerAllExtensions(mlir::DialectRegistry ®istry) { + // Register all conversion to LLVM interfaces. mlir::arith::registerConvertArithToLLVMInterface(registry); - mlir::registerConvertComplexToLLVMInterface(registry); mlir::cf::registerConvertControlFlowToLLVMInterface(registry); + mlir::index::registerConvertIndexToLLVMInterface(registry); + mlir::registerConvertComplexToLLVMInterface(registry); + mlir::registerConvertCUDAToLLVMPatternInterface(registry); mlir::registerConvertFuncToLLVMInterface(registry); mlir::registerConvertMathToLLVMInterface(registry); mlir::registerConvertMemRefToLLVMInterface(registry); mlir::registerConvertNVVMToLLVMInterface(registry); + mlir::registerConvertPlanToLLVMPatternInterface(registry); + mlir::registerConvertTensorRTRuntimeToLLVMPatternInterface(registry); mlir::ub::registerConvertUBToLLVMInterface(registry); - mlir::index::registerConvertIndexToLLVMInterface(registry); mlir::vector::registerConvertVectorToLLVMInterface(registry); - // MLIR-TRT interfaces. 
- mlir::registerConvertPlanToLLVMPatternInterface(registry); - mlir::registerConvertTensorRTRuntimeToLLVMPatternInterface(registry); - mlir::registerConvertCUDAToLLVMPatternInterface(registry); + // Inliner extensions. + mlir::func::registerInlinerExtension(registry); + + // Plan Extensions. + mlir::plan::registerHostBackend(registry); + mlir::plan::registerTensorRTBackend(registry); } -} // namespace mlirtrt +} // namespace mlirtrt::compiler #endif // MLIR_TENSORRT_REGISTRATION_INITLLVMEXTENSIONS diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtPasses.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllPasses.h similarity index 68% rename from mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtPasses.h rename to mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllPasses.h index a984073b1..8f2d088d4 100644 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtPasses.h +++ b/mlir-tensorrt/compiler/include/mlir-tensorrt/InitAllPasses.h @@ -22,17 +22,20 @@ #ifndef REGISTRATION_REGISTERMLIRTENSORRTPASSES_H #define REGISTRATION_REGISTERMLIRTENSORRTPASSES_H +#include "mlir-executor/InitAllPasses.h" #include "mlir-tensorrt-dialect/TensorRT/Transforms/Passes.h" #include "mlir-tensorrt/Conversion/Passes.h" +#include "mlir-tensorrt/Dialect/Plan/Transforms/Passes.h" +#include "mlir-tensorrt/Features.h" #include "mlir-tensorrt/Transforms/Passes.h" #include "mlir/Conversion/Passes.h" +#include "mlir/Dialect/Bufferization/Transforms/Passes.h" #include "mlir/Dialect/EmitC/Transforms/Passes.h" #include "mlir/Transforms/Passes.h" #ifdef MLIR_TRT_ENABLE_HLO #include "mlir-tensorrt/Compiler/StablehloToExecutable/Passes.h" #include "mlir-tensorrt/Compiler/StablehloToExecutable/StablehloToExecutable.h" -#include "mlir-tensorrt/Dialect/Plan/Transforms/Passes.h" #include "mlir-tensorrt/Dialect/StablehloExt/Transforms/Passes.h" #include "stablehlo/transforms/Passes.h" #include 
"stablehlo/transforms/optimization/Passes.h" @@ -42,47 +45,36 @@ #include "mlir-tensorrt/Compiler/TensorRTToExecutable/Passes.h" #endif // MLIR_TRT_TARGET_TENSORRT -#ifdef MLIR_TRT_ENABLE_EXECUTOR -#include "mlir-executor/InitAllPasses.h" -#include "mlir/Dialect/Bufferization/Transforms/Passes.h" -#endif // MLIR_TRT_ENABLE_EXECUTOR - -namespace mlir { -namespace tensorrt { +namespace mlirtrt::compiler { /// Register passes declared within this repo. -inline void registerAllMlirTensorRtPasses() { - registerMLIRTensorRTConversionPasses(); - registerTensorRTPasses(); - registerMLIRTensorRTGenericTransformsPasses(); - mlir::registerTransformsPasses(); - mlir::registerConvertPDLToPDLInterp(); +inline void registerAllPasses() { mlir::emitc::registerEmitCPasses(); + mlir::plan::registerPlanDialectPipelines(); + mlir::plan::registerPlanPasses(); mlir::registerConvertAffineToStandard(); + mlir::registerConvertPDLToPDLInterp(); + mlir::registerMLIRTensorRTConversionPasses(); + mlir::registerMLIRTensorRTGenericTransformsPasses(); + mlir::registerTransformsPasses(); + mlir::tensorrt::registerTensorRTPasses(); + mlir::registerConvertCUDAToExecutorPass(); + mlir::bufferization::registerBufferizationPasses(); + mlir::executor::registerAllPasses(); -#ifdef MLIR_TRT_ENABLE_HLO - mlirtrt::compiler::registerStablehloToExecutablePasses(); - mlirtrt::compiler::registerStablehloToExecutablePipelines(); - mlirtrt::compiler::registerStableHloInputPipelines(); - stablehlo_ext::registerStableHloExtPasses(); - stablehlo::registerPasses(); - stablehlo::registerOptimizationPasses(); - plan::registerPlanPasses(); - plan::registerPlanDialectPipelines(); -#endif // MLIR_TRT_ENABLE_HLO - -#ifdef MLIR_TRT_TARGET_TENSORRT - mlirtrt::compiler::registerTensorRTToExecutablePipelines(); -#endif // MLIR_TRT_TARGET_TENSORRT + IF_MLIR_TRT_ENABLE_HLO({ + mlirtrt::compiler::registerStablehloToExecutablePasses(); + mlirtrt::compiler::registerStablehloToExecutablePipelines(); + 
mlirtrt::compiler::registerStableHloInputPipelines(); + mlir::stablehlo_ext::registerStableHloExtPasses(); + mlir::stablehlo::registerPasses(); + mlir::stablehlo::registerOptimizationPasses(); + }); -#ifdef MLIR_TRT_ENABLE_EXECUTOR - registerConvertCUDAToExecutorPass(); - bufferization::registerBufferizationPasses(); - executor::registerAllPasses(); -#endif // MLIR_TRT_ENABLE_EXECUTOR + IF_MLIR_TRT_TARGET_TENSORRT( + { mlirtrt::compiler::registerTensorRTToExecutablePipelines(); }); } -} // namespace tensorrt -} // namespace mlir +} // namespace mlirtrt::compiler #endif // REGISTRATION_REGISTERMLIRTENSORRTPASSES_H diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtCoreDialects.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtCoreDialects.h deleted file mode 100644 index 6cbaa10ea..000000000 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtCoreDialects.h +++ /dev/null @@ -1,77 +0,0 @@ -//===- RegisterMlirTensorRtCoreDialects.h -----------------------*- C++ -*-===// -// -// SPDX-FileCopyrightText: Copyright 2024 NVIDIA CORPORATION & AFFILIATES. -// All rights reserved. -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -/// -/// Registration methods for the core dialects defined by this project. 
-/// -//===----------------------------------------------------------------------===// -#include "mlir-tensorrt-dialect/Interface/TensorKindOpInterface.h" -#include "mlir-tensorrt-dialect/TensorRT/IR/TensorRTDialect.h" -#include "mlir-tensorrt-dialect/TensorRT/Target/TensorRTEncodingImpl.h" -#include "mlir-tensorrt/Backends/Host/HostBackend.h" -#include "mlir-tensorrt/Backends/TensorRT/TensorRTBackend.h" -#include "mlir/Dialect/Func/Extensions/InlinerExtension.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#ifdef MLIR_TRT_ENABLE_HLO -#include "mlir-tensorrt/Dialect/Plan/IR/Plan.h" -#endif // MLIR_TRT_ENABLE_HLO -#ifdef MLIR_TRT_ENABLE_EXECUTOR -#include "mlir-executor/Executor/IR/Executor.h" -#include "mlir-tensorrt/Dialect/CUDA/IR/CUDADialect.h" -#include "mlir-tensorrt/Dialect/CUDA/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir-tensorrt/Dialect/TensorRTRuntime/IR/TensorRTRuntime.h" -#include "mlir-tensorrt/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/DLTI/DLTI.h" -#endif // MLIR_TRT_ENABLE_EXECUTOR - -namespace mlir { -class DialectRegistry; - -/// Register core MLIR-TensorRT project dialects (dialects defined by this -/// project and any of their immediate dependencies. 
-inline void registerCoreMlirTensorRtDialects(DialectRegistry ®istry) { - registry.insert(); - tensorrt::registerTensorRTEncodingOpInterfaceExternalModels(registry); - tensorrt::registerTensorKindOpInterfaceExternalModels(registry); - func::registerInlinerExtension(registry); - -#ifdef MLIR_TRT_ENABLE_EXECUTOR - registry.insert(); -#endif // MLIR_TRT_ENABLE_EXECUTOR - -#ifdef MLIR_TRT_ENABLE_HLO - registry.insert(); - mlir::plan::registerHostBackend(registry); - mlir::plan::registerTensorRTBackend(registry); -#endif // MLIR_TRT_ENABLE_HLO -} - -inline void -registerMlirTensorRtBufferizationInterfaces(DialectRegistry ®istry) { -#ifdef MLIR_TRT_ENABLE_EXECUTOR - trtrt::registerBufferizableOpInterfaceExternalModels(registry); - cuda::registerBufferizableOpInterfaceExternalModels(registry); -#endif // MLIR_TRT_ENABLE_EXECUTOR -} - -inline void registerMlirTensorRtTransformExtensions(DialectRegistry ®istry) { -} - -} // namespace mlir diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtDialects.h b/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtDialects.h deleted file mode 100644 index 1647a9f5d..000000000 --- a/mlir-tensorrt/compiler/include/mlir-tensorrt/Registration/RegisterMlirTensorRtDialects.h +++ /dev/null @@ -1,119 +0,0 @@ -//===- RegisterMlirTensorRtDialects.h ---------------------------*- C++ -*-===// -// -// SPDX-FileCopyrightText: Copyright 2024-2025 NVIDIA CORPORATION & AFFILIATES. -// All rights reserved. -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -// Register all dialects required by parts of this project, including dialects -// required by transformations or that are accepted by inputs. -//===----------------------------------------------------------------------===// -#ifndef MLIR_TENSORRT_REGISTRATION_REGISTERMLIRTENSORRTDIALECTS_H -#define MLIR_TENSORRT_REGISTRATION_REGISTERMLIRTENSORRTDIALECTS_H - -#include "mlir-tensorrt/Registration/RegisterMlirTensorRtCoreDialects.h" -#include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/EmitC/IR/EmitC.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" -#include "mlir/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.h" -#include "mlir/Dialect/PDL/IR/PDL.h" -#include "mlir/Dialect/Quant/IR/Quant.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Shape/IR/Shape.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" - -#ifdef MLIR_TRT_ENABLE_HLO -#include "mlir-tensorrt/Dialect/StablehloExt/IR/StableHloExt.h" -#include "stablehlo/dialect/ChloOps.h" -#include "stablehlo/dialect/StablehloOps.h" -#include "stablehlo/dialect/VhloOps.h" -#endif - -#ifdef MLIR_TRT_ENABLE_EXECUTOR -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/Transforms/BufferDeallocationOpInterfaceImpl.h" -#include 
"mlir/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Linalg/Transforms/SubsetInsertionOpInterfaceImpl.h" -#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h" -#include "mlir/Dialect/Math/IR/Math.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/Transforms/AllocationOpInterfaceImpl.h" -#include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h" -#include "mlir/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.h" -#include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h" -#include "mlir/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.h" -#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.h" -#endif - -namespace mlir { - -inline void registerAllMlirTensorRtExecutorDialects(DialectRegistry ®istry) { - // Registration for executor dialect and all upstream dialects that can appear - // in the host IR. 
- registry.insert(); - affine::registerValueBoundsOpInterfaceExternalModels(registry); - arith::registerBufferDeallocationOpInterfaceExternalModels(registry); - arith::registerBufferizableOpInterfaceExternalModels(registry); - bufferization::func_ext::registerBufferizableOpInterfaceExternalModels( - registry); - linalg::registerBufferizableOpInterfaceExternalModels(registry); - memref::registerAllocationOpInterfaceExternalModels(registry); - scf::registerBufferizableOpInterfaceExternalModels(registry); - tensor::registerBufferizableOpInterfaceExternalModels(registry); - tensor::registerInferTypeOpInterfaceExternalModels(registry); - tensor::registerSubsetOpInterfaceExternalModels(registry); - tensor::registerTilingInterfaceExternalModels(registry); - LLVM::registerInlinerInterface(registry); -} - -inline void registerAllMlirTensorRtDialects(DialectRegistry ®istry) { - registerCoreMlirTensorRtDialects(registry); - registerMlirTensorRtBufferizationInterfaces(registry); - registerMlirTensorRtTransformExtensions(registry); - - // Register other dialects declared in upstream or in dependencies. Only - // register dialects if absolutely necessary (i.e. they appear in the input - // IR). 
- registry.insert(); - -#ifdef MLIR_TRT_ENABLE_HLO - registry.insert(); - stablehlo::registerTensorKindOpInterfaceExternalModels(registry); - stablehlo::registerTypeInferenceExternalModels(registry); -#endif // MLIR_TRT_ENABLE_HLO - -#ifdef MLIR_TRT_ENABLE_EXECUTOR - registerAllMlirTensorRtExecutorDialects(registry); - tensor::registerValueBoundsOpInterfaceExternalModels(registry); -#endif // MLIR_TRT_ENABLE_EXECUTOR -} - -} // namespace mlir - -#endif // MLIR_TENSORRT_REGISTRATION_REGISTERMLIRTENSORRTDIALECTS_H diff --git a/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/CMakeLists.txt b/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/CMakeLists.txt index 3035fcc58..c03ffe79b 100644 --- a/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/CMakeLists.txt +++ b/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/CMakeLists.txt @@ -1,3 +1,5 @@ +get_property(MLIR_TENSORRT_LIBS GLOBAL PROPERTY MLIR_TENSORRT_LIBS) + add_mlir_tensorrt_public_c_api_library(MLIRTensorRTCAPIRegisterAllDialects RegisterAllDialects.cpp @@ -17,4 +19,5 @@ add_mlir_tensorrt_public_c_api_library(MLIRTensorRTCAPIRegisterAllDialects MLIRTensorRTTensorRTRuntimeTransforms MLIRTensorRTTensorRTToEmitC MLIRTransformDialect + ${MLIR_TENSORRT_LIBS} ) diff --git a/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/RegisterAllDialects.cpp b/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/RegisterAllDialects.cpp index 6db864b23..9608ca4e0 100644 --- a/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/RegisterAllDialects.cpp +++ b/mlir-tensorrt/compiler/lib/CAPI/Compiler/Registration/RegisterAllDialects.cpp @@ -25,19 +25,17 @@ #include "mlir-tensorrt-c/Compiler/Registration/RegisterAllDialects.h" #include "mlir-tensorrt/Compiler/StablehloToExecutable/StablehloToExecutable.h" #include "mlir-tensorrt/Compiler/TensorRTToExecutable/TensorRTToExecutable.h" -#include "mlir-tensorrt/Registration/InitLLVMExtensions.h" -#include "mlir-tensorrt/Registration/RegisterMlirTensorRtDialects.h" 
-#include "mlir-tensorrt/Registration/RegisterMlirTensorRtPasses.h" +#include "mlir-tensorrt/InitAllDialects.h" +#include "mlir-tensorrt/InitAllExtensions.h" +#include "mlir-tensorrt/InitAllPasses.h" #include "mlir/CAPI/IR.h" void mtrtCompilerRegisterDialects(MlirDialectRegistry registry) { - mlir::registerAllMlirTensorRtDialects(*unwrap(registry)); - mlirtrt::registerConvertToLLVMExtensions(*unwrap(registry)); + mlirtrt::compiler::registerAllDialects(*unwrap(registry)); + mlirtrt::compiler::registerAllExtensions(*unwrap(registry)); } -void mtrtCompilerRegisterPasses() { - mlir::tensorrt::registerAllMlirTensorRtPasses(); -} +void mtrtCompilerRegisterPasses() { mlirtrt::compiler::registerAllPasses(); } void mtrtCompilerRegisterTasks() { mlirtrt::compiler::registerStableHloToExecutableTask(); diff --git a/mlir-tensorrt/compiler/lib/Compiler/OptionsProviders.cpp b/mlir-tensorrt/compiler/lib/Compiler/OptionsProviders.cpp index 734367408..497970259 100644 --- a/mlir-tensorrt/compiler/lib/Compiler/OptionsProviders.cpp +++ b/mlir-tensorrt/compiler/lib/Compiler/OptionsProviders.cpp @@ -84,16 +84,23 @@ void DebugOptions::applyToPassManager(PassManager &pm) const { if (!shouldPrintBeforePass && !shouldPrintAfterPass) return; + OpPrintingFlags printFlags{}; + if (this->elideElementsAttrIfLarger > 0) + printFlags.elideLargeElementsAttrs(this->elideElementsAttrIfLarger); + if (this->elideResourceStringsIfLarger > 0) + printFlags.elideLargeResourceString(this->elideResourceStringsIfLarger); + // Otherwise, add the IR printing instrumentation. 
if (!printTreeDir.empty()) { pm.enableIRPrintingToFileTree(shouldPrintBeforePass, shouldPrintAfterPass, printModuleScope, printAfterChange, - printAfterFailure, printTreeDir); + printAfterFailure, printTreeDir, printFlags); return; } + pm.enableIRPrinting(shouldPrintBeforePass, shouldPrintAfterPass, printModuleScope, printAfterChange, printAfterFailure, - llvm::errs()); + llvm::errs(), printFlags); } //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/compiler/lib/Conversion/CMakeLists.txt b/mlir-tensorrt/compiler/lib/Conversion/CMakeLists.txt index 62185af6c..8c741f648 100644 --- a/mlir-tensorrt/compiler/lib/Conversion/CMakeLists.txt +++ b/mlir-tensorrt/compiler/lib/Conversion/CMakeLists.txt @@ -7,18 +7,15 @@ if(MLIR_TRT_ENABLE_HLO) add_subdirectory(ChloToStablehloExt) endif() -if(MLIR_TRT_ENABLE_EXECUTOR) - add_subdirectory(MemRefToCUDA) - add_subdirectory(TensorRTToTensorRTRuntime) - add_subdirectory(CUDAToExecutor) - add_subdirectory(PlanToExecutor) - add_subdirectory(TensorRTRuntimeToExecutor) -endif() - +add_subdirectory(CUDAToExecutor) add_subdirectory(CUDAToLLVM) add_subdirectory(HostToEmitC) add_subdirectory(HostToLLVM) add_subdirectory(LLVMCommon) +add_subdirectory(MemRefToCUDA) +add_subdirectory(PlanToExecutor) add_subdirectory(PlanToLLVM) +add_subdirectory(TensorRTRuntimeToExecutor) add_subdirectory(TensorRTRuntimeToLLVM) add_subdirectory(TensorRTToEmitC) +add_subdirectory(TensorRTToTensorRTRuntime) diff --git a/mlir-tensorrt/compiler/lib/Dialect/CMakeLists.txt b/mlir-tensorrt/compiler/lib/Dialect/CMakeLists.txt index c17ac47a5..8f12204e7 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/CMakeLists.txt +++ b/mlir-tensorrt/compiler/lib/Dialect/CMakeLists.txt @@ -1,10 +1,7 @@ +add_subdirectory(CUDA) +add_subdirectory(Plan) add_subdirectory(TensorRTRuntime) -if(MLIR_TRT_ENABLE_EXECUTOR) - add_subdirectory(CUDA) -endif() - if(MLIR_TRT_ENABLE_HLO) - add_subdirectory(Plan) add_subdirectory(StablehloExt) 
endif() diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp index 3675b1534..96098513c 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp +++ b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AllocTensors.cpp @@ -206,23 +206,20 @@ struct RewriteFromElements : public OpRewritePattern { assert(lattice && !lattice->getValue().isUninitialized()); TensorKindInfo placementInfo = lattice->getValue(); - MemorySpace originalMemorySpaceConstraint = MemorySpace::host_pinned; + std::optional originalMemorySpace{}; if (auto constraint = - dyn_cast_or_null(op.getType().getEncoding())) { - // A pre-specified 'device' constraint is not allowed. - if (constraint.getValue() != MemorySpace::host && - constraint.getValue() != MemorySpace::host_pinned) - return failure(); - originalMemorySpaceConstraint = constraint.getValue(); - } + dyn_cast_or_null(op.getType().getEncoding())) + originalMemorySpace = constraint.getValue(); // Create a host allocation and insert the elements. 
+ MemorySpace memorySpace = MemorySpace::host_pinned; Value hostReplacement = createTensorFromElements( - rewriter, op.getLoc(), op.getType(), op.getElements(), - originalMemorySpaceConstraint); + rewriter, op.getLoc(), op.getType(), op.getElements(), memorySpace); Value hostReplacementCasted = rewriter.create(loc, originalType, hostReplacement); - if (placementInfo.isHostOnly()) { + bool canOptimizeHostReplacement = + !originalMemorySpace || (*originalMemorySpace == memorySpace); + if (placementInfo.isHostOnly() && canOptimizeHostReplacement) { rewriter.replaceOp(op, hostReplacementCasted); return success(); } @@ -242,14 +239,17 @@ struct RewriteFromElements : public OpRewritePattern { .getResult(); devReplacement = rewriter.create(loc, originalType, devReplacement); - rewriter.replaceOpUsesWithIf( - op, hostReplacementCasted, [&](OpOperand &use) { - return TensorKindAnalysis::getStaticOperandTensorKind(use) == - TensorKind::Host; - }); + + if (canOptimizeHostReplacement) + rewriter.replaceOpUsesWithIf( + op, hostReplacementCasted, [&](OpOperand &use) { + return TensorKindAnalysis::getStaticOperandTensorKind(use) == + TensorKind::Host; + }); rewriter.replaceOpUsesWithIf(op, devReplacement, [&](OpOperand &use) { - return TensorKindAnalysis::getStaticOperandTensorKind(use) != - TensorKind::Host; + return !canOptimizeHostReplacement || + TensorKindAnalysis::getStaticOperandTensorKind(use) != + TensorKind::Host; }); return success(); } @@ -810,6 +810,7 @@ static LogicalResult rewriteFuncToDestinationPassingStyle( // value. 
bufferization::TraversalConfig config; config.followEquivalentOnly = true; + config.followInPlaceOnly = true; config.alwaysIncludeLeaves = true; SetVector equivalentValues = state.findValueInReverseUseDefChain( &v, /*condition=*/ @@ -962,6 +963,10 @@ static LogicalResult enforceFunctionCallingStylePolicy( return failure(); for (func::FuncOp func : orderedFuncOps) { + LLVM_DEBUG(DBGS() << "encountered func " << func.getName() << "\n"); + if (func.isDeclaration()) + continue; + // All functions should be single-block at this point. if (func.getBlocks().size() != 1) return failure(); diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AssignMemorySpaces.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AssignMemorySpaces.cpp new file mode 100644 index 000000000..b20e39c77 --- /dev/null +++ b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/AssignMemorySpaces.cpp @@ -0,0 +1,399 @@ +//===- AssignMemorySpaces.cpp ---------------------------------------------===// +// +// SPDX-FileCopyrightText: Copyright 2024-2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// Implementation of the `plan-assign-memory-spaces` pass. 
+/// +//===----------------------------------------------------------------------===// +#include "mlir-tensorrt-dialect/Analysis/TensorKindAnalysis.h" +#include "mlir-tensorrt/Dialect/Plan/IR/Plan.h" +#include "mlir-tensorrt/Dialect/Plan/Transforms/Passes.h" +#include "mlir-tensorrt/Utils/ModuleUtils.h" +#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h" +#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Patterns.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/DialectResourceBlobManager.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "plan-assign-memory-spaces" + +namespace mlir::plan { +#define GEN_PASS_DEF_PLANASSIGNMEMORYSPACESPASS +#include "mlir-tensorrt/Dialect/Plan/Transforms/Passes.h.inc" +} // namespace mlir::plan + +using namespace mlir; +using namespace mlir::plan; + +namespace { + +// Generic pattern that rewrites any op by rewriting its operands and result +// types. Regions are also rewritten. 
+class GenericConvertSpace : public ConversionPattern { +public: + GenericConvertSpace(TypeConverter &typeConverter, MLIRContext *context) + : ConversionPattern(typeConverter, MatchAnyOpTypeTag{}, 0, context) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + SmallVector resultTypes; + if (failed(typeConverter->convertTypes(op->getResultTypes(), resultTypes))) + return failure(); + + auto *newOp = Operation::create( + op->getLoc(), op->getName(), resultTypes, operands, op->getAttrs(), + op->getPropertiesStorage(), op->getSuccessors(), op->getNumRegions()); + for (auto regions : llvm::zip(op->getRegions(), newOp->getRegions())) { + Region &before = std::get<0>(regions); + Region &parent = std::get<1>(regions); + rewriter.inlineRegionBefore(before, parent, parent.end()); + if (failed(rewriter.convertRegionTypes(&parent, *typeConverter))) + return failure(); + } + rewriter.insert(newOp); + rewriter.replaceOp(op, newOp->getResults()); + return success(); + } +}; + +// A pattern that converts the type of the attribute used as an operand for +// arith.constant +class ConvertConstantPattern : public OpConversionPattern { +public: + ConvertConstantPattern(TypeConverter &typeConverter, MLIRContext *context) + : OpConversionPattern(typeConverter, context) {} + + LogicalResult + matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto newType = dyn_cast_if_present( + typeConverter->convertType(op.getType())); + if (!newType) + return failure(); + + ElementsAttr newAttr{}; + if (auto elementsAttr = dyn_cast(op.getValue())) + newAttr = elementsAttr.reshape(newType); + if (auto resourceAttr = + dyn_cast(op.getValue())) { + DenseResourceElementsHandle handle = resourceAttr.getRawHandle(); + newAttr = DenseResourceElementsAttr::get(newType, handle); + } + if (!newAttr) + return failure(); + rewriter.replaceOpWithNewOp(op, newAttr); + return 
success(); + } +}; +} // namespace + +/// Return true if the op is likely in a compute region, like the region of +/// `stablehlo.reduce` or `linalg.generic`. +static bool inComputeRegion(Operation *op) { + Operation *parent = op->getParentOp(); + while (parent) { + if (isa(parent)) + return false; + if (!isa(parent)) + return true; + parent = parent->getParentOp(); + } + return false; +} + +namespace { +/// Use an explicit 'host_pinned' staging tensor to materialie the +/// 'from_elements' before creating explicitly moving it to the 'device' space. +/// Other optimization patterns below help avoid the host-device transfer when +/// possible. +struct FixUpFromElements : public OpRewritePattern { + FixUpFromElements(MLIRContext *ctx, const DataFlowSolver &solver, + PatternBenefit benefit = 1) + : OpRewritePattern(ctx, benefit), solver(solver) {} + + LogicalResult matchAndRewrite(tensor::FromElementsOp op, + PatternRewriter &rewriter) const override { + auto space = dyn_cast_or_null(op.getType().getEncoding()); + if (!space) + return failure(); + if (space.getValue() != plan::MemorySpace::device) + return failure(); + + const TensorKindLattice *lattice = + solver.lookupState(op.getResult()); + if (!lattice || lattice->getValue().isUninitialized() || + !lattice->getValue().isHostVisible()) + return failure(); + + RankedTensorType originalType = op.getType(); + RankedTensorType newType = RankedTensorType::get( + originalType.getShape(), originalType.getElementType(), + MemorySpaceAttr::get(originalType.getContext(), + plan::MemorySpace::host_pinned)); + auto newOp = rewriter.create(op.getLoc(), newType, + op.getElements()); + Value deviceTensor = rewriter.create( + op.getLoc(), originalType.getShape(), originalType.getElementType(), + originalType.getEncoding()); + Value rematDevReplacement = + rewriter + .create( + op.getLoc(), originalType, newOp.getResult(), deviceTensor) + .getResult(); + rewriter.replaceOp(op, rematDevReplacement); + return success(); + } + + 
const DataFlowSolver &solver; +}; + +static bool isHostVisible(TypedValue v) { + auto space = dyn_cast_or_null(v.getType().getEncoding()); + if (!space) + return false; + switch (space.getValue()) { + case plan::MemorySpace::host: + case plan::MemorySpace::host_pinned: + case plan::MemorySpace::unified: + return true; + default: + return false; + } +} + +/// For any 'shape' parameter of a 'tensor.reshape', get the shape by skipping +/// past any unnecessary explicit host-device transfers. +struct ReshapeAbsorbDeviceCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(tensor::ReshapeOp op, + PatternRewriter &rewriter) const override { + if (isHostVisible(op.getShape())) + return failure(); + auto matOp = + op.getShape() + .getDefiningOp(); + if (!matOp) + return failure(); + auto source = dyn_cast>(matOp.getSource()); + if (!source || !isHostVisible(source)) + return failure(); + rewriter.modifyOpInPlace(op, + [&]() { op.getShapeMutable().assign(source); }); + return success(); + } +}; + +/// Rewrite `memref.load` that acts on device memory to first copy the buffer to +/// the host and load from the host buffer. 
+struct TensorDeviceExtractRewriter + : public OpRewritePattern { + + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::ExtractOp op, + PatternRewriter &rewriter) const override { + auto source = op.getTensor(); + if (isHostVisible(source)) + return failure(); + + if (inComputeRegion(op)) + return failure(); + + rewriter.setInsertionPointAfterValue(source); + Value hostTensor = rewriter.create( + op.getLoc(), + RankedTensorType::get( + source.getType().getShape(), source.getType().getElementType(), + plan::MemorySpaceAttr::get(op->getContext(), + plan::MemorySpace::host_pinned)), + source); + + rewriter.replaceUsesWithIf(op.getTensor(), hostTensor, [&](OpOperand &use) { + return isa(use.getOwner()); + }); + + return success(); + } +}; + +/// Remap relevant analysis state of type T from `original` to `replacement`. +template +static void remapLatticeState(DataFlowSolver &solver, Value original, + Value replacement) { + if constexpr (!std::is_same_v) { + if (const T *lattice = solver.lookupState(original)) { + T *latticeReplacement = solver.getOrCreateState(replacement); + latticeReplacement->getValue() = lattice->getValue(); + } + } else { + // do nothing for liveness analysis for the moment except create the state + if (const auto *oldState = + solver.lookupState(original)) { + dataflow::Executable *newState = solver.getOrCreateState(replacement); + // Set to live if old state is live. We ignore change status. + if (oldState->isLive()) + (void)newState->setToLive(); + } + } +} + +/// A rewrite listener that transfers replacements to updates to the solver +/// state. 
+class SolverStateListener : public RewriterBase::Listener { +public: + SolverStateListener(DataFlowSolver &solver) + : RewriterBase::Listener(), solver(solver) {} + +private: + void notifyOperationReplaced(Operation *op, + ValueRange replacements) override { + for (auto [original, replacement] : + llvm::zip_equal(op->getResults(), replacements)) { + remapLatticeState(solver, original, replacement); + remapLatticeState>( + solver, original, replacement); + remapLatticeState(solver, original, replacement); + } + solver.eraseState(solver.getProgramPointAfter(op)); + } + void notifyOperationReplaced(Operation *op, Operation *replacement) override { + notifyOperationReplaced(op, replacement->getResults()); + } + + void notifyOperationErased(Operation *op) override { + solver.eraseState(solver.getProgramPointAfter(op)); + for (Value res : op->getResults()) + solver.eraseState(res); + } + + DataFlowSolver &solver; +}; + +} // namespace + +namespace { +struct AssignMemorySpacesPass + : public plan::impl::PlanAssignMemorySpacesPassBase< + AssignMemorySpacesPass> { + void runOnOperation() override { + + MLIRContext *context = &getContext(); + ConversionTarget target(*context); + + TypeConverter converter; + converter.addConversion( + [&](Type type) -> std::optional { return type; }); + + // The default tensor type converter just adds the 'device' memory type + // info. + auto deviceEncoding = + plan::MemorySpaceAttr::get(context, plan::MemorySpace::device); + converter.addConversion([&](RankedTensorType type) -> std::optional { + if (type.getEncoding()) + return type; + return RankedTensorType::get(type.getShape(), type.getElementType(), + deviceEncoding); + }); + + // Ops are legal if they are in a nested module or if their operand and + // result types are legal. 
+ target.markUnknownOpDynamicallyLegal([&](Operation *op) { + if (op->getParentWithTrait() != getOperation()) + return true; + return converter.isLegal(op->getOperandTypes()) && + converter.isLegal(op->getResultTypes()); + }); + target.addDynamicallyLegalOp([&](func::FuncOp op) { + if (op->getParentWithTrait() != getOperation()) + return true; + return converter.isSignatureLegal(op.getFunctionType()); + }); + target.markOpRecursivelyLegal( + [&](func::FuncOp op) -> std::optional { + if (op->getParentWithTrait() != getOperation()) + return true; + return false; + }); + target.addDynamicallyLegalOp([&](arith::ConstantOp op) { + if (op->getParentWithTrait() != getOperation()) + return true; + return converter.isLegal(op.getType()) && + converter.isLegal(op.getValue().getType()); + }); + + RewritePatternSet patterns(&getContext()); + patterns.add(converter, + context); + + // FuncOp is special as it has type encoding via attributes. + populateFunctionOpInterfaceTypeConversionPattern(patterns, + converter); + scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, + target); + + auto module = getOperation(); + if (failed(applyFullConversion(module, target, std::move(patterns)))) { + emitError(module.getLoc(), "failed to assign memory spaces"); + return signalPassFailure(); + } + + // Perform some minor optimizations involving tensor.from_elements. 
+ { + SymbolTableCollection symbolTables; + DataFlowSolver solver(DataFlowConfig().setInterprocedural(false)); + solver.load(); + solver.load(); + solver.load(symbolTables); + + if (failed(solver.initializeAndRun(getOperation()))) { + emitError(getOperation().getLoc()) + << "failed to run TensorKindAnalysis"; + return signalPassFailure(); + } + + SolverStateListener solverAwareListener(solver); + GreedyRewriteConfig config; + config.listener = &solverAwareListener; + FrozenRewritePatternSet patterns = [&]() { + RewritePatternSet patterns_(&getContext()); + patterns_.insert(&getContext(), solver); + patterns_.insert(&getContext()); + patterns_.insert(&getContext()); + return patterns_; + }(); + for (FunctionOpInterface func : + getOperation().getOps()) { + if (failed(applyPatternsGreedily(func, patterns))) { + emitError(func.getLoc()) << "failed to run " << getArgument(); + return signalPassFailure(); + } + } + } + } +}; +} // namespace diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt index a2953e40d..be4d4ffc3 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt +++ b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_tensorrt_library(MLIRTensorRTPlanTransforms AllocTensors.cpp + AssignMemorySpaces.cpp Bufferize.cpp Clustering.cpp CreateClosedRegions.cpp diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/ModuleBufferization/ModuleBufferization.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/ModuleBufferization/ModuleBufferization.cpp index 34ff09d1e..2b19ca948 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/ModuleBufferization/ModuleBufferization.cpp +++ b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/ModuleBufferization/ModuleBufferization.cpp @@ -29,6 +29,7 @@ #include "mlir/Dialect/Bufferization/Transforms/Transforms.h" #include 
"mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/DialectResourceBlobManager.h" #include "mlir/IR/SymbolTable.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" @@ -327,6 +328,36 @@ static LogicalResult insertTensorCopiesInModule( return insertTensorCopiesWithinModuleScope(module, state); } +/// The memref.global operation rejects encodings on the type of the +/// ElementsAttr. Drop them here. +/// TODO: fix upstream bufferization to handle this. +static void fixupMemrefGlobalInitialValueTypes(ModuleLikeOp moduleOp) { + for (memref::GlobalOp global : moduleOp.getOps()) { + ElementsAttr initialValue = + llvm::dyn_cast_or_null(global.getInitialValueAttr()); + if (!initialValue) + continue; + // Drop the encoding if present. + if (auto tensorType = dyn_cast(initialValue.getType())) { + if (auto encoding = tensorType.getEncoding()) { + tensorType = RankedTensorType::get(tensorType.getShape(), + tensorType.getElementType()); + if (auto elementsAttr = dyn_cast(initialValue)) { + initialValue = elementsAttr.reshape(tensorType); + global.setInitialValueAttr(initialValue); + continue; + } + if (auto resourceAttr = + dyn_cast(initialValue)) { + DenseResourceElementsHandle handle = resourceAttr.getRawHandle(); + initialValue = DenseResourceElementsAttr::get(tensorType, handle); + global.setInitialValueAttr(initialValue); + continue; + } + } + } + } +} static LogicalResult bufferizeOneModule(ModuleLikeOp moduleOp, const bufferization::OneShotBufferizationOptions &options, @@ -371,6 +402,10 @@ bufferizeOneModule(ModuleLikeOp moduleOp, return success(); if (failed(bufferizeOneModuleLikeOp(moduleOp, options, statistics))) return failure(); + + // Fixup any globals which have incorect encodings on the initial value type. 
+ fixupMemrefGlobalInitialValueTypes(moduleOp); + return success(); } diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/Passes.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/Passes.cpp index 6efa7f6d7..418833088 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/Passes.cpp +++ b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/Passes.cpp @@ -62,6 +62,7 @@ static void buildPlanOneShotBufferizePipelinePipeline( OpPassManager &pm, const plan::PlanAllocTensorsPassOptions &opts) { pm.addPass(createInlinerPass()); pm.addPass(bufferization::createEmptyTensorEliminationPass()); + pm.addPass(plan::createPlanAssignMemorySpacesPass()); pm.addPass(plan::createPlanAllocTensorsPass(opts)); pm.addPass(plan::createPlanModuleBufferizePass()); pm.addPass(mlir::createMemRefCastEliminationPass()); diff --git a/mlir-tensorrt/compiler/lib/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir-tensorrt/compiler/lib/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.cpp index 4adb541ee..c1a2d10d7 100644 --- a/mlir-tensorrt/compiler/lib/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir-tensorrt/compiler/lib/Dialect/TensorRTRuntime/Transforms/BufferizableOpInterfaceImpl.cpp @@ -85,7 +85,7 @@ struct EnqueueOpInterface bool bufferizesToElementwiseAccess(Operation *op, const bufferization::AnalysisState &state, ArrayRef opOperands) const { - return true; + return false; } /// Bufferize the `trtrt.enqueue` operation. 
diff --git a/mlir-tensorrt/compiler/test/CMakeLists.txt b/mlir-tensorrt/compiler/test/CMakeLists.txt index 4f3c7294d..b0ec5e0ce 100644 --- a/mlir-tensorrt/compiler/test/CMakeLists.txt +++ b/mlir-tensorrt/compiler/test/CMakeLists.txt @@ -22,6 +22,7 @@ configure_lit_site_cfg( set(MLIR_TENSORRT_TEST_DEPENDS_ FileCheck count not mlir-tensorrt-opt + mlir-tensorrt-runner mlir-tensorrt-translate ${MLIR_TENSORRT_TEST_LIBS} ) @@ -32,10 +33,6 @@ if(MLIR_TRT_TARGET_TENSORRT AND TensorRTTestPlugins) endif() -if(MLIR_TRT_TARGET_LUA) - list(APPEND MLIR_TENSORRT_TEST_DEPENDS_ mlir-tensorrt-runner) -endif() - if(MLIR_TRT_ENABLE_PYTHON) list(APPEND MLIR_TENSORRT_TEST_DEPENDS_ diff --git a/mlir-tensorrt/compiler/test/Dialect/Plan/assign-memory-spaces.mlir b/mlir-tensorrt/compiler/test/Dialect/Plan/assign-memory-spaces.mlir new file mode 100644 index 000000000..f87590198 --- /dev/null +++ b/mlir-tensorrt/compiler/test/Dialect/Plan/assign-memory-spaces.mlir @@ -0,0 +1,73 @@ +// RUN: mlir-tensorrt-opt %s -split-input-file --plan-assign-memory-spaces -canonicalize | FileCheck %s + + +func.func private @cond() -> i1 + +// CHECK-LABEL: func.func @scf_while_loop_2 +// CHECK: scf.while {{.*}}tensor<1xf32, #plan.memory_space>) -> tensor<1xf32, #plan.memory_space> +func.func @scf_while_loop_2(%arg0: f32) -> f32 { + %c0 = arith.constant 0 : index + %1 = tensor.from_elements %arg0 : tensor<1xf32> + %2 = scf.while (%arg1 = %1) : (tensor<1xf32>) -> tensor<1xf32> { + %cond = func.call @cond() : () -> i1 + %e = tensor.extract %arg1[%c0] : tensor<1xf32> + %f = arith.addf %e, %e : f32 + %3 = tensor.from_elements %f : tensor<1xf32> + scf.condition(%cond) %3 : tensor<1xf32> + } do { + ^bb0(%arg1: tensor<1xf32>): + %extract = tensor.extract %arg1[%c0] : tensor<1xf32> + %3 = arith.addf %extract, %extract : f32 + %4 = tensor.from_elements %3 : tensor<1xf32> + scf.yield %4 : tensor<1xf32> + } + %3 = tensor.extract %2[%c0] : tensor<1xf32> + return %3 : f32 +} + +// ----- + +// CHECK-LABEL: func.func 
@arith_constant +// CHECK: arith.constant {{.*}} : tensor<2xf32, #plan.memory_space> +// CHECK: arith.constant {{.*}} : tensor<2xf32, #plan.memory_space> +func.func @arith_constant() -> (tensor<2xf32>, tensor<2xf32>) { + %0 = arith.constant dense<[0.1, 0.2]> : tensor<2xf32> + %1 = arith.constant dense_resource<__elided__> : tensor<2xf32> + return %0, %1 : tensor<2xf32>, tensor<2xf32> +} + +// ----- + +// CHECK-LABEL: module @nested_module +// CHECK-NOT: #plan.memory_space +module @outer { +module @nested_module { + func.func @nested_func() -> tensor<2xf32> { + %0 = arith.constant dense<[0.1, 0.2]> : tensor<2xf32> + return %0 : tensor<2xf32> + } +} +} + +// ----- + +// CHECK-LABEL: func.func @existing_constraint_1 +// CHECK: tensor.extract {{.*}} +func.func @existing_constraint_1(%arg0: tensor<2xf32, #plan.memory_space>) -> f32 { + %c0 = arith.constant 0 : index + %0 = tensor.extract %arg0[%c0] : tensor<2xf32, #plan.memory_space> + return %0 : f32 +} + +// ----- + +// CHECK-LABEL: func.func @existing_constraint_2 +// CHECK-NOT: tensor.cast +// CHECK: tensor.extract {{.*}} +func.func @existing_constraint_2(%arg0: tensor<2xf32, #plan.memory_space>) -> f32 { + %c0 = arith.constant 0 : index + %1 = tensor.cast %arg0 : tensor<2xf32, #plan.memory_space> to tensor<2xf32> + %0 = tensor.extract %1[%c0] : tensor<2xf32> + return %0 : f32 +} + diff --git a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-alloc-tensors.mlir b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-alloc-tensors.mlir index f3d137e88..e70fc0011 100644 --- a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-alloc-tensors.mlir +++ b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-alloc-tensors.mlir @@ -280,11 +280,11 @@ func.func @test_dps_chain_repeat(%arg0: tensor<10xf32>) -> (tensor<10xf32>, tens // CHECK-LABEL: @test_dps_chain_repeat // CHECK-SAME: (%[[arg0:.+]]: tensor<10xf32>, %[[arg1:.+]]: tensor<10xf32> {plan.result_arg}, %[[arg2:.+]]: tensor<10xf32> {plan.result_arg}, %[[arg3:.+]]: tensor<10xf32> 
{plan.result_arg}) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK-NOT: bufferization.alloc_tensor() -// CHECK: %[[v0:.+]] = linalg.generic {{.*}} ins(%[[arg0]] : tensor<10xf32>) outs(%[[arg1]] : tensor<10xf32>) +// CHECK: %[[v0:.+]] = linalg.generic {{.*}} ins(%[[arg0]] : tensor<10xf32>) outs(%[[arg2]] : tensor<10xf32>) // CHECK: %[[v1:.+]] = linalg.generic {{.*}} ins(%[[arg0]] : tensor<10xf32>) outs(%[[v0]] : tensor<10xf32>) -// CHECK: %[[v2:.+]] = bufferization.materialize_in_destination %[[v0]] in %[[arg2]] : +// CHECK: %[[v2:.+]] = bufferization.materialize_in_destination %[[v1]] in %[[arg1]] : // CHECK-NEXT: %[[v3:.+]] = bufferization.materialize_in_destination %[[v0]] in %[[arg3]] : -// CHECK-NEXT: return %[[v1]], %[[v2]], %[[v3]] : tensor<10xf32>, tensor<10xf32>, tensor<10xf32> +// CHECK-NEXT: return %[[v2]], %[[v0]], %[[v3]] : tensor<10xf32>, tensor<10xf32>, tensor<10xf32> // CHECK-ALLOC-LABEL: @test_dps_chain_repeat // CHECK-ALLOC-SAME: (%[[arg0:.+]]: tensor<10xf32>) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) @@ -767,7 +767,7 @@ func.func @test_dps_complex_reshape_collapse_equivalent( // CHECK-LABEL: func.func @test_dps_complex_reshape_collapse_equivalent // CHECK-SAME: (%[[arg0:.+]]: tensor<2x3xcomplex, #plan.memory_space>, %[[arg1:.+]]: tensor<2x3xcomplex, #plan.memory_space>, %[[arg2:.+]]: tensor<6xcomplex, #plan.memory_space> {plan.result_arg}) // CHECK-DAG: %[[expanded:.+]] = tensor.expand_shape %[[arg2]] {{\[}}[0, 1]] output_shape [2, 3] : -// CHECK-DAG: %[[mapped:.+]] = linalg.map { complex.add } ins(%[[arg0]], %[[arg1]] : +// CHECK-DAG: %[[mapped:.+]] = linalg.map { complex.add } ins(%[[arg0]], %[[arg1]] : // CHECK-DAG: %[[collapsed:.+]] = tensor.collapse_shape %[[mapped]] // CHECK-DAG: return %[[collapsed]] @@ -847,8 +847,8 @@ func.func @test_dps_bitcast_not_equivalent( // CHECK-LABEL: func.func @test_dps_bitcast_not_equivalent // CHECK-SAME: (%[[arg0:.+]]: tensor<2xi32, #plan.memory_space>, %[[arg1:.+]]: tensor<2xi32, 
#plan.memory_space>, %[[arg2:.+]]: tensor<2xf32, #plan.memory_space> {plan.result_arg}) -// CHECK-DAG: %[[v0:.+]] = bufferization.alloc_tensor() +// CHECK-DAG: %[[v0:.+]] = bufferization.alloc_tensor() // CHECK-DAG: %[[mapped:.+]] = linalg.map {{.*}} ins(%[[arg0]], %[[arg1]] : {{.*}}) outs(%[[v0]] : {{.*}}) -// CHECK-DAG: %[[v1:.+]] = tensor.bitcast %[[mapped]] +// CHECK-DAG: %[[v1:.+]] = tensor.bitcast %[[mapped]] // CHECK-DAG: %[[v2:.+]] = bufferization.materialize_in_destination %[[v1]] in %[[arg2]] // CHECK-DAG: return %[[v2]] diff --git a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize-pipeline.mlir b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize-pipeline.mlir index c206b6c71..baf0a8c8f 100644 --- a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize-pipeline.mlir +++ b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize-pipeline.mlir @@ -83,3 +83,47 @@ func.func @small_host_and_device_tensor_constant(%arg0: tensor) -> (ten // CHECK: memref.dealloc %[[alloc]] : memref<4xindex, #plan.memory_space> // CHECK: memref.dealloc %[[alloc_0]] : memref<4xindex, #plan.memory_space> // CHECK: return + +// ----- + +func.func private @cond() -> i1 + +// The test case illustrates a while loop that for whatever reason may not +// have been "detensorized" earlier in the pipeline. The TensorKindAnalysis +// will show that all tensors are "host-only", but currently bufferization +// does not deduce this via its memory space inference logic. Therefore, the +// loop will be bufferized so that the buffers are in the device +// space at branch points, which means lots of copies are inserted. Before +// adding the 'plan-assign-memory-spaces' pass, we would get a failure here +// due to mixed types of init arg and yielded value inferred by bufferization. +// In the future, we can optimize this case by adding support for rewriting +// the encoding attribute of loop-carried tensors to be host for this case. 
+ +func.func @while_loop_host_tensor_carried(%arg0: f32) -> f32 { + %c0 = arith.constant 0 : index + %1 = tensor.from_elements %arg0 : tensor<1xf32> + %2 = scf.while (%arg1 = %1) : (tensor<1xf32>) -> tensor<1xf32> { + %cond = func.call @cond() : () -> i1 + %e = tensor.extract %arg1[%c0] : tensor<1xf32> + %f = arith.addf %e, %e : f32 + %3 = tensor.from_elements %f : tensor<1xf32> + scf.condition(%cond) %3 : tensor<1xf32> + } do { + ^bb0(%arg1: tensor<1xf32>): + %extract = tensor.extract %arg1[%c0] : tensor<1xf32> + %3 = arith.addf %extract, %extract : f32 + %4 = tensor.from_elements %3 : tensor<1xf32> + scf.yield %4 : tensor<1xf32> + } + %3 = tensor.extract %2[%c0] : tensor<1xf32> + return %3 : f32 +} + +// CHECK-LABEL: func.func @while_loop_host_tensor_carried +// CHECK: scf.while : () -> () +// CHECK-COUNT-2: memref.copy +// CHECK: scf.condition +// CHECK-COUNT-2: memref.copy +// CHECK: scf.yield +// CHECK-COUNT-1: memref.copy +// CHECK-NOT: memref.copy diff --git a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize.mlir b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize.mlir index c44b7a68b..f623b7cba 100644 --- a/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize.mlir +++ b/mlir-tensorrt/compiler/test/Dialect/Plan/plan-bufferize.mlir @@ -43,28 +43,39 @@ func.func @main(%arg0: tensor<10xf32>) -> tensor<1xf32> { // CHECK: memref.global "private" constant @__constant_1xf32 // CHECK: memref.global "private" constant @__constant_1xi32 -// CHECK-LABEL: @main -// CHECK-SAME: (%[[arg0:.+]]: memref<10xf32, #plan.memory_space>) -> memref<1xf32, #plan.memory_space> -// CHECK: %[[v0:.+]] = memref.get_global @__constant_1xi32 -// CHECK: %[[v1:.+]] = memref.get_global @__constant_1xf32 -// CHECK: %[[v2:.+]] = cuda.stream.create : !cuda.stream -// CHECK: %[[alloc:.+]] = memref.alloc() -// CHECK: %[[v3:.+]] = trtrt.get_function @trt_while_loop_region -// CHECK: %[[v4:.+]] = trtrt.get_function @trt_while_loop_region_0 -// CHECK: %[[alloc_0:.+]] = memref.alloc() 
-// CHECK: memref.copy %[[v0]], %[[alloc_0]] -// CHECK: %[[alloc_1:.+]] = memref.alloc() -// CHECK: memref.copy %[[v1]], %[[alloc_1]] -// CHECK: %[[v5:.+]]:2 = scf.while (%[[arg1:.+]] = %[[alloc_0]], %[[arg2:.+]] = %[[alloc_1]]) -// CHECK: trtrt.enqueue %[[v3]] stream(%[[v2]]) (%[[arg1]]) outs(%[[alloc]]) -// CHECK: %[[alloc_2:.+]] = memref.alloc() {{.*}} : memref> -// CHECK: memref.copy %[[alloc]], %[[alloc_2]] : memref> to memref> -// CHECK: %[[v6:.+]] = memref.load %[[alloc_2]][] : memref> -// CHECK: scf.condition(%[[v6]]) %[[arg1]], %[[arg2]] +// CHECK-LABEL: func.func @main +// CHECK-SAME: (%[[arg0:.+]]: memref<10xf32, #plan.memory_space>) +// CHECK-DAG: %[[v0:.+]] = memref.get_global @__constant_1xi32 : memref<1xi32, #plan.memory_space> +// CHECK-DAG: %[[v1:.+]] = memref.get_global @__constant_1xf32 : memref<1xf32, #plan.memory_space> +// CHECK-DAG: %[[v2:.+]] = cuda.stream.create : !cuda.stream +// CHECK-DAG: %[[alloc:.+]] = memref.alloc() {alignment = 16 : i64} : memref> +// CHECK-DAG: %[[v3:.+]] = trtrt.get_function @trt_while_loop_region : !trtrt.context +// CHECK-DAG: %[[v4:.+]] = trtrt.get_function @trt_while_loop_region_0 : !trtrt.context +// CHECK: %[[alloc_0:.+]] = memref.alloc() +// CHECK-DAG: memref.copy %[[v0]], %[[alloc_0]] +// CHECK-DAG: %[[alloc_1:.+]] = memref.alloc() +// CHECK-DAG: memref.copy %[[v1]], %[[alloc_1]] +// CHECK: %[[v5:.+]]:2 = scf.while (%[[arg1:.+]] = %[[alloc_0]], %[[arg2:.+]] = %[[alloc_1]]) : ({{.*}}) -> +// CHECK-DAG: trtrt.enqueue %[[v3]] stream(%[[v2]]) (%[[arg1]]) outs(%[[alloc]]) +// CHECK-DAG: %[[alloc_2:.+]] = memref.alloc() +// CHECK-DAG: memref.copy %[[alloc]], %[[alloc_2]] +// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[v6:.+]] = memref.load %[[alloc_2]][] +// CHECK-DAG: %[[alloc_3:.+]] = memref.alloc() +// CHECK-DAG: memref.copy %[[arg1]], %[[alloc_3]] +// CHECK-DAG: %[[alloc_4:.+]] = memref.alloc() +// CHECK-DAG: memref.copy %[[alloc_3]], %[[alloc_4]] +// CHECK-DAG: %[[alloc_5:.+]] = 
memref.alloc() +// CHECK-DAG: memref.copy %[[arg2]], %[[alloc_5]] +// CHECK-DAG: %[[alloc_6:.+]] = memref.alloc() {alignment = 16 : i64} : memref<1xf32, #plan.memory_space> +// CHECK-DAG: memref.copy %[[alloc_5]], %[[alloc_6]] : +// CHECK: scf.condition(%[[v6]]) %[[alloc_4]], %[[alloc_6]] : // CHECK: } do { -// CHECK: ^bb0(%[[arg1:.+]]: memref<1xi32, #plan.memory_space>, %[[arg2:.+]]: memref<1xf32, #plan.memory_space>): -// CHECK: trtrt.enqueue %[[v4]] stream(%[[v2]]) (%[[arg0]], %[[arg1]], %[[arg2]]) outs(%[[arg1]], %[[arg2]]) -// CHECK: scf.yield %[[arg1]], %[[arg2]] +// CHECK: ^bb0(%[[arg1:.+]]: memref<1xi32, #plan.memory_space>, %[[arg2:.+]]: memref< +// CHECK-DAG: %[[alloc_2:.+]] = memref.alloc() +// CHECK-DAG: %[[alloc_3:.+]] = memref.alloc() +// CHECK-DAG: trtrt.enqueue %[[v4]] stream(%[[v2]]) (%[[arg0]], %[[arg1]], %[[arg2]]) outs(%[[alloc_2]], %[[alloc_3]]) : +// CHECK: scf.yield %[[alloc_2]], %[[alloc_3]] : // CHECK: cuda.stream.sync %[[v2]] // CHECK: return %[[v5]]#1 diff --git a/mlir-tensorrt/compiler/test/Dialect/TensorRTRuntime/one-shot-bufferize.mlir b/mlir-tensorrt/compiler/test/Dialect/TensorRTRuntime/one-shot-bufferize.mlir index d5e579d48..13f783fa5 100644 --- a/mlir-tensorrt/compiler/test/Dialect/TensorRTRuntime/one-shot-bufferize.mlir +++ b/mlir-tensorrt/compiler/test/Dialect/TensorRTRuntime/one-shot-bufferize.mlir @@ -15,6 +15,22 @@ func.func @enqueue_simple( // ----- +func.func @enqueue_alias( + %ctx: !trtrt.context, %stream: !cuda.stream, + %arg0: tensor<1x3x256x256xf32>) -> tensor<1x3x256x256xf32> { + %0 = tensor.empty() : tensor<1x3x256x256xf32> + %3 = trtrt.enqueue %ctx stream(%stream) (%arg0) outs(%arg0) : (tensor<1x3x256x256xf32>) -> tensor<1x3x256x256xf32> + return %3 : tensor<1x3x256x256xf32> +} + +// CHECK-LABEL: func.func @enqueue_alias +// CHECK-SAME: (%[[arg0:.+]]: !trtrt.context, %[[arg1:.+]]: !cuda.stream, %[[arg2:.+]]: memref< +// CHECK: %[[alloc:.+]] = memref.alloc() +// CHECK: trtrt.enqueue %[[arg0]] stream(%[[arg1]]) 
(%[[arg2]]) outs(%[[alloc]]) +// CHECK: return %[[alloc]] : + +// ----- + func.func @enqueue_host_tensors_space_check( %ctx: !trtrt.context, %stream: !cuda.stream, %arg0: tensor<4xi32>, %arg1: tensor<128xf32>) -> tensor<128xf32> { diff --git a/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/lit.local.cfg b/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/lit.local.cfg index 79a2f2478..6d34f316c 100644 --- a/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/lit.local.cfg +++ b/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/lit.local.cfg @@ -1,7 +1,4 @@ if not config.target_tensorrt: config.unsupported = True -if not config.target_lua: - config.unsupported = True if not "host-has-at-least-1-gpus" in config.available_features: config.unsupported = True -config.parallelism_group = "non-collective" diff --git a/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/lit.local.cfg b/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/lit.local.cfg deleted file mode 100644 index 8e5a835a9..000000000 --- a/mlir-tensorrt/compiler/test/Target/Lua/IntegrationTests/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -if (not config.target_lua or - not config.target_lua): - config.unsupported = True diff --git a/mlir-tensorrt/compiler/test/Target/Lua/lit.local.cfg b/mlir-tensorrt/compiler/test/Target/Lua/lit.local.cfg deleted file mode 100644 index feaf6adf8..000000000 --- a/mlir-tensorrt/compiler/test/Target/Lua/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not config.target_lua: - config.unsupported = True diff --git a/mlir-tensorrt/compiler/test/lit.cfg.py b/mlir-tensorrt/compiler/test/lit.cfg.py index f94a170c1..56504d3c9 100644 --- a/mlir-tensorrt/compiler/test/lit.cfg.py +++ b/mlir-tensorrt/compiler/test/lit.cfg.py @@ -6,12 +6,13 @@ import sys from pathlib import Path -import lit.formats -import lit.util +from lit.LitConfig import LitConfig from lit.llvm 
import llvm_config from lit.llvm.subst import ToolSubst -from lit.LitConfig import LitConfig from lit.TestingConfig import TestingConfig +import lit.formats +import lit.util +import psutil config: TestingConfig = config # type: ignore lit_config: LitConfig = lit_config # type: ignore @@ -46,10 +47,21 @@ def load_gpu_tools_module(): gpu_tools = load_gpu_tools_module() -def estimate_paralllelism(mem_required: float) -> int: +def estimate_paralllelism( + gb_gpu_mem_required: float, gb_sys_mem_required: float +) -> int: try: + parallelism = 2 with gpu_tools.nvml_context() as devices: - return gpu_tools.estimate_parallelism_from_memory(devices, mem_required) + parallelism = gpu_tools.estimate_parallelism_from_memory( + devices, gb_gpu_mem_required + ) + return int( + min( + parallelism, + (psutil.virtual_memory().available / (1024**3)) // gb_sys_mem_required, + ) + ) except: return 2 @@ -69,11 +81,17 @@ def estimate_paralllelism(mem_required: float) -> int: ) config.substitutions.append(("%trt_lib_dir", config.tensorrt_lib_dir)) -# Setup the parallelism groups. -lit_config.parallelism_groups["non-collective"] = estimate_paralllelism(2.0) -lit_config.parallelism_groups["collective"] = 1 -lit_config.parallelism_groups["models"] = estimate_paralllelism(8.0) -lit_config.parallelism_group = None +# Setup the parallelism groups. Note that just instantiating the TRT builder +# requires ~2.5 GB of system memory, so we use 3.0 as a baseline limit. 
+lit_config.parallelism_groups["default"] = estimate_paralllelism( + 2.0, gb_sys_mem_required=3.0 +) +lit_config.parallelism_groups["models"] = estimate_paralllelism( + 8.0, gb_sys_mem_required=4.0 +) +lit_config.parallelism_groups["heavy"] = 1 + +lit_config.parallelism_group = "default" print(f"Parallelism Groups: {lit_config.parallelism_groups}", file=sys.stderr) diff --git a/mlir-tensorrt/compiler/test/lit.site.cfg.py.in b/mlir-tensorrt/compiler/test/lit.site.cfg.py.in index d8a048b00..c8fdafcd9 100644 --- a/mlir-tensorrt/compiler/test/lit.site.cfg.py.in +++ b/mlir-tensorrt/compiler/test/lit.site.cfg.py.in @@ -24,10 +24,7 @@ config.python_executable = "@Python3_EXECUTABLE@" config.enable_hlo = @MLIR_TRT_ENABLE_HLO@ config.target_tensorrt = @MLIR_TRT_TARGET_TENSORRT@ config.mlir_tensorrt_compile_time_version = "@MLIR_TRT_TENSORRT_VERSION@" -config.target_lua = @MLIR_TRT_TARGET_LUA@ config.enable_nccl = @MLIR_TRT_ENABLE_NCCL@ -config.target_lua = @MLIR_TRT_TARGET_LUA@ -config.enable_executor = @MLIR_TRT_ENABLE_EXECUTOR@ config.enable_asan = @ENABLE_ASAN@ config.tensorrt_lib_dir = "@MLIR_TRT_TENSORRT_LIB_DIR@" diff --git a/mlir-tensorrt/compiler/test/python/IntegrationTests/lit.local.cfg b/mlir-tensorrt/compiler/test/python/IntegrationTests/lit.local.cfg index 9e414568f..8e80f44d9 100644 --- a/mlir-tensorrt/compiler/test/python/IntegrationTests/lit.local.cfg +++ b/mlir-tensorrt/compiler/test/python/IntegrationTests/lit.local.cfg @@ -2,3 +2,5 @@ if not config.enable_bindings_python: config.unsupported = True if not "host-has-at-least-1-gpus" in config.available_features: config.unsupported = True + +config.parallelism_group = "heavy" diff --git a/mlir-tensorrt/compiler/test/python/mlir_tensorrt_compiler/compiler_api/test_compiler_debug_dump.py b/mlir-tensorrt/compiler/test/python/mlir_tensorrt_compiler/compiler_api/test_compiler_debug_dump.py index 17bda29d4..26fa48c8b 100644 --- 
a/mlir-tensorrt/compiler/test/python/mlir_tensorrt_compiler/compiler_api/test_compiler_debug_dump.py +++ b/mlir-tensorrt/compiler/test/python/mlir_tensorrt_compiler/compiler_api/test_compiler_debug_dump.py @@ -40,6 +40,8 @@ def compile_asm(ASM): f"--mlir-print-ir-tree-dir={mlir_tree_path.name}", f"--tensorrt-layer-info-dir={trt_path.name}", f"--tensorrt-engines-dir={trt_path.name}", + "--mlir-elide-elementsattrs-if-larger=1024", + "--mlir-elide-resource-strings-if-larger=1024", ], ) task.run(m.operation) diff --git a/mlir-tensorrt/compiler/tools/CMakeLists.txt b/mlir-tensorrt/compiler/tools/CMakeLists.txt index def21c3c3..fe43ec083 100644 --- a/mlir-tensorrt/compiler/tools/CMakeLists.txt +++ b/mlir-tensorrt/compiler/tools/CMakeLists.txt @@ -16,6 +16,4 @@ set(LLVM_LINK_COMPONENTS add_subdirectory(mlir-tensorrt-opt) add_subdirectory(mlir-tensorrt-translate) add_subdirectory(mlir-tensorrt-lsp-server) -if(MLIR_TRT_TARGET_LUA) - add_subdirectory(mlir-tensorrt-runner) -endif() +add_subdirectory(mlir-tensorrt-runner) diff --git a/mlir-tensorrt/compiler/tools/mlir-tensorrt-opt/mlir-tensorrt-opt.cpp b/mlir-tensorrt/compiler/tools/mlir-tensorrt-opt/mlir-tensorrt-opt.cpp index 63597bf24..3fb3a9205 100644 --- a/mlir-tensorrt/compiler/tools/mlir-tensorrt-opt/mlir-tensorrt-opt.cpp +++ b/mlir-tensorrt/compiler/tools/mlir-tensorrt-opt/mlir-tensorrt-opt.cpp @@ -23,9 +23,9 @@ //===----------------------------------------------------------------------===// #include "mlir-tensorrt-dialect/Target/Passes.h" #include "mlir-tensorrt-dialect/Target/TranslateToTensorRT.h" -#include "mlir-tensorrt/Registration/InitLLVMExtensions.h" -#include "mlir-tensorrt/Registration/RegisterMlirTensorRtDialects.h" -#include "mlir-tensorrt/Registration/RegisterMlirTensorRtPasses.h" +#include "mlir-tensorrt/InitAllDialects.h" +#include "mlir-tensorrt/InitAllExtensions.h" +#include "mlir-tensorrt/InitAllPasses.h" #include "mlir/Tools/mlir-opt/MlirOptMain.h" using namespace mlir; @@ -50,16 +50,15 @@ static 
void registerTestPasses() { int main(int argc, char **argv) { mlir::DialectRegistry registry; - mlir::registerAllMlirTensorRtDialects(registry); - - mlirtrt::registerConvertToLLVMExtensions(registry); + mlirtrt::compiler::registerAllDialects(registry); + mlirtrt::compiler::registerAllExtensions(registry); mlir::registerTestTensorRTShapeInferencePass(); #ifdef MLIR_TRT_TARGET_TENSORRT mlir::tensorrt::registerTensorRTTranslationCLOpts(); mlir::tensorrt::registerTensorRTTranslationPasses(); #endif - mlir::tensorrt::registerAllMlirTensorRtPasses(); + mlirtrt::compiler::registerAllPasses(); #ifdef MLIR_TRT_ENABLE_TESTING registerTestPasses(); #endif diff --git a/mlir-tensorrt/compiler/tools/mlir-tensorrt-translate/mlir-tensorrt-translate.cpp b/mlir-tensorrt/compiler/tools/mlir-tensorrt-translate/mlir-tensorrt-translate.cpp index 12cb5c6f0..a4621a0cc 100644 --- a/mlir-tensorrt/compiler/tools/mlir-tensorrt-translate/mlir-tensorrt-translate.cpp +++ b/mlir-tensorrt/compiler/tools/mlir-tensorrt-translate/mlir-tensorrt-translate.cpp @@ -21,6 +21,9 @@ // This file is the entry point for the `mlir-tensorrt-translate` tool. 
// //===----------------------------------------------------------------------===// +#include "mlir-executor/Target/Lua/TranslateToLua.h" +#include "mlir-executor/Target/Lua/TranslateToRuntimeExecutable.h" +#include "mlir-tensorrt/Features.h" #include "mlir/InitAllTranslations.h" #include "mlir/Tools/mlir-translate/MlirTranslateMain.h" @@ -28,23 +31,15 @@ #include "mlir-tensorrt-dialect/Target/TranslateToTensorRT.h" #endif // MLIR_TRT_TARGET_TENSORRT -#ifdef MLIR_TRT_TARGET_LUA -#include "mlir-executor/Target/Lua/TranslateToLua.h" -#include "mlir-executor/Target/Lua/TranslateToRuntimeExecutable.h" -#endif // MLIR_TRT_TARGET_LUA - int main(int argc, char **argv) { mlir::registerToCppTranslation(); - -#ifdef MLIR_TRT_TARGET_TENSORRT - mlir::tensorrt::registerTensorRTTranslationCLOpts(); - mlir::registerToTensorRTTranslation(); -#endif // MLIR_TRT_TARGET_TENSORRT - -#ifdef MLIR_TRT_TARGET_LUA mlir::registerToLuaTranslation(); mlir::registerToRuntimeExecutableTranslation(); -#endif // MLIR_TRT_TARGET_LUA + + IF_MLIR_TRT_TARGET_TENSORRT({ + mlir::tensorrt::registerTensorRTTranslationCLOpts(); + mlir::registerToTensorRTTranslation(); + }); return failed(mlir::mlirTranslateMain(argc, argv, "MLIR-TensorRT Translation Tool")) diff --git a/mlir-tensorrt/executor/CMakeLists.txt b/mlir-tensorrt/executor/CMakeLists.txt index 664ab5199..7a934de5f 100644 --- a/mlir-tensorrt/executor/CMakeLists.txt +++ b/mlir-tensorrt/executor/CMakeLists.txt @@ -15,7 +15,6 @@ mlir_executor_option(MLIR_EXECUTOR_ENABLE_NCCL "Enable use of NCCL in the runtim mlir_executor_option(MLIR_EXECUTOR_ENABLE_MPI "Enable use of MPI in the runtime" ON) mlir_executor_option(MLIR_EXECUTOR_ENABLE_TENSORRT "Enable TensorRT runtime module" ON) mlir_executor_option(MLIR_EXECUTOR_ENABLE_CUDA "Enable CUDA runtime module" ON) -mlir_executor_option(MLIR_EXECUTOR_TARGET_LUA "Enable Lua translations and runtime backend" ON) mlir_executor_option(MLIR_EXECUTOR_ENABLE_GPU_INTEGRATION_TESTS "Enable integration tests that 
require GPU" ON) #------------------------------------------------------------------------------- diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/CMakeLists.txt b/mlir-tensorrt/executor/lib/Runtime/Backend/CMakeLists.txt index b8ba504cd..16c80d4c2 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/CMakeLists.txt +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/CMakeLists.txt @@ -1,5 +1,3 @@ add_subdirectory(Common) add_subdirectory(C) -if(MLIR_EXECUTOR_TARGET_LUA) - add_subdirectory(Lua) -endif() +add_subdirectory(Lua) diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp index dfe6d9360..420e0846a 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp @@ -61,6 +61,7 @@ using namespace mlirtrt::runtime; static constexpr uint64_t kMinConstantBufferByteAlignment = 8; +#ifndef MLIR_EXECUTOR_ENABLE_NCCL /// If the runtime is not built with MLIR_EXECUTOR_ENABLE_NCCL, then this /// function registers default implementations for the required SPMD functions, /// reflecting that the executable is expected to run against a single fixed @@ -76,6 +77,7 @@ static void registerDefaultDeviceDependentMethods(lua_State *state, return deviceIdx; }; } +#endif // MLIR_EXECUTOR_ENABLE_NCCL namespace mlirtrt::runtime { void registerLuaCoreRuntimeExtension(); diff --git a/mlir-tensorrt/executor/lib/Target/Lua/TranslateToLua.cpp b/mlir-tensorrt/executor/lib/Target/Lua/TranslateToLua.cpp index 072b0414f..2918cf11b 100644 --- a/mlir-tensorrt/executor/lib/Target/Lua/TranslateToLua.cpp +++ b/mlir-tensorrt/executor/lib/Target/Lua/TranslateToLua.cpp @@ -842,9 +842,9 @@ LogicalResult LuaEmitter::emitBlock(Block &block, bool isEntryBlock) { // block if they are used outside of the block. 
for (Operation &op : otherBlock) { for (Value result : op.getResults()) { - bool usedOutside = - llvm::any_of(result.getUsers(), [&](Operation *userOp) { - return userOp->getBlock() != &otherBlock; + bool usedOutside = llvm::any_of( + result.getUsers(), [otherBlock = &otherBlock](Operation *userOp) { + return userOp->getBlock() != otherBlock; }); if (usedOutside) { getStream() << "local " << createLocalVariableName(result) diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/CMakeLists.txt b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/CMakeLists.txt index 441579e1b..60510e5d1 100644 --- a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/CMakeLists.txt +++ b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/CMakeLists.txt @@ -1,9 +1,6 @@ include_directories(${MLIR_INCLUDE_DIRS}) set(_TABLEGEN_ARGS -gen-pass-decls -name TensorRTTranslation) set(LLVM_TARGET_DEFINITIONS Passes.td) -if(MLIR_TRT_TARGET_LUA) - list(APPEND _TABLEGEN_ARGS -DMLIR_TRT_TARGET_LUA) -endif() mlir_tablegen(Passes.h.inc ${_TABLEGEN_ARGS}) add_public_tablegen_target(MLIRTensorRTTranslationPassIncGen) diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/TranslateToTensorRT.h b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/TranslateToTensorRT.h index 2d3b1e758..5c41798b5 100644 --- a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/TranslateToTensorRT.h +++ b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Target/TranslateToTensorRT.h @@ -41,21 +41,6 @@ class Operation; class Pass; namespace tensorrt { -/// A simple logger that implements TensorRT's logging interface. Errors and -/// warnings are reported stderr. If the 'verbose' flags is active, then all -/// messages are printed to stderr. 
-class Logger : public nvinfer1::ILogger { -public: - explicit Logger(bool verbose = false) : verbose(verbose) {} - -protected: - void log(Severity severity, const char *msg) noexcept override; - - /// Print only 'error' and 'warning' messages if false, otehrwise print all - /// messages. - bool verbose; -}; - /// A llvm::cl::opt parser for turning strings like "1024gb" into a number of /// bytes. Allowed suffixes are strings like 'gb', 'GiB', 'kb', 'mb', 'b' (case /// insensitive, we interpret both 'b|B' as meaning "byte"). This example comes @@ -128,10 +113,8 @@ struct TensorRTTranslationOptions { class TensorRTBuilderContext { private: TensorRTBuilderContext(TensorRTVersion version, int32_t cudaDevice, - std::unique_ptr logger, std::unique_ptr builder) - : version(version), cudaDevice(cudaDevice), logger(std::move(logger)), - builder(std::move(builder)) {} + : version(version), cudaDevice(cudaDevice), builder(std::move(builder)) {} public: /// Create a TensorRTBuilderContext from a log configuration and CUDA device @@ -157,14 +140,10 @@ class TensorRTBuilderContext { /// Return which CUDA device the builder is associated with. int32_t getCudaDeviceNumber() const { return cudaDevice; } - /// Return a handle to the logger. - const std::unique_ptr &getLogger() const { return logger; } - private: TensorRTVersion version; /// The CUDA device that this builder is associated with. 
int32_t cudaDevice; - std::unique_ptr logger; std::unique_ptr builder; }; diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Utils/NvInferAdaptor.h b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Utils/NvInferAdaptor.h index 987bca0b1..0b8539f0e 100644 --- a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Utils/NvInferAdaptor.h +++ b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/Utils/NvInferAdaptor.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -181,6 +182,7 @@ Weights trtSetWeights(WeightsMap &weightsMap, const char *name, const std::vector &w) { weightsMap[name] = std::vector(w.size() * sizeof(T)); std::vector &data = weightsMap[name]; + std::memcpy(data.data(), w.data(), w.size()); DataType dt = DataType::kFLOAT; if (std::is_same::value) { dt = DataType::kFLOAT; diff --git a/mlir-tensorrt/tensorrt/lib/Analysis/TensorKindAnalysis.cpp b/mlir-tensorrt/tensorrt/lib/Analysis/TensorKindAnalysis.cpp index 288f7e573..2a8419c6c 100644 --- a/mlir-tensorrt/tensorrt/lib/Analysis/TensorKindAnalysis.cpp +++ b/mlir-tensorrt/tensorrt/lib/Analysis/TensorKindAnalysis.cpp @@ -151,6 +151,11 @@ LogicalResult TensorKindAnalysis::visitOperation( return success(); } + if (auto tensorInsertOp = dyn_cast(op)) { + setInferredType(tensorInsertOp.getDestMutable(), TensorKind::Host); + return success(); + } + if (auto bufferizeOp = dyn_cast(op)) { // It has no tensor operands, nothing to do. 
if (!bufferizeOp.getCopy() || !bufferizeOp.getMemorySpace()) { @@ -158,7 +163,6 @@ LogicalResult TensorKindAnalysis::visitOperation( } if (auto memSpace = dyn_cast_or_null( bufferizeOp.getMemorySpaceAttr())) { - if (memSpace.getTensorKind().isHostOnly()) { setInferredType(bufferizeOp.getCopyMutable()[0], TensorKind::Device); return success(); diff --git a/mlir-tensorrt/tensorrt/lib/Target/TranslateToTensorRT.cpp b/mlir-tensorrt/tensorrt/lib/Target/TranslateToTensorRT.cpp index 3a07fdd69..404bfe7eb 100644 --- a/mlir-tensorrt/tensorrt/lib/Target/TranslateToTensorRT.cpp +++ b/mlir-tensorrt/tensorrt/lib/Target/TranslateToTensorRT.cpp @@ -49,6 +49,7 @@ #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/Threading.h" #include "llvm/Support/ToolOutputFile.h" +#include #define DEBUG_TYPE "translate-to-tensorrt" #define DBGS() llvm::dbgs() << "[" DEBUG_TYPE "] " @@ -63,6 +64,61 @@ namespace tensorrt { using namespace mlir; using namespace mlir::tensorrt; +//===----------------------------------------------------------------------===// +// Global TensorRT Logger +//===----------------------------------------------------------------------===// + +namespace { +/// A simple logger that implements TensorRT's logging interface. Errors and +/// warnings are reported stderr. If the 'verbose' flags is active, then all +/// messages are printed to stderr. +class Logger : public nvinfer1::ILogger { +public: + static Logger &getInstance(bool verbose) { + static Logger instance; + instance.setVerbose(verbose); + return instance; + } + + void setVerbose(bool verbose) { + std::scoped_lock guard(lock); + this->verbose = verbose; + } + +protected: + Logger() = default; + Logger(const Logger &) = delete; + Logger &operator=(const Logger &) = delete; + + void log(Severity severity, const char *msg) noexcept override; + + /// Print only 'error' and 'warning' messages if false, otehrwise print all + /// messages. 
+ bool verbose; + + std::mutex lock; +}; +} // namespace + +void Logger::log(Severity severity, const char *msg) noexcept { + if (verbose) { + std::scoped_lock g(lock); + llvm::errs() << msg << "\n"; + return; + } + + if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR || + severity == Severity::kWARNING) { + std::scoped_lock g(lock); + llvm::errs() << msg << "\n"; + return; + } +} + +//===----------------------------------------------------------------------===// +// ByteSizeParser +//===----------------------------------------------------------------------===// + bool ByteSizeParser::parse(llvm::cl::Option &option, StringRef argName, StringRef arg, std::optional &val) { val = std::nullopt; @@ -243,23 +299,6 @@ TensorRTTranslationOptions TensorRTTranslationOptions::fromCLFlags() { return options; } -//===----------------------------------------------------------------------===// -// Logger -//===----------------------------------------------------------------------===// - -void tensorrt::Logger::log(Severity severity, const char *msg) noexcept { - if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR) { - llvm::errs() << msg << "\n"; - return; - } - if (severity == Severity::kWARNING) { - llvm::errs() << msg << "\n"; - return; - } - if (verbose) - llvm::errs() << msg << "\n"; -} - //===----------------------------------------------------------------------===// // TensorRTBuilderContext //===----------------------------------------------------------------------===// @@ -292,17 +331,13 @@ TensorRTBuilderContext::create(bool verbose, int32_t cudaDevice) { if (status != cudaSuccess) return failure(); - auto logger = std::make_unique(verbose); - if (!logger) - return failure(); - auto builder = std::unique_ptr( - nvinfer1::createInferBuilder(*logger)); + nvinfer1::createInferBuilder(Logger::getInstance(verbose))); if (!builder) return failure(); - return std::shared_ptr(new TensorRTBuilderContext( - version, cudaDevice, 
std::move(logger), std::move(builder))); + return std::shared_ptr( + new TensorRTBuilderContext(version, cudaDevice, std::move(builder))); } //===----------------------------------------------------------------------===// @@ -830,7 +865,8 @@ class TranslateToTensorRTEnginePass if (!translationOptions->saveTensorRTLayerInfoDirectory.empty()) { std::unique_ptr runtime{ - nvinfer1::createInferRuntime(*builderContext->getLogger())}; + nvinfer1::createInferRuntime( + Logger::getInstance(translationOptions->enableVerboseLogs))}; std::unique_ptr cudaEngine{ runtime->deserializeCudaEngine(serializedEngine->data(), serializedEngine->size())}; diff --git a/mlir-tensorrt/tensorrt/test/lit.cfg.py b/mlir-tensorrt/tensorrt/test/lit.cfg.py index a98a6b41e..32e80a712 100644 --- a/mlir-tensorrt/tensorrt/test/lit.cfg.py +++ b/mlir-tensorrt/tensorrt/test/lit.cfg.py @@ -5,9 +5,9 @@ import lit.formats import lit.util - from lit.llvm import llvm_config from lit.llvm.subst import ToolSubst +import psutil # Configuration file for the 'lit' test runner. @@ -114,14 +114,27 @@ def all_gpus_have_fp8_support() -> bool: config.available_features.add("no-asan") -def estimate_parallelism(mem_required: float) -> int: +def estimate_paralllelism( + gb_gpu_mem_required: float, gb_sys_mem_required: float +) -> int: try: + parallelism = 2 with gpu_tools.nvml_context() as devices: - return gpu_tools.estimate_parallelism_from_memory(devices, mem_required) + parallelism = gpu_tools.estimate_parallelism_from_memory( + devices, gb_gpu_mem_required + ) + return int( + min( + parallelism, + (psutil.virtual_memory().available / (1024**3)) // gb_sys_mem_required, + ) + ) except: - return 1 + return 2 # Setup the parallelism groups. -lit_config.parallelism_groups["translation-tests"] = estimate_parallelism(8.0) +lit_config.parallelism_groups["translation-tests"] = estimate_paralllelism( + 8.0, gb_sys_mem_required=3.0 +) lit_config.parallelism_group = None