QiJune
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
Lines changed: 13 additions & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/cpp/kernels/xqa/CMakeLists.txt b/cpp/kernels/xqa/CMakeLists.txt
Lines changed: 23 additions & 14 deletions
diff --git a/cpp/kernels/xqa/barriers.cuh b/cpp/kernels/xqa/barriers.cuh
Lines changed: 1 addition & 1 deletion
@@ -101,12 +101,18 @@ Developer workflow for code contributions is as follows:
 
 PR titles in TensorRT-LLM follow the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) specification. If the PR includes an API change that might break user code or API usage, consider adding "BREAKING CHANGE" to the title so that reviewers know what to expect. Additionally, if the PR is not related to any bug or task, consider using "chore" or "None" as the placeholder type.
 
+> [!IMPORTANT]
+> For NVIDIA developers, please include the JIRA number or NVBUG ID in the PR title whenever possible.
+
 Good PR Titles Examples:
 * feat: Add support for starcoder-v2 FP8 base + FP16/BF16 LoRA
 * BREAKING CHANGE: Set default max batch size to 2048
 * chore: Remove version from plugins .so
 * None: Stringized enums for better error msgs
 * fix https://github.com/NVIDIA/TensorRT-LLM/issues/700: a Memory leak issue in C++ runtime
+* [TRTLLM-5516] perf: replicate dummy request for cuda graph padding (**NVIDIAN only**)
+* [nvbug/5334370] fix: Fix one model EAGLE3 (**NVIDIAN only**)
+
 
 This is important for tracking which changes were submitted to which release, and it makes it easier for others to follow the related bugs or tasks. It also helps when compiling GitHub release announcements.
 
@@ -118,6 +124,13 @@ In the PR description, please consider addressing these points:
 * Potential performance or functional impacts of the changes. If there are risks, please inform the reviewers.
 * Link to the related PRs.
 
+> [!IMPORTANT]
+> For NVIDIA developers, please submit features or bug fixes to the dedicated branch specified in the
+> **Keywords** field of the nvbug. For example, if a bug is reported on the release/v0.20 branch, please
+> submit the fix to `release/v0.20` instead of the main branch.
+
+In addition, please add the "release blocker" label to any PR that could potentially delay a release.
+
 
 ## Tests and Code Review for Protected APIs
 
 
@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.0-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.21.0rc2-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-0.21.0rc3-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/torch/arch_overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
 
@@ -84,21 +84,30 @@ add_custom_command(
 add_custom_target(xqa_sources_h DEPENDS ${XQA_SOURCES_H})
 
 if(BUILD_XQA_TESTS)
-  # GoogleTest Preparation - Code block copied from
-  # https://google.github.io/googletest/quickstart-cmake.html
-  include(FetchContent)
-  FetchContent_Declare(
-    googletest
-    GIT_REPOSITORY https://github.com/google/googletest.git
-    GIT_TAG v1.15.2)
-  include(GoogleTest)
+  # Try to find system installed GTest first
+  find_package(GTest QUIET)
+  if(NOT GTest_FOUND)
+    message(STATUS "System GTest not found, fetching from repository")
+    include(FetchContent)
+    FetchContent_Declare(
+      googletest
+      GIT_REPOSITORY https://github.com/google/googletest.git
+      GIT_TAG v1.15.2)
+    FetchContent_MakeAvailable(googletest)
+    include(GoogleTest)
+  endif()
 
-  # Add Eigen via FetchContent
-  FetchContent_Declare(
-    eigen
-    GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git
-    GIT_TAG 3.4.0)
-  FetchContent_MakeAvailable(googletest eigen)
+  # Try to find system installed Eigen first
+  find_package(Eigen3 3.4 QUIET)
+  if(NOT Eigen3_FOUND)
+    message(STATUS "System Eigen not found, fetching from repository")
+    include(FetchContent)
+    FetchContent_Declare(
+      eigen
+      GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git
+      GIT_TAG 3.4.0)
+    FetchContent_MakeAvailable(eigen)
+  endif()
 
   enable_testing()
   add_executable(
 
@@ -434,7 +434,7 @@ using CtaBarrier = MBarrier<Scope::CTA>;
 using CgaBarrier = MBarrier<Scope::CGA>;
 
 template <uint32_t nbBars>
-__device__ inline bool toParity(uint32_t i)
+__device__ inline constexpr bool toParity(uint32_t i)
 {
     return i % (nbBars * 2) / nbBars;
 }
Original file line number	Diff line number	Diff line change
`@@ -434,7 +434,7 @@ using CtaBarrier = MBarrier<Scope::CTA>;`
`434`	`434`	`using CgaBarrier = MBarrier<Scope::CGA>;`
`435`	`435`
`436`	`436`	`template <uint32_t nbBars>`
`437`		`-__device__ inline bool toParity(uint32_t i)`
	`437`	`+__device__ inline constexpr bool toParity(uint32_t i)`
`438`	`438`	`{`
`439`	`439`	`return i % (nbBars * 2) / nbBars;`
`440`	`440`	`}`