fix: nanobind bindings

Funatiq · Funatiq · commit 7654d08bbf35 · 2025-08-01T10:36:17.000Z
Signed-off-by: Robin Kobus &lt;19427718+Funatiq@users.noreply.github.com&gt;
diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
@@ -6,6 +6,7 @@ set(TRTLLM_NB_MODULE
 set(SRCS
     batch_manager/algorithms.cpp
     batch_manager/bindings.cpp
+    batch_manager/buffers.cpp
     batch_manager/cacheTransceiver.cpp
     batch_manager/kvCacheManager.cpp
     batch_manager/llmRequest.cpp
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -384,39 +384,6 @@ void initBindings(nb::module_& m)
         .def(nb::init<tr::SizeType32, tr::ModelConfig, tr::WorldConfig, tr::BufferManager>(),
             nb::arg("max_num_sequences"), nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager"));
 
-    nb::class_<tb::DecoderInputBuffers>(m, "DecoderInputBuffers")
-        .def(nb::init<runtime::SizeType32, runtime::SizeType32, tr::BufferManager>(), nb::arg("max_batch_size"),
-            nb::arg("max_tokens_per_engine_step"), nb::arg("manager"))
-        .def_rw("setup_batch_slots", &tb::DecoderInputBuffers::setupBatchSlots)
-        .def_rw("setup_batch_slots_device", &tb::DecoderInputBuffers::setupBatchSlotsDevice)
-        .def_rw("fill_values", &tb::DecoderInputBuffers::fillValues)
-        .def_rw("fill_values_device", &tb::DecoderInputBuffers::fillValuesDevice)
-        .def_rw("inputs_ids", &tb::DecoderInputBuffers::inputsIds)
-        .def_rw("forward_batch_slots", &tb::DecoderInputBuffers::forwardBatchSlots)
-        .def_rw("logits", &tb::DecoderInputBuffers::logits)
-        .def_rw("decoder_requests", &tb::DecoderInputBuffers::decoderRequests);
-
-    nb::class_<tb::DecoderOutputBuffers>(m, "DecoderOutputBuffers")
-        .def_rw("sequence_lengths_host", &tb::DecoderOutputBuffers::sequenceLengthsHost)
-        .def_rw("finished_sum_host", &tb::DecoderOutputBuffers::finishedSumHost)
-        .def_prop_ro("new_output_tokens_host",
-            [](tb::DecoderOutputBuffers& self) { return tr::Torch::tensor(self.newOutputTokensHost); })
-        .def_rw("cum_log_probs_host", &tb::DecoderOutputBuffers::cumLogProbsHost)
-        .def_rw("log_probs_host", &tb::DecoderOutputBuffers::logProbsHost)
-        .def_rw("finish_reasons_host", &tb::DecoderOutputBuffers::finishReasonsHost);
-
-    nb::class_<tb::SlotDecoderBuffers>(m, "SlotDecoderBuffers")
-        .def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::BufferManager const&>(),
-            nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager"))
-        .def_rw("output_ids", &tb::SlotDecoderBuffers::outputIds)
-        .def_rw("output_ids_host", &tb::SlotDecoderBuffers::outputIdsHost)
-        .def_rw("sequence_lengths_host", &tb::SlotDecoderBuffers::sequenceLengthsHost)
-        .def_rw("cum_log_probs", &tb::SlotDecoderBuffers::cumLogProbs)
-        .def_rw("cum_log_probs_host", &tb::SlotDecoderBuffers::cumLogProbsHost)
-        .def_rw("log_probs", &tb::SlotDecoderBuffers::logProbs)
-        .def_rw("log_probs_host", &tb::SlotDecoderBuffers::logProbsHost)
-        .def_rw("finish_reasons_host", &tb::SlotDecoderBuffers::finishReasonsHost);
-
     m.def(
         "add_new_tokens_to_requests",
         [](std::vector<std::shared_ptr<tb::LlmRequest>>& requests,
@@ -435,10 +402,10 @@ void initBindings(nb::module_& m)
 
     m.def(
         "make_decoding_batch_input",
-        [](std::vector<std::shared_ptr<tb::LlmRequest>>& contextRequests,
-            std::vector<std::shared_ptr<tb::LlmRequest>>& genRequests, tr::ITensor::SharedPtr logits, int beamWidth,
-            std::vector<int> const& numContextLogitsPrefixSum, tb::DecoderInputBuffers const& decoderInputBuffers,
-            runtime::decoder::DecoderState& decoderState, tr::BufferManager const& manager)
+        [](tb::DecoderInputBuffers& decoderInputBuffers, runtime::decoder::DecoderState& decoderState,
+            std::vector<std::shared_ptr<tb::LlmRequest>> const& contextRequests,
+            std::vector<std::shared_ptr<tb::LlmRequest>> const& genRequests, tr::ITensor::SharedPtr const& logits,
+            int beamWidth, std::vector<int> const& numContextLogitsPrefixSum, tr::BufferManager const& manager)
         {
             std::vector<int> activeSlots;
             std::vector<int> generationSteps;
@@ -496,21 +463,18 @@ void initBindings(nb::module_& m)
                 batchSlotsRange[i] = activeSlots[i];
             }
 
-            auto decodingInput = std::make_unique<tr::decoder_batch::Input>(logitsVec, 1);
-            decodingInput->batchSlots = batchSlots;
+            decoderInputBuffers.batchLogits = logitsVec;
 
             auto const maxBeamWidth = decoderState.getMaxBeamWidth();
             if (maxBeamWidth > 1)
             {
                 // For Variable-Beam-Width-Search
                 decoderState.getJointDecodingInput().generationSteps = generationSteps;
             }
-
-            return decodingInput;
         },
-        nb::arg("context_requests"), nb::arg("generation_requests"), nb::arg("logits"), nb::arg("beam_width"),
-        nb::arg("num_context_logits_prefix_sum"), nb::arg("decoder_input_buffers"), nb::arg("decoder_state"),
-        nb::arg("buffer_manager"), "Make decoding batch input.");
+        nb::arg("decoder_input_buffers"), nb::arg("decoder_state"), nb::arg("context_requests"),
+        nb::arg("generation_requests"), nb::arg("logits"), nb::arg("beam_width"),
+        nb::arg("num_context_logits_prefix_sum"), nb::arg("buffer_manager"), "Make decoding batch input.");
 }
 
 } // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp
@@ -0,0 +1,73 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "buffers.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
+#include "tensorrt_llm/nanobind/batch_manager/llmRequest.h"
+
+#include <ATen/ATen.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/operators.h>
+#include <torch/extension.h>
+
+namespace nb = nanobind;
+namespace tb = tensorrt_llm::batch_manager;
+namespace tr = tensorrt_llm::runtime;
+
+using tr::SizeType32;
+
+namespace tensorrt_llm::nanobind::batch_manager
+{
+
+void Buffers::initBindings(nb::module_& m)
+{
+    nb::class_<tb::DecoderInputBuffers>(m, "DecoderInputBuffers")
+        .def(nb::init<runtime::SizeType32, runtime::SizeType32, tr::BufferManager>(), nb::arg("max_batch_size"),
+            nb::arg("max_tokens_per_engine_step"), nb::arg("manager"))
+        .def_rw("setup_batch_slots", &tb::DecoderInputBuffers::setupBatchSlots)
+        .def_rw("setup_batch_slots_device", &tb::DecoderInputBuffers::setupBatchSlotsDevice)
+        .def_rw("fill_values", &tb::DecoderInputBuffers::fillValues)
+        .def_rw("fill_values_device", &tb::DecoderInputBuffers::fillValuesDevice)
+        .def_rw("inputs_ids", &tb::DecoderInputBuffers::inputsIds)
+        .def_rw("forward_batch_slots", &tb::DecoderInputBuffers::forwardBatchSlots)
+        .def_rw("decoder_logits", &tb::DecoderInputBuffers::decoderLogits)
+        .def_rw("decoder_requests", &tb::DecoderInputBuffers::decoderRequests);
+
+    nb::class_<tb::DecoderOutputBuffers>(m, "DecoderOutputBuffers")
+        .def_rw("sequence_lengths_host", &tb::DecoderOutputBuffers::sequenceLengthsHost)
+        .def_rw("finished_sum_host", &tb::DecoderOutputBuffers::finishedSumHost)
+        .def_prop_ro("new_output_tokens_host",
+            [](tb::DecoderOutputBuffers& self) { return tr::Torch::tensor(self.newOutputTokensHost); })
+        .def_rw("cum_log_probs_host", &tb::DecoderOutputBuffers::cumLogProbsHost)
+        .def_rw("log_probs_host", &tb::DecoderOutputBuffers::logProbsHost)
+        .def_rw("finish_reasons_host", &tb::DecoderOutputBuffers::finishReasonsHost);
+
+    nb::class_<tb::SlotDecoderBuffers>(m, "SlotDecoderBuffers")
+        .def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::BufferManager const&>(),
+            nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager"))
+        .def_rw("output_ids", &tb::SlotDecoderBuffers::outputIds)
+        .def_rw("output_ids_host", &tb::SlotDecoderBuffers::outputIdsHost)
+        .def_rw("sequence_lengths_host", &tb::SlotDecoderBuffers::sequenceLengthsHost)
+        .def_rw("cum_log_probs", &tb::SlotDecoderBuffers::cumLogProbs)
+        .def_rw("cum_log_probs_host", &tb::SlotDecoderBuffers::cumLogProbsHost)
+        .def_rw("log_probs", &tb::SlotDecoderBuffers::logProbs)
+        .def_rw("log_probs_host", &tb::SlotDecoderBuffers::logProbsHost)
+        .def_rw("finish_reasons_host", &tb::SlotDecoderBuffers::finishReasonsHost);
+}
+} // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h
@@ -0,0 +1,30 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::batch_manager
+{
+class Buffers
+{
+public:
+    static void initBindings(nb::module_& m);
+};
+} // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/bindings.cpp b/cpp/tensorrt_llm/nanobind/bindings.cpp
@@ -33,6 +33,7 @@
 #include "tensorrt_llm/common/quantization.h"
 #include "tensorrt_llm/nanobind/batch_manager/algorithms.h"
 #include "tensorrt_llm/nanobind/batch_manager/bindings.h"
+#include "tensorrt_llm/nanobind/batch_manager/buffers.h"
 #include "tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h"
 #include "tensorrt_llm/nanobind/batch_manager/kvCacheManager.h"
 #include "tensorrt_llm/nanobind/batch_manager/llmRequest.h"
@@ -469,6 +470,7 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
         .def_prop_ro("pinned", &tr::MemoryCounters::getPinned)
         .def_prop_ro("uvm", &tr::MemoryCounters::getUVM);
 
+    tpb::Buffers::initBindings(mInternalBatchManager);
     tensorrt_llm::nanobind::runtime::initBindings(mInternalRuntime);
     tensorrt_llm::nanobind::testing::initBindings(mInternalTesting);
     tpb::initBindings(mInternalBatchManager);
diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
@@ -116,10 +116,6 @@ void initBindings(nb::module_& m)
         .def_rw("scaling_vec_pointer", &tr::LoraCache::TaskLayerModuleConfig::scalingVecPointer)
         .def(nb::self == nb::self);
 
-    nb::class_<tr::BufferManager>(m, "BufferManager")
-        .def(nb::init<tr::BufferManager::CudaStreamPtr, bool>(), nb::arg("stream"), nb::arg("trim_pool") = false)
-        .def_prop_ro("stream", &tr::BufferManager::getStream);
-
     nb::class_<tr::TllmRuntime>(m, "TllmRuntime")
         .def(
             "__init__",
@@ -157,14 +153,6 @@ void initBindings(nb::module_& m)
         .def_prop_ro("logits_dtype_from_engine",
             [](tr::TllmRuntime& self) { return self.getEngine().getTensorDataType("logits"); });
 
-    nb::class_<tr::decoder_batch::Input>(m, "DecoderBatchInput")
-        .def(nb::init<std::vector<std::vector<tr::ITensor::SharedConstPtr>>, tr::SizeType32>(), nb::arg("logits"),
-            nb::arg("max_decoding_engine_tokens"))
-        .def(nb::init<std::vector<tr::ITensor::SharedConstPtr>>(), nb::arg("logits"))
-        .def_rw("logits", &tr::decoder_batch::Input::logits)
-        .def_rw("max_decoder_steps", &tr::decoder_batch::Input::maxDecoderSteps)
-        .def_rw("batch_slots", &tr::decoder_batch::Input::batchSlots);
-
     nb::class_<tr::LookaheadDecodingBuffers>(m, "LookaheadDecodingBuffers")
         .def(nb::init<tr::SizeType32, tr::SizeType32, tr::BufferManager const&>(), nb::arg("max_num_sequences"),
             nb::arg("max_tokens_per_step"), nb::arg("buffer_manager"))
@@ -343,6 +331,10 @@ void initBindings(nb::module_& m)
 
 void initBindingsEarly(nb::module_& m)
 {
+    nb::class_<tr::BufferManager>(m, "BufferManager")
+        .def(nb::init<tr::BufferManager::CudaStreamPtr, bool>(), nb::arg("stream"), nb::arg("trim_pool") = false)
+        .def_prop_ro("stream", &tr::BufferManager::getStream);
+
     nb::class_<tr::SpeculativeDecodingMode>(m, "SpeculativeDecodingMode")
         .def(nb::init<tr::SpeculativeDecodingMode::UnderlyingType>(), nb::arg("state"))
         .def_static("NoneType", &tr::SpeculativeDecodingMode::None)