2121#include " tensorrt_llm/nanobind/common/customCasters.h"
2222#include " tensorrt_llm/runtime/cudaStream.h"
2323#include " tensorrt_llm/runtime/utils/mpiUtils.h"
24+ #include < cstdint>
2425#include < nanobind/nanobind.h>
2526#include < nanobind/stl/function.h>
2627#include < nanobind/stl/map.h>
@@ -111,11 +112,11 @@ void initConfigBindings(nb::module_& m)
111112 self.getSinkTokenLength (), self.getFreeGpuMemoryFraction (), self.getHostCacheSize (),
112113 self.getOnboardBlocks (), self.getCrossKvCacheFraction (), self.getSecondaryOffloadMinPriority (),
113114 self.getEventBufferMaxSize (), self.getEnablePartialReuse (), self.getCopyOnPartialReuse (), self.getUseUvm (),
114- self.getAttentionDpEventsGatherPeriodMs ());
115+ self.getAttentionDpEventsGatherPeriodMs (), self. getMaxGpuTotalBytes () );
115116 };
116117 auto kvCacheConfigSetstate = [](tle::KvCacheConfig& self, nb::tuple const & state)
117118 {
118- if (state.size () != 14 )
119+ if (state.size () != 15 )
119120 {
120121 throw std::runtime_error (" Invalid state!" );
121122 }
@@ -125,20 +126,21 @@ void initConfigBindings(nb::module_& m)
125126 nb::cast<bool >(state[6 ]), nb::cast<std::optional<float >>(state[7 ]),
126127 nb::cast<std::optional<tle::RetentionPriority>>(state[8 ]), nb::cast<size_t >(state[9 ]),
127128 nb::cast<bool >(state[10 ]), nb::cast<bool >(state[11 ]), nb::cast<bool >(state[12 ]),
128- nb::cast<SizeType32>(state[13 ]));
129+ nb::cast<SizeType32>(state[13 ]), std:: nullopt , nb::cast< uint64_t >(state[ 14 ]) );
129130 };
130131 nb::class_<tle::KvCacheConfig>(m, " KvCacheConfig" )
131132 .def (nb::init<bool , std::optional<SizeType32> const &, std::optional<std::vector<SizeType32>> const &,
132133 std::optional<SizeType32> const &, std::optional<float > const &, std::optional<size_t > const &, bool ,
133134 std::optional<float > const &, std::optional<tle::RetentionPriority>, size_t const &, bool , bool , bool ,
134- SizeType32, std::optional<RuntimeDefaults> const &>(),
135+ SizeType32, std::optional<RuntimeDefaults> const &, uint64_t const & >(),
135136 nb::arg (" enable_block_reuse" ) = true , nb::arg (" max_tokens" ) = nb::none (),
136137 nb::arg (" max_attention_window" ) = nb::none (), nb::arg (" sink_token_length" ) = nb::none (),
137138 nb::arg (" free_gpu_memory_fraction" ) = nb::none (), nb::arg (" host_cache_size" ) = nb::none (),
138139 nb::arg (" onboard_blocks" ) = true , nb::arg (" cross_kv_cache_fraction" ) = nb::none (),
139140 nb::arg (" secondary_offload_min_priority" ) = nb::none (), nb::arg (" event_buffer_max_size" ) = 0 , nb::kw_only (),
140141 nb::arg (" enable_partial_reuse" ) = true , nb::arg (" copy_on_partial_reuse" ) = true , nb::arg (" use_uvm" ) = false ,
141- nb::arg (" attention_dp_events_gather_period_ms" ) = 5 , nb::arg (" runtime_defaults" ) = nb::none ())
142+ nb::arg (" attention_dp_events_gather_period_ms" ) = 5 , nb::arg (" runtime_defaults" ) = nb::none (),
143+ nb::arg (" max_gpu_total_bytes" ) = 0 )
142144 .def_prop_rw (
143145 " enable_block_reuse" , &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse)
144146 .def_prop_rw (" max_tokens" , &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens)
@@ -163,6 +165,8 @@ void initConfigBindings(nb::module_& m)
163165 .def_prop_rw (" use_uvm" , &tle::KvCacheConfig::getUseUvm, &tle::KvCacheConfig::setUseUvm)
164166 .def_prop_rw (" attention_dp_events_gather_period_ms" , &tle::KvCacheConfig::getAttentionDpEventsGatherPeriodMs,
165167 &tle::KvCacheConfig::setAttentionDpEventsGatherPeriodMs)
168+ .def_prop_rw (
169+ " max_gpu_total_bytes" , &tle::KvCacheConfig::getMaxGpuTotalBytes, &tle::KvCacheConfig::setMaxGpuTotalBytes)
166170 .def (" fill_empty_fields_from_runtime_defaults" , &tle::KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults)
167171 .def (" __getstate__" , kvCacheConfigGetstate)
168172 .def (" __setstate__" , kvCacheConfigSetstate);
0 commit comments