We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent cde18c1 commit c37924fCopy full SHA for c37924f
tensorrt_llm/_torch/attention_backend/sparse/dsa.py
@@ -1528,6 +1528,11 @@ def get_indexer_k_cache_buffers(self, layer_idx: int):
1528
return self.indexer_k_cache_pool_per_layer[layer_offset].view(
1529
self.num_blocks, block_size, 1, per_token_size)
1530
1531
+ def shutdown(self):
1532
+ # Clear Python references BEFORE C++ frees the underlying CUDA buffers
1533
+ self.indexer_k_cache_pool_per_layer = []
1534
+ super().shutdown()
1535
+
1536
@staticmethod
1537
def get_cache_size_per_token(model_config: ModelConfig, mapping: Mapping,
1538
**kwargs):
0 commit comments