Commit 50436a1

more cleanup
1 parent 6e5fe5d commit 50436a1

11 files changed: +24 -304 lines changed

Lines changed: 7 additions & 1 deletion
@@ -1 +1,7 @@
-["Global warming is the long term rise in Earth temperature caused by greenhouse gases from human activity, burning fossil fuels, and deforestation. It leads to melting ice, rising seas, and extreme weather that threaten ecosystems, wildlife, and people. Urgent global action is "]
+[
+    "What is the capital of Germany?",
+    "Explain the theory of relativity.",
+    "What are the benefits of using asyncio in Python?",
+    "Describe the process of photosynthesis.",
+    "How does a blockchain work?"
+]

tensorrt_llm/_torch/distributed/communicator.py

Lines changed: 7 additions & 12 deletions
@@ -347,8 +347,6 @@ def __init__(self, mapping: Mapping):
         mapping_with_helix = None
         if self.mapping.cp_size > 1:
             print(f"[MPIDist::__init__] Repurposing CP ranks to TP for Helix.")
-            # TODO: More principled thing to do would be to update mapping to account for
-            # repurposing of CP ranks to TP.
             mapping_with_helix = copy.deepcopy(self.mapping)
             mapping_without_helix = Mapping(
                 world_size=self.mapping.world_size,
@@ -401,15 +399,20 @@ def recv_object(self, src, tag=0):
         return mpi_recv_object(src, tag)

     def create_tp_comm(self):
-        print(f"[MPIDist::create_tp_comm] rank: {self.mapping.rank}, tp_rank: {self.mapping.tp_rank}, tp_group: {self.mapping.tp_group}")
         new_group = mpi_comm().group.Incl(self.mapping.tp_group)
         self.tp_comm = mpi_comm().Create_group(new_group)

     def create_pp_comm(self):
-        print(f"[MPIDist::create_pp_comm] rank: {self.mapping.rank}, pp_rank: {self.mapping.pp_rank}, pp_group: {self.mapping.pp_group}")
         new_group = mpi_comm().group.Incl(self.mapping.pp_group)
         self.pp_comm = mpi_comm().Create_group(new_group)

+    def create_cp_comm(self):
+        new_group = mpi_comm().group.Incl(self.mapping.cp_group)
+        self.cp_comm = mpi_comm().Create_group(new_group)
+
+    def cp_allgather(self, obj):
+        return self.cp_comm.allgather(obj)
+
     def tp_allgather(self, obj):
         return self.tp_comm.allgather(obj)

@@ -430,14 +433,6 @@ def pp_gather(self, obj):
     def pp_broadcast(self, obj, root=0):
         return self.pp_comm.bcast(obj, root)

-    def create_cp_comm(self):
-        print(f"[MPIDist::create_cp_comm] rank: {self.mapping.rank}, cp_rank: {self.mapping.cp_rank}, cp_group: {self.mapping.cp_group}")
-        new_group = mpi_comm().group.Incl(self.mapping.cp_group)
-        self.cp_comm = mpi_comm().Create_group(new_group)
-
-    def cp_allgather(self, obj):
-        return self.cp_comm.allgather(obj)
-

 class MultiHandleWrapper:
     """

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 0 additions & 172 deletions
@@ -1577,7 +1577,6 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model,
                                                         PretrainedConfig]):

     def __init__(self, model_config: ModelConfig[PretrainedConfig]):
-        ###############################################################################
         self.mapping_with_cp = None
         # Note: Currently the usage of mapping is all over the place making its usage brittle
         # in this file. As a temporary WAR, we hold on to an original copy of mapping when CP
@@ -1606,7 +1605,6 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]):
                 moe_ep_size=model_config.mapping.moe_ep_size,
                 enable_attention_dp=model_config.mapping.enable_attention_dp)
             model_config._frozen = True
-        ###############################################################################

         # Rename some keys of quant_config_dict to support legacy checkpoints
         if model_config.quant_config_dict is not None:
@@ -1656,7 +1654,6 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]):
             self.epilogue.extend(self.draft_model.mtp_layers)
             self.epilogue.append(self.spec_worker)

-        ###############################################################################
         # Undo any manipulations done to mapping.
         if self.mapping_with_cp is not None:
             print(
@@ -1665,7 +1662,6 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]):
             model_config._frozen = False
             model_config.mapping = self.mapping_with_cp
             model_config._frozen = True
-        ###############################################################################

     def forward(
         self,
@@ -1677,33 +1673,6 @@ def forward(
         return_context_logits: bool = False,
         **kwargs,
     ) -> torch.Tensor:
-        # with use_torch_printoptions(sci_mode=False,
-        #                             threshold=16,
-        #                             edgeitems=2,
-        #                             linewidth=120):
-        #     print(
-        #         f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}] input_ids: {input_ids}"
-        #     )
-        #     print(
-        #         f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}] position_ids: {position_ids}"
-        #     )
-        #     print(
-        #         f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}] helix_is_inactive_rank: {attn_metadata.helix_is_inactive_rank}"
-        #     )
-        #     print(
-        #         f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}] kv_cache_params.num_cached_tokens_per_seq: {attn_metadata.kv_cache_params.num_cached_tokens_per_seq}"
-        #     )
-        #     print(
-        #         f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}] kv_lens_cuda: {attn_metadata.kv_lens_cuda}"
-        #     )
-        #     assert attn_metadata.kv_cache_manager.tokens_per_block == 32
-        #     block_ids_per_seq = attn_metadata.kv_cache_manager.get_batch_cache_indices(
-        #         attn_metadata.request_ids)
-        #     for request_id, block_ids in zip(attn_metadata.request_ids,
-        #                                      block_ids_per_seq):
-        #         print(
-        #             f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}] request_id: {request_id}, block_ids: {torch.tensor(block_ids)}"
-        #         )
         return super().forward(attn_metadata=attn_metadata,
                                input_ids=input_ids,
                                position_ids=position_ids,
@@ -1712,147 +1681,6 @@ def forward(
                                return_context_logits=return_context_logits,
                                **kwargs)

-    def _save_block_information_to_disk(self, attn_metadata: AttentionMetadata,
-                                        position_ids: torch.Tensor):
-        """Save KV cache block information to disk using safetensors format."""
-        import json
-        from pathlib import Path
-
-        import safetensors.torch
-
-        # Only save on rank 0 in prefill mode.
-        if (attn_metadata.helix_is_inactive_rank is not None
-                or self.model_config.mapping.rank != 0
-                or len(position_ids[0]) != 52):
-            return
-
-        # Create directory for saving block data
-        save_dir = Path(
-            "/home/bbuddharaju/scratch/TensorRT-LLM_MK/prefill_helix_all_layers"
-        )
-        save_dir.mkdir(exist_ok=True)
-
-        block_ids_per_seq = attn_metadata.kv_cache_manager.get_batch_cache_indices(
-            attn_metadata.request_ids)
-        for request_id, block_ids in zip(attn_metadata.request_ids,
-                                         block_ids_per_seq):
-            # Save blocks for requests with exactly 2 blocks.
-            if len(block_ids) == 2:
-                request_save_dir = save_dir / f"request_{request_id}"
-                request_save_dir.mkdir(exist_ok=True)
-
-                # Iterate through all layers and save KV cache buffers for each layer.
-                for layer_idx in range(self.config.num_hidden_layers):
-                    # Get KV cache buffers for this layer.
-                    kv_buffer = attn_metadata.kv_cache_manager.get_buffers(
-                        layer_idx)
-
-                    # Save each block separately for this layer.
-                    for i, block_id in enumerate(block_ids):
-                        # Get block data from KV cache for this layer.
-                        request_kv_data = kv_buffer[block_id]
-
-                        # Create separate data dictionary for this block.
-                        block_data = {"block_data": request_kv_data.cpu()}
-
-                        # Create separate metadata for this block, including layer information.
-                        block_metadata = {
-                            "request_id": int(request_id),
-                            "layer_idx": int(layer_idx),
-                            "block_id": int(block_id),
-                            "block_index": i,
-                            "block_shape": list(request_kv_data.shape),
-                            "tokens_per_block":
-                            attn_metadata.kv_cache_manager.tokens_per_block,
-                            "rank": self.model_config.mapping.rank,
-                        }
-
-                        # Save each block's data separately using safetensors, including layer in filename.
-                        block_safetensors_path = request_save_dir / f"layer_{layer_idx}_block_id_{block_id}_rank_{self.model_config.mapping.rank}.safetensors"
-                        safetensors.torch.save_file(block_data,
-                                                    str(block_safetensors_path))

-                        # Save each block's metadata separately as JSON, including layer in filename.
-                        block_metadata_path = request_save_dir / f"layer_{layer_idx}_block_id_{block_id}_rank_{self.model_config.mapping.rank}_metadata.json"
-                        with open(block_metadata_path, 'w') as f:
-                            json.dump(block_metadata, f, indent=2)
-
-                        print(
-                            f"[DeepseekV3ForCausalLM::_save_block_information_to_disk][rank {self.model_config.mapping.rank}] "
-                            f"Saved layer {layer_idx} block (ID: {block_id}) for request {request_id}, shape: {request_kv_data.shape} "
-                            f"to {block_safetensors_path.name}")
-
-                print(
-                    f"[DeepseekV3ForCausalLM::_save_block_information_to_disk][rank {self.model_config.mapping.rank}] "
-                    f"Saved block information for request {request_id} to {request_save_dir}"
-                )
-
-    def _read_block_information_from_disk(self,
-                                          attn_metadata: AttentionMetadata,
-                                          position_ids: torch.Tensor):
-        """Read KV cache block information from disk using safetensors format."""
-        from pathlib import Path
-
-        import safetensors.torch
-
-        # Early return in prefill mode.
-        if (attn_metadata.helix_is_inactive_rank is None):
-            return
-
-        # Early return if this isn't the first decode step.
-        if (position_ids[0][0].item() != 52):
-            print(
-                f"[DeepseekV3ForCausalLM::_save_block_information_to_disk][rank {self.model_config.mapping.rank}] "
-                f"Early return in decode mode because this isn't the first decode step {position_ids[0][0].item()}"
-            )
-            return
-
-        block_ids_per_seq = attn_metadata.kv_cache_manager.get_batch_cache_indices(
-            attn_metadata.request_ids)
-        for request_id, block_ids in zip(attn_metadata.request_ids,
-                                         block_ids_per_seq):
-
-            # Read blocks for requests with exactly 1 block.
-            assert len(block_ids) == 1
-
-            # Read KV cache for all layers.
-            for layer_idx in range(self.config.num_hidden_layers):
-                # Determine file path based on rank and layer.
-                if self.model_config.mapping.rank == 0:
-                    # Inactive rank.
-                    read_file = Path(
-                        f"/home/bbuddharaju/scratch/TensorRT-LLM_MK/prefill_helix_all_layers/request_2048/layer_{layer_idx}_block_id_257_rank_0.safetensors"
-                    )
-                else:
-                    # Active rank.
-                    read_file = Path(
-                        f"/home/bbuddharaju/scratch/TensorRT-LLM_MK/prefill_helix_all_layers/request_2048/layer_{layer_idx}_block_id_258_rank_0.safetensors"
-                    )
-
-                # Get KV cache buffers for this layer.
-                kv_buffer = attn_metadata.kv_cache_manager.get_buffers(
-                    layer_idx)
-
-                # Get block data from KV cache.
-                request_kv_data = kv_buffer[block_ids[0]]
-
-                # Load block data from disk.
-                loaded_data = safetensors.torch.load_file(read_file)
-                block_read_data = loaded_data['block_data'].to(
-                    request_kv_data.device)
-
-                # Copy block data to KV cache.
-                request_kv_data.copy_(block_read_data)
-
-                print(
-                    f"[DeepseekV3ForCausalLM::_read_block_information_from_disk][rank {self.model_config.mapping.rank}] "
-                    f"Layer {layer_idx}: request_kv_data: {request_kv_data}")
-
-            print(
-                f"[DeepseekV3ForCausalLM::_read_block_information_from_disk][rank {self.model_config.mapping.rank}] "
-                f"Read block data for request {request_id}, layer {layer_idx}, shape: {block_read_data.shape} "
-                f"from {read_file.name}")
-
     def load_weights(self, weights: Dict):
         weight_loader = DeepseekV3WeightLoader(self)
         weight_loader.load_weights(weights)

tensorrt_llm/_torch/modules/attention.py

Lines changed: 2 additions & 4 deletions
@@ -709,6 +709,7 @@ def __init__(
         self.hidden_size = hidden_size
         self.num_heads = num_attention_heads
         self.num_key_value_heads = num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
         assert self.num_heads == self.num_key_value_heads, "num_heads must be equal to num_key_value_heads"
         self.qk_nope_head_dim = qk_nope_head_dim
         self.qk_rope_head_dim = qk_rope_head_dim
@@ -761,7 +762,7 @@ def __init__(
         if self.mapping.has_cp_ulysses():
             raise NotImplementedError("MLA doesn't support CP Ulyssees yet")
         if self.mapping.cp_size > 1:
-            assert self.mapping.cp_config['cp_type'] == CpType.HELIX
+            assert self.mapping.cp_config['cp_type'] == CpType.HELIX, f"CP type must be HELIX for MLA, but got {self.mapping.cp_config['cp_type']}."

         mapping = Mapping(
             world_size=tp_size * pp_size * cp_size,
@@ -1093,9 +1094,6 @@ def _attn_forward_gen(self, attn_backend: AttentionBackend, q: torch.Tensor,

     def create_output(self, hidden_states: torch.Tensor, num_contexts: int):
         num_tokens = hidden_states.shape[0]
-        # note: for testing Helix parallelism, we ensure that the output is
-        # large enough for the context phase, but we then cut it again in
-        # `forward_context`
         hidden_size = self.o_proj.in_features
         if self.enable_unit_test and num_contexts > 0:
             # note: for testing Helix parallelism, we ensure that the output is
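A small illustrative sketch (assumptions, not repo code: the helper name and the example head counts are made up) of the quantity stored by the new num_key_value_groups attribute above: with grouped-query attention, each KV head is shared by num_heads // num_key_value_heads query heads, and under the MLA assertion that the two counts match, the group size is always 1.

# Hypothetical helper mirroring the arithmetic of the added attribute.
def kv_groups(num_heads: int, num_key_value_heads: int) -> int:
    assert num_heads % num_key_value_heads == 0, "head counts must divide evenly"
    return num_heads // num_key_value_heads

assert kv_groups(128, 128) == 1  # MLA case enforced by the assert in __init__
assert kv_groups(32, 8) == 4     # a typical GQA layout, shown for comparison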

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 0 additions & 8 deletions
@@ -313,7 +313,6 @@ def _fetch_and_process_requests(
         new_requests = self._validate_and_filter_requests(new_requests)

         # Attach Python objects to requests
-        # @B: What's the significance of this condition?
         if py_request_objects and (self.dist.tp_size > 1 or self.dist.has_pp
                                    or self.dist.cp_size
                                    > 1) and self.dist.rank > 0:
@@ -693,13 +692,6 @@ def _merge_helix_requests(self, new_requests: list[RequestQueueItem],
                 input_ids_this_rank = input_ids_this_rank[:-padding_len]
                 position_ids_this_rank = position_ids_this_rank[:-padding_len]

-            print(
-                f"[ExecutorRequestQueue::_merge_helix_requests][{curr_cp_rank}]: input_ids_this_rank: {torch.tensor(input_ids_this_rank)}"
-            )
-            print(
-                f"[ExecutorRequestQueue::_merge_helix_requests][{curr_cp_rank}]: position_ids_this_rank: {torch.tensor(position_ids_this_rank)}"
-            )
-            # TODO: Figure how to pass down position_ids_this_rank to LLMRequest.
             req = executor_request_to_llm_request(
                 req_id=req_item.id,
                 executor_request=req_item.request,

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 5 additions & 15 deletions
@@ -562,8 +562,6 @@ def warmup(self, resource_manager: ResourceManager) -> None:
         cp_type = self.mapping.cp_config.get('cp_type', None)
         if cp_type is not None:
             if cp_type in [CpType.ULYSSES, CpType.STAR]:
-                assert False, "cp_type must be HELIX for helix benchmarking."
-            print("[ModelEngine::warmup] EARLY RETURN since cp_type ", cp_type)
             return

         self._run_torch_compile_warmup(resource_manager)
@@ -1059,14 +1057,10 @@ def _init_max_seq_len(self):
             # NOTE: py_executor_creator makes sure that the executor uses this
             # smaller value as its max_seq_len too.
             logger.warning(
-                f"\n*******************************************************\n"
-                f"Specified {self.max_seq_len=} is larger than what the model can support\n"
-                f"({inferred_max_seq_len}). NOT Setting max_seq_len to {inferred_max_seq_len}. "
-                f"ARE YOU SURE ABOUT THIS?\n"
-                f"*******************************************************\n"
+                f"Specified {self.max_seq_len=} is larger than what the model can support "
+                f"({inferred_max_seq_len}). Setting max_seq_len to {inferred_max_seq_len}. "
             )
-            # self.max_seq_len = inferred_max_seq_len
-            pass
+            self.max_seq_len = inferred_max_seq_len

     def _infer_max_seq_len_from_config(self) -> int:

@@ -2134,9 +2128,7 @@ def _prepare_tp_inputs_no_cache(
         attn_metadata.padded_num_tokens = padded_num_tokens if padded_num_tokens != num_tokens else None

         if self.enable_attention_dp:
-            all_rank_num_tokens = self.dist.allgather(
-                attn_metadata.num_tokens)
-            attn_metadata.all_rank_num_tokens = all_rank_num_tokens
+            attn_metadata.all_rank_num_tokens = attn_all_rank_num_tokens

         virtual_num_tokens = num_tokens
         if attn_metadata.padded_num_tokens is not None:
@@ -2195,9 +2187,7 @@ def _prepare_tp_inputs_no_cache(
                 spec_all_rank_num_tokens = [
                     item[1] for item in all_rank_num_tokens
                 ]
-                all_rank_num_seqs = [
-                    item[2] for item in all_rank_num_tokens
-                ]
+                all_rank_num_seqs = [item[2] for item in all_rank_num_tokens]
                 attn_metadata.all_rank_num_tokens = attn_all_rank_num_tokens
                 spec_metadata.all_rank_num_tokens = spec_all_rank_num_tokens
                 spec_metadata.all_rank_num_seqs = all_rank_num_seqs