Commit 6ef5901

save intermediate changes

1 parent 21b585b
6 files changed: +10 −10 lines changed

cpp/tensorrt_llm/thop/attentionOp.cpp

Lines changed: 3 additions & 2 deletions

@@ -181,8 +181,9 @@ class Runner : public RunnerBase
     [[maybe_unused]] MlaParams<T> mla_params;
     if (op.isMLAEnabled())
     {
-        TORCH_CHECK(mla_tensor_params.size() == 1,
-            "Expecting 1 tensor for custom MLA tensor params: helix_position_offsets.");
+        std::cerr << "[attentionOp::run] mla_tensor_params.size() = " << mla_tensor_params.size() << std::endl;
+        TORCH_CHECK(mla_tensor_params.size() == 2,
+            "Expecting 2 tensors for custom MLA tensor params: helix_position_offsets and helix_inactive_rank.");
         if (is_context && op.mUseSparseAttention)
         {
             if (latent_cache.has_value())
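With this stricter TORCH_CHECK, any Python caller must now supply both tensors in mla_tensor_params. A minimal sketch of a conforming argument list, assuming the two tensors are passed in this order (the helper name and its use are illustrative, not the repo's actual API):

import torch
from typing import List

def build_mla_tensor_params(helix_position_offsets: torch.Tensor,
                            helix_inactive_rank: torch.Tensor) -> List[torch.Tensor]:
    # Hypothetical helper: the C++ check above now requires exactly two
    # entries, [helix_position_offsets, helix_inactive_rank].
    return [helix_position_offsets, helix_inactive_rank]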

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 0 additions & 1 deletion

@@ -1382,7 +1382,6 @@ def forward(
         softmax_stats_tensor: Optional[torch.Tensor] = None,
         helix_position_offsets: Optional[torch.Tensor] = None,
         enable_attn_nvfp4_output: bool = True,
-        helix_position_offsets: Optional[torch.Tensor] = None,
         output: Optional[torch.Tensor] = None,
         output_sf: Optional[torch.Tensor] = None,
         attention_sinks: Optional[torch.Tensor] = None,
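The deleted line duplicated the helix_position_offsets parameter three lines above it, which Python rejects when the function is compiled, so this removal fixes a SyntaxError rather than changing behavior. A self-contained reproduction:

# Python refuses duplicate parameter names at compile time.
src = "def forward(helix_position_offsets=None, helix_position_offsets=None): pass"
try:
    compile(src, "<example>", "exec")
except SyntaxError as e:
    print(e)  # duplicate argument 'helix_position_offsets' in function definition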

tensorrt_llm/_torch/distributed/communicator.py

Lines changed: 3 additions & 2 deletions

@@ -1,3 +1,4 @@
+import copy
 import math
 import pickle # nosec B403
 from abc import ABC, abstractmethod

@@ -359,8 +360,8 @@ def __init__(self, mapping: Mapping):
             moe_cluster_size=self.mapping.moe_cluster_size,
             moe_tp_size=self.mapping.moe_tp_size,
             moe_ep_size=self.mapping.moe_ep_size,
-            attn_tp_size=self.mapping.attn_tp_size,
-            attn_cp_size=self.mapping.attn_cp_size,
+            # attn_tp_size=self.mapping.attn_tp_size,
+            # attn_cp_size=self.mapping.attn_cp_size,
             enable_attention_dp=self.mapping.enable_attention_dp,
             enable_lm_head_tp_in_adp=self.mapping.enable_lm_head_tp_in_adp)
         self.mapping = mapping_without_helix
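The newly imported copy module is unused in the hunks shown here; one plausible direction, given that the second hunk rebuilds a Mapping by re-listing kwargs, is to clone the existing mapping and reset its helix-related fields instead. A hedged sketch of that pattern (the field reset is an assumption, not what this commit does):

import copy

def strip_helix(mapping):
    # Assumption: clone the Mapping and drop context parallelism, rather than
    # reconstructing it from individual keyword arguments as the diff does.
    mapping_without_helix = copy.deepcopy(mapping)
    mapping_without_helix.cp_size = 1
    return mapping_without_helix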

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 2 additions & 3 deletions

@@ -1584,7 +1584,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]):
         # affected by CP. For other layers, CP ranks are repurposed to TP. This shall be undone
         # at the end of __init__.
         if model_config.mapping.cp_size > 1:
-            logger.info(
+            print(
                 f"[DeepseekV3ForCausalLM::__init__] Repurposing KVP ranks to TP while keeping other details the same."
             )
             self.mapping_with_cp = copy.deepcopy(model_config.mapping)

@@ -1603,7 +1603,6 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]):
                 tp_size=original_tp_size * original_cp_size,
                 pp_size=model_config.mapping.pp_size,
                 moe_ep_size=model_config.mapping.moe_ep_size,
-                auto_parallel=model_config.mapping.auto_parallel,
                 enable_attention_dp=model_config.mapping.enable_attention_dp)
             model_config._frozen = True
             ###############################################################################

@@ -1658,7 +1657,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]):
             ###############################################################################
             # Undo any manipulations done to mapping.
             if self.mapping_with_cp is not None:
-                logger.info(
+                print(
                     f"[DeepseekV3ForCausalLM::__init__] Restoring original mapping."
                 )
                 model_config._frozen = False
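The repurposing itself is plain arithmetic on the parallelism sizes: the context-parallel ranks are folded into tensor parallelism for the non-attention layers. A worked example with assumed sizes:

# Illustrative values; the diff computes tp_size = original_tp_size * original_cp_size.
original_tp_size, original_cp_size = 4, 2
repurposed_tp_size = original_tp_size * original_cp_size
assert repurposed_tp_size == 8  # 8 TP ranks while CP is temporarily folded away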

tensorrt_llm/_torch/modules/attention.py

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 from tensorrt_llm._utils import (get_sm_version, is_sm_100f, nvtx_range,
                                  nvtx_range_debug)
 from tensorrt_llm.logger import logger
-from tensorrt_llm.mapping import Mapping
+from tensorrt_llm.mapping import Mapping, CpType

 from ..attention_backend import (AttentionInputType, AttentionMetadata,
                                  FlashInferAttentionMetadata, TrtllmAttention,
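CpType is presumably imported so attention.py can branch on the context-parallel type; the diff does not show the usage. A hypothetical check (reading the type out of cp_config is an assumption based on the mapping.py change below):

from tensorrt_llm.mapping import Mapping, CpType

def uses_helix(mapping: Mapping) -> bool:
    # Assumption: cp_config carries the "cp_type" key set in Mapping.__init__.
    return mapping.cp_config.get("cp_type") == CpType.HELIX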

tensorrt_llm/mapping.py

Lines changed: 1 addition & 1 deletion

@@ -67,7 +67,7 @@ def __init__(
         #################################################################
         # TODO: Remove this hardcoding.
         if cp_size > 1:
-            assert cp_type == CpType.HELIX
+            cp_config = {"cp_type": CpType.HELIX}
         #################################################################
         moe_world_size = tp_size if cp_type == CpType.ULYSSES else tp_size * cp_size
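Net effect of this last hunk: constructing a Mapping with cp_size > 1 no longer asserts that the CP type is HELIX but overwrites cp_config instead. Note that the moe_world_size line still branches on the separate cp_type value, so a non-HELIX cp_type now passes through silently. A minimal illustration (constructor keywords are assumed from the surrounding diffs):

from tensorrt_llm.mapping import Mapping

# Before this commit: cp_size > 1 with a non-HELIX cp_type raised AssertionError.
# After: cp_config is silently forced to {"cp_type": CpType.HELIX} in __init__.
mapping = Mapping(world_size=4, tp_size=2, cp_size=2)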