Skip to content

Commit 5ac8bfa

Browse files
committed
fix
1 parent b99deda commit 5ac8bfa

File tree

5 files changed

+9
-7
lines changed

5 files changed

+9
-7
lines changed

lightllm/common/basemodel/infer_struct.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,8 @@ def prefill_dp_balance(self, input_ids: torch.Tensor):
147147
assert self.is_prefill
148148
import torch.distributed as dist
149149

150+
self.need_dp_prefill_balance = True
151+
150152
args = get_env_start_args()
151153

152154
dp_input_lens = torch.empty(size=(args.dp,), device="cuda", dtype=torch.int32)

lightllm/distributed/communication_op.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def __init__(self):
6363
self.custom_gather = None
6464
self.dp_world_size = get_dp_world_size()
6565
self.device_group = create_new_group_for_current_dp("nccl")
66-
if get_env_start_args().dp > 1 and get_env_start_args().enable_dp_prefill_balance:
66+
if get_env_start_args().enable_dp_prefill_balance:
6767
self.dp_prefill_balance_group = create_dp_special_inter_group("nccl")
6868
else:
6969
self.dp_prefill_balance_group = None

lightllm/models/llama/layer_infer/post_layer_infer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def tpsp_token_forward(
117117
# len(infer_state.position_sin) 获取真实输入长度
118118
input_embdings = gather_data[0 : len(infer_state.position_sin)]
119119

120-
if infer_state.is_prefill and get_env_start_args().enable_dp_prefill_balance:
120+
if infer_state.need_dp_prefill_balance:
121121
input_embdings = infer_state._all_to_all_unbalance_get(data=input_embdings)
122122

123123
return self.token_forward(input_embdings=input_embdings, infer_state=infer_state, layer_weight=layer_weight)
@@ -134,7 +134,7 @@ def overlap_tpsp_token_forward(
134134
infer_state.hook()
135135
infer_state.hook = None
136136

137-
if infer_state.is_prefill and get_env_start_args().enable_dp_prefill_balance:
137+
if infer_state.need_dp_prefill_balance:
138138
input_embdings = infer_state._all_to_all_unbalance_get(data=input_embdings)
139139

140140
logics = self.tpsp_token_forward(input_embdings, infer_state, layer_weight=layer_weight)
@@ -143,7 +143,7 @@ def overlap_tpsp_token_forward(
143143
infer_state1.hook()
144144
infer_state1.hook = None
145145

146-
if infer_state1.is_prefill and get_env_start_args().enable_dp_prefill_balance:
146+
if infer_state1.need_dp_prefill_balance:
147147
input_embdings1 = infer_state1._all_to_all_unbalance_get(data=input_embdings1)
148148

149149
logics1 = self.tpsp_token_forward(input_embdings1, infer_state1, layer_weight=layer_weight)

lightllm/models/llama/layer_infer/transformer_layer_infer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ def _tpsp_get_qkv(
228228
infer_state.position_sin,
229229
)
230230

231-
if infer_state.is_prefill and get_env_start_args().enable_dp_prefill_balance:
231+
if infer_state.need_dp_prefill_balance:
232232
q = infer_state._all_to_all_unbalance_get(data=q)
233233
cache_kv = infer_state._all_to_all_unbalance_get(data=cache_kv)
234234

@@ -401,7 +401,7 @@ def _get_o(
401401
def _tpsp_get_o(
402402
self, input, infer_state: LlamaInferStateInfo, layer_weight: LlamaTransformerLayerWeight
403403
) -> torch.Tensor:
404-
if infer_state.is_prefill and get_env_start_args().enable_dp_prefill_balance:
404+
if infer_state.need_dp_prefill_balance:
405405
input = infer_state._all_to_all_balance_get(data=input)
406406

407407
input = input.view(-1, self.tp_o_head_num_ * self.head_dim_)

lightllm/server/api_start.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def normal_or_p_d_start(args):
140140
assert args.router_token_ratio == 0.0
141141

142142
if args.enable_dp_prefill_balance:
143-
assert args.enable_tpsp_mix_mode and args.dp > 1, "need set --enable_tpsp_mix_mode firstly"
143+
assert args.enable_tpsp_mix_mode and args.dp > 1, "need set --enable_tpsp_mix_mode firstly and --dp > 1"
144144

145145
# mtp params check
146146
if args.mtp_mode is not None:

0 commit comments

Comments (0)