@@ -1604,16 +1604,16 @@ def previous_seq_slots_device():
 
         # The order of requests in a batch: [context requests, generation requests]
         # generation requests: ['requests that do not have previous batch', 'requests that already have previous batch', 'dummy requests']
-        # 1) 'requests that do not have previous batch': disable overlap scheduler or the first step in the generation server of disaggregated serving.
-        # 2) 'requests that already have previous batch': previous iteration's requests.
-        # 3) 'dummy requests': pad dummy requests for CUDA graph or attention dp.
+        # 1) 'requests that do not have previous batch': the overlap scheduler is disabled, or this is the first step in the generation server of disaggregated serving.
+        # 2) 'requests that already have previous batch': requests carried over from the previous iteration.
+        # 3) 'dummy requests': dummy requests padded in for CUDA graph or attention DP.
         # Therefore, both self.previous_pos_id_offsets_cuda and self.previous_kv_lens_offsets_cuda also have 3 segments.
-        # For 1) 'requests that do not have previous batch': disable overlap scheduler or the first step in the generation server of disaggregated serving.
+        # For 1) 'requests that do not have previous batch':
         # Set these requests' previous_pos_id_offsets and previous_kv_lens_offsets to '0' to skip the value changes in _preprocess_inputs.
         # Already set to '0' during initialization.
-        # For 2) 'requests that already have previous batch': enable overlap scheduler.
+        # For 2) 'requests that already have previous batch' (overlap scheduler enabled):
         # Set their previous_pos_id_offsets and previous_kv_lens_offsets according to new_tokens_lens_device and kv_len_offsets_device.
-        # For 3) 'dummy requests': pad dummy requests for CUDA graph or attention dp.
+        # For 3) 'dummy requests':
         # Already set to '0' during initialization.
 
         num_extend_reqeust_wo_dummy = len(extend_requests) - len(
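
To make the three-segment bookkeeping above concrete, the sketch below shows the two offset buffers being maintained. The buffer and tensor names follow the comment block; the capacity, slot range, and numeric values are hypothetical stand-ins, not the engine's real shapes.

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
max_num_requests = 8  # hypothetical batch capacity

# Zero-initialization already covers segments 1) and 3): requests without a
# previous batch and padded dummy requests keep offset 0, so
# _preprocess_inputs leaves their inputs untouched.
previous_pos_id_offsets_cuda = torch.zeros(
    max_num_requests, dtype=torch.int32, device=device)
previous_kv_lens_offsets_cuda = torch.zeros(
    max_num_requests, dtype=torch.int32, device=device)

# Segment 2): requests carried over from the previous iteration when the
# overlap scheduler is enabled. Suppose slots 2..4 hold such requests and
# the previous step produced these per-request values (stand-in numbers):
start, end = 2, 5
new_tokens_lens_device = torch.tensor([1, 2, 1], dtype=torch.int32, device=device)
kv_len_offsets_device = torch.tensor([0, 1, 0], dtype=torch.int32, device=device)

# Only the middle segment is overwritten; the other two segments stay at 0.
previous_pos_id_offsets_cuda[start:end] = new_tokens_lens_device
previous_kv_lens_offsets_cuda[start:end] = kv_len_offsets_device
```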
@@ -2401,8 +2401,7 @@ def capture_postprocess_fn(inputs: Dict[str, Any]):
             outputs = self.cuda_graph_runner.replay(key, inputs)
         else:
             with MoeLoadBalancerIterContext(moe_load_balancer):
-                outputs = self._forward_step(
-                    inputs, gather_ids, gather_context_logits)
+                outputs = self.cuda_graph_runner.replay(key, inputs)
 
         if self.forward_pass_callable is not None:
             self.forward_pass_callable()
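
For context, here is a condensed, self-contained sketch of the capture-vs-replay dispatch this hunk touches. Only cuda_graph_runner.replay and _forward_step appear in the diff; needs_capture and capture are assumed helper methods, MoeLoadBalancerIterContext is omitted for brevity, and the toy runner stands in for the real CUDA-graph machinery.

```python
from typing import Any, Callable, Dict

class SketchGraphRunner:
    """Hypothetical stand-in for cuda_graph_runner: caches one 'graph'
    (here just a callable) per batch-shape key."""

    def __init__(self) -> None:
        self._graphs: Dict[Any, Callable[[Dict[str, Any]], Any]] = {}

    def needs_capture(self, key: Any) -> bool:
        return key not in self._graphs

    def capture(self, key: Any, forward_fn: Callable) -> None:
        # The real runner would record forward_fn with torch.cuda.graph();
        # this sketch just remembers the callable.
        self._graphs[key] = forward_fn

    def replay(self, key: Any, inputs: Dict[str, Any]) -> Any:
        # The real runner launches the captured graph on static buffers;
        # this sketch invokes the recorded callable.
        return self._graphs[key](inputs)

def forward(runner: SketchGraphRunner, key: Any, inputs: Dict[str, Any],
            forward_step: Callable[[Dict[str, Any]], Any]) -> Any:
    # Mirrors the two branches in the hunk: capture-then-replay the first
    # time a batch-shape key is seen, plain replay afterwards. Before this
    # change, the second branch re-ran the eager forward_step instead.
    if runner.needs_capture(key):
        runner.capture(key, forward_step)
        return runner.replay(key, inputs)
    return runner.replay(key, inputs)

def toy_forward_step(inputs: Dict[str, Any]) -> Any:
    return [t + 1 for t in inputs["tokens"]]

runner = SketchGraphRunner()
print(forward(runner, ("decode", 2), {"tokens": [1, 2]}, toy_forward_step))  # captures, then replays
print(forward(runner, ("decode", 2), {"tokens": [5, 6]}, toy_forward_step))  # replays only
```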