PaddlePaddle · RuohengMa · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/custom_ops/xpu_ops/src/ops/get_infer_param.cc b/custom_ops/xpu_ops/src/ops/get_infer_param.cc
diff --git a/custom_ops/xpu_ops/src/ops/pybind/pybind.cc b/custom_ops/xpu_ops/src/ops/pybind/pybind.cc
@@ -478,6 +478,28 @@ std::vector<paddle::Tensor> GetInferParam(
     const paddle::Tensor& seq_lens_decoder,
     const paddle::Tensor& seq_lens_this_time,
     const paddle::Tensor& block_tables,
+    paddle::Tensor& encoder_batch_map,
+    paddle::Tensor& decoder_batch_map,
+    paddle::Tensor& encoder_batch_idx,
+    paddle::Tensor& decoder_batch_idx,
+    paddle::Tensor& encoder_seq_lod,
+    paddle::Tensor& decoder_seq_lod,
+    paddle::Tensor& encoder_kv_lod,
+    paddle::Tensor& prefix_len,
+    paddle::Tensor& decoder_context_len,
+    paddle::Tensor& decoder_context_len_cache,
+    paddle::Tensor& prefix_block_tables,
+    paddle::Tensor& encoder_batch_map_cpu,
+    paddle::Tensor& decoder_batch_map_cpu,
+    paddle::Tensor& encoder_batch_idx_cpu,
+    paddle::Tensor& decoder_batch_idx_cpu,
+    paddle::Tensor& encoder_seq_lod_cpu,
+    paddle::Tensor& decoder_seq_lod_cpu,
+    paddle::Tensor& encoder_kv_lod_cpu,
+    paddle::Tensor& prefix_len_cpu,
+    paddle::Tensor& decoder_context_len_cpu,
+    paddle::Tensor& decoder_context_len_cache_cpu,
+    paddle::Tensor& len_info_cpu,
     int block_size,
     int num_speculative_tokens);
 
@@ -1052,6 +1074,28 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         py::arg("seq_lens_decoder"),
         py::arg("seq_lens_this_time"),
         py::arg("block_tables"),
+        py::arg("encoder_batch_map"),
+        py::arg("decoder_batch_map"),
+        py::arg("encoder_batch_idx"),
+        py::arg("decoder_batch_idx"),
+        py::arg("encoder_seq_lod"),
+        py::arg("decoder_seq_lod"),
+        py::arg("encoder_kv_lod"),
+        py::arg("prefix_len"),
+        py::arg("decoder_context_len"),
+        py::arg("decoder_context_len_cache"),
+        py::arg("prefix_block_tables"),
+        py::arg("encoder_batch_map_cpu"),
+        py::arg("decoder_batch_map_cpu"),
+        py::arg("encoder_batch_idx_cpu"),
+        py::arg("decoder_batch_idx_cpu"),
+        py::arg("encoder_seq_lod_cpu"),
+        py::arg("decoder_seq_lod_cpu"),
+        py::arg("encoder_kv_lod_cpu"),
+        py::arg("prefix_len_cpu"),
+        py::arg("decoder_context_len_cpu"),
+        py::arg("decoder_context_len_cache_cpu"),
+        py::arg("len_info_cpu"),
         py::arg("block_size"),
         py::arg("num_speculative_tokens"),
         "Get infer parameters for block attention in XPU");

diff --git a/custom_ops/xpu_ops/test/test_adjust_batch_and_gather_next_token.py b/custom_ops/xpu_ops/test/test_adjust_batch_and_gather_next_token.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 import paddle
+from utils import init_inplace_tensor
 
 from fastdeploy.model_executor.ops.xpu import (
     adjust_batch,
@@ -33,35 +34,87 @@ def _run_test_base(seq_lens_this_time_data, is_speculative):
     seq_lens_this_time = paddle.to_tensor(seq_lens_this_time_data, dtype="int32")
 
     bsz = seq_lens_this_time.shape[0]
-    cum_offsets = paddle.zeros(bsz, dtype="int32")
     block_table = paddle.arange(0, 56, dtype="int32").reshape((bsz, 8))
 
-    infer_params = get_infer_param(seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, block_table, 64)
-
     (
         encoder_batch_map,
         decoder_batch_map,
         encoder_batch_idx,
         decoder_batch_idx,
         encoder_seq_lod,
         decoder_seq_lod,
+        encoder_kv_lod,
+        prefix_len,
+        decoder_context_len,
+        decoder_context_len_cache,
+        prefix_block_tables,
+        encoder_batch_map_cpu,
+        decoder_batch_map_cpu,
+        encoder_batch_idx_cpu,
+        decoder_batch_idx_cpu,
+        encoder_seq_lod_cpu,
+        decoder_seq_lod_cpu,
+        encoder_kv_lod_cpu,
+        prefix_len_cpu,
+        decoder_context_len_cpu,
+        decoder_context_len_cache_cpu,
+        len_info_cpu,
+    ) = init_inplace_tensor(seq_lens_encoder.shape[0], block_table.shape)
+    (
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
         _,
         _,
         _,
         _,
         _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        slot_mapping_enc,
+        slot_mapping_dec,
+    ) = get_infer_param(
+        seq_lens_encoder,
+        seq_lens_decoder,
+        seq_lens_this_time,
+        block_table,
+        encoder_batch_map,
+        decoder_batch_map,
+        encoder_batch_idx,
+        decoder_batch_idx,
+        encoder_seq_lod,
+        decoder_seq_lod,
+        encoder_kv_lod,
+        prefix_len,
+        decoder_context_len,
+        decoder_context_len_cache,
+        prefix_block_tables,
         encoder_batch_map_cpu,
         decoder_batch_map_cpu,
         encoder_batch_idx_cpu,
         decoder_batch_idx_cpu,
         encoder_seq_lod_cpu,
         decoder_seq_lod_cpu,
-        _,
-        _,
-        _,
-        _,
+        encoder_kv_lod_cpu,
+        prefix_len_cpu,
+        decoder_context_len_cpu,
+        decoder_context_len_cache_cpu,
         len_info_cpu,
-    ) = infer_params
+        64,
+        0,
+    )
 
     token_num = seq_lens_this_time.sum().cpu().item()
     hidden_dim = 8192
@@ -72,7 +125,6 @@ def _run_test_base(seq_lens_this_time_data, is_speculative):
     # 测试 adjust_batch
     adjusted_output = adjust_batch(
         input_tensor,
-        cum_offsets,
         encoder_seq_lod,
         decoder_seq_lod,
         encoder_batch_idx,
@@ -88,7 +140,6 @@ def _run_test_base(seq_lens_this_time_data, is_speculative):
 
     adjusted_output_cpu = adjust_batch(
         input_tensor.cpu(),
-        cum_offsets,
         encoder_seq_lod,
         decoder_seq_lod,
         encoder_batch_idx,
@@ -110,7 +161,6 @@ def _run_test_base(seq_lens_this_time_data, is_speculative):
     # 测试 gather_next_token
     gather_out = gather_next_token(
         adjusted_output,
-        cum_offsets,
         encoder_seq_lod,
         decoder_seq_lod,
         encoder_batch_map,
@@ -126,7 +176,6 @@ def _run_test_base(seq_lens_this_time_data, is_speculative):
 
     gather_out_cpu = gather_next_token(
         adjusted_output.cpu(),
-        cum_offsets,
         encoder_seq_lod,
         decoder_seq_lod,
         encoder_batch_map,

diff --git a/custom_ops/xpu_ops/test/test_adjust_batch_and_recover_batch_sequence.py b/custom_ops/xpu_ops/test/test_adjust_batch_and_recover_batch_sequence.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 import paddle
+from utils import init_inplace_tensor
 
 from fastdeploy.model_executor.ops.xpu import (
     adjust_batch,
@@ -33,32 +34,85 @@ def _run_test_base(seq_lens_this_time_data):
     cum_offsets = paddle.zeros(bsz, dtype="int32")
     block_table = paddle.arange(0, 56, dtype="int32").reshape((bsz, 8))
 
-    infer_params = get_infer_param(seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, block_table, 64)
-
     (
         encoder_batch_map,
         decoder_batch_map,
         encoder_batch_idx,
         decoder_batch_idx,
         encoder_seq_lod,
         decoder_seq_lod,
+        encoder_kv_lod,
+        prefix_len,
+        decoder_context_len,
+        decoder_context_len_cache,
+        prefix_block_tables,
+        encoder_batch_map_cpu,
+        decoder_batch_map_cpu,
+        encoder_batch_idx_cpu,
+        decoder_batch_idx_cpu,
+        encoder_seq_lod_cpu,
+        decoder_seq_lod_cpu,
+        encoder_kv_lod_cpu,
+        prefix_len_cpu,
+        decoder_context_len_cpu,
+        decoder_context_len_cache_cpu,
+        len_info_cpu,
+    ) = init_inplace_tensor(seq_lens_encoder.shape[0], block_table.shape)
+    (
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
         _,
         _,
         _,
         _,
         _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        _,
+        slot_mapping_enc,
+        slot_mapping_dec,
+    ) = get_infer_param(
+        seq_lens_encoder,
+        seq_lens_decoder,
+        seq_lens_this_time,
+        block_table,
+        encoder_batch_map,
+        decoder_batch_map,
+        encoder_batch_idx,
+        decoder_batch_idx,
+        encoder_seq_lod,
+        decoder_seq_lod,
+        encoder_kv_lod,
+        prefix_len,
+        decoder_context_len,
+        decoder_context_len_cache,
+        prefix_block_tables,
         encoder_batch_map_cpu,
         decoder_batch_map_cpu,
         encoder_batch_idx_cpu,
         decoder_batch_idx_cpu,
         encoder_seq_lod_cpu,
         decoder_seq_lod_cpu,
-        _,
-        _,
-        _,
-        _,
+        encoder_kv_lod_cpu,
+        prefix_len_cpu,
+        decoder_context_len_cpu,
+        decoder_context_len_cache_cpu,
         len_info_cpu,
-    ) = infer_params
+        64,
+        0,
+    )
 
     token_num = seq_lens_this_time.sum().cpu().item()
     hidden_dim = 8192
@@ -68,7 +122,6 @@ def _run_test_base(seq_lens_this_time_data):
     # test adjust_batch
     adjusted_output = adjust_batch(
         input_tensor,
-        cum_offsets,
         encoder_seq_lod,
         decoder_seq_lod,
         encoder_batch_idx,
@@ -84,7 +137,6 @@ def _run_test_base(seq_lens_this_time_data):
 
     adjusted_output_cpu = adjust_batch(
         input_tensor.cpu(),
-        cum_offsets,
         encoder_seq_lod,
         decoder_seq_lod,
         encoder_batch_idx,