Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions fastdeploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ def _post_init(self):
self.override_name_from_config()
self.read_from_env()
self.read_model_config()
self.causal = not self.is_bidirectional

@property
def registry(self):
Expand Down Expand Up @@ -616,6 +617,17 @@ def _get_download_model(self, model_name, model_type="default"):
# TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
pass

@property
def is_bidirectional(self) -> bool:
    """Whether the model uses bidirectional (non-causal) attention.

    An explicit ``is_causal`` entry in the model's config.json takes
    precedence; otherwise EB5 (ERNIE5) architectures are treated as
    bidirectional by default.

    Returns:
        bool: True when the model uses a bidirectional attention mask.
    """
    # Honor an explicit `is_causal` flag from config.json first.
    if hasattr(self, "is_causal"):
        return not bool(self.is_causal)
    # Fall back to architecture detection: ERNIE5 models are bidirectional.
    return ErnieArchitectures.is_ernie5_arch(getattr(self, "architectures", []))

def print(self):
"""
Print all configuration information.
Expand Down Expand Up @@ -2176,8 +2188,8 @@ def postprocess(self):
# It will hang when real batch_size < tp_size
self.graph_opt_config.filter_capture_size(tp_size=self.parallel_config.tensor_parallel_size)

if ErnieArchitectures.is_ernie5_arch(self.model_config.architectures):
# ernie5 model not support chunked_mm_input
if self.model_config.is_bidirectional:
# bidirectional mask models (e.g. EB5) do not support chunked_mm_input
self.cache_config.disable_chunked_mm_input = True

self.postprocess_devices_and_ports()
Expand Down
4 changes: 3 additions & 1 deletion fastdeploy/input/ernie4_5_vl_processor/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,9 @@ def get_max_video_tokens(self, seq_len: int) -> int:
min_pixels=self.video_min_pixels,
max_pixels=self.video_max_pixels,
)[1]
num_video_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
num_video_tokens = (self.max_frames * patches_h * patches_w) // (
self.spatial_conv_size**2 * self.temporal_conv_size
)
return min(num_video_tokens, seq_len)

def get_mm_max_tokens_per_item(
Expand Down
98 changes: 98 additions & 0 deletions fastdeploy/input/qwen3_vl_processor/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# limitations under the License.
"""

import math
import pickle
from typing import Any, Dict, List, Optional, Tuple, Union

Expand Down Expand Up @@ -143,6 +144,103 @@ def calc_one(thw):

return calc_one(grid_thw)

@staticmethod
def _closest_factor_pair(n: int):
"""Return (small, large) factor pair of n closest to a square.

Mirrors vllm's ``closest_factor_pair`` in Qwen2VLProcessingInfo.
"""
for d in range(math.isqrt(n), 0, -1):
if n % d == 0:
return d, n // d
return 1, n

@staticmethod
def _max_tokens_for_pixels(max_pixels: int, patch_size: int, merge_size: int) -> int:
"""Compute the maximum post-merge token count achievable under *max_pixels*.

Aligns with vllm's ``get_image_size_with_most_features``.
See qwen_vl_processor/process.py for full description.
"""
unit = patch_size * merge_size
max_seq_len = max_pixels // (unit * unit)
for n in range(max_seq_len, 0, -1):
h, w = DataProcessor._closest_factor_pair(n)
if w / h <= 200:
return n
return 1

def get_max_image_tokens(self, seq_len: int = None) -> int:
    """Maximum token count a single image can contribute.

    Same algorithm as vllm's ``Qwen2VLProcessingInfo.get_max_image_tokens``.

    Args:
        seq_len: Optional upper cap (typically the model's max_model_len).

    Returns:
        Maximum number of image tokens per item.
    """
    proc = self.image_processor
    budget = self._max_tokens_for_pixels(proc.max_pixels, proc.patch_size, proc.merge_size)
    return budget if seq_len is None else min(budget, seq_len)

def get_max_video_tokens(self, seq_len: int = None) -> int:
    """Maximum token count a single video item can contribute.

    For Qwen3-VL, video frames are bounded by the module-level
    ``VIDEO_MAX_PIXELS`` budget rather than the image processor's
    ``max_pixels``. Temporal padding follows the processor / vllm.

    Args:
        seq_len: Optional upper cap (typically the model's max_model_len).

    Returns:
        Maximum number of video tokens per item.
    """
    proc = self.image_processor
    per_group = self._max_tokens_for_pixels(VIDEO_MAX_PIXELS, proc.patch_size, proc.merge_size)
    # vllm padding formula: padded = frames + frames % temporal_patch_size.
    # NOTE(review): this only rounds up to the next multiple when
    # temporal_patch_size == 2 — confirm against the processor.
    t = proc.temporal_patch_size
    grid_t = max((self.max_frames + self.max_frames % t) // t, 1)
    total = grid_t * per_group
    return total if seq_len is None else min(total, seq_len)

def get_mm_max_tokens_per_item(self, seq_len: int = None):
    """Per-modality maximum token counts.

    Matches vllm's ``get_mm_max_tokens_per_item`` interface so encoder
    budgets can be computed without running dummy inputs.

    Args:
        seq_len: Model's maximum sequence length (used as an upper cap).

    Returns:
        Dict mapping modality name ("image" / "video") to max tokens.
    """
    budgets = {}
    budgets["image"] = self.get_max_image_tokens(seq_len)
    budgets["video"] = self.get_max_video_tokens(seq_len)
    return budgets

def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
"""
Convert text with image/video placeholders into model inputs.
Expand Down
15 changes: 15 additions & 0 deletions fastdeploy/input/qwen3_vl_processor/qwen3_vl_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,3 +315,18 @@ def pack_outputs(self, outputs):
outputs["mm_num_token_func"] = self.processor.mm_num_tokens

return outputs

def get_mm_max_tokens_per_item(self, seq_len: int = None):
    """Expose the inner DataProcessor's per-modality token maxima.

    Lets upper layers (common_engine, scheduler) size encoder budgets
    without pushing dummy inputs through the model.

    Args:
        seq_len: Model's maximum sequence length (optional cap).

    Returns:
        Dict[str, int] keyed by modality, e.g. ``{"image": 1280, "video": 8192}``.
    """
    data_processor = self.processor
    return data_processor.get_mm_max_tokens_per_item(seq_len)
108 changes: 108 additions & 0 deletions fastdeploy/input/qwen_vl_processor/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,114 @@ def calc_one(thw):

return calc_one(grid_thw)

@staticmethod
def _closest_factor_pair(n: int):
"""Return (small, large) factor pair of n closest to a square.

Mirrors vllm's ``closest_factor_pair`` in Qwen2VLProcessingInfo.
"""
import math

This comment was marked as outdated.


for d in range(math.isqrt(n), 0, -1):
if n % d == 0:
return d, n // d
return 1, n

@staticmethod
def _max_tokens_for_pixels(max_pixels: int, patch_size: int, merge_size: int) -> int:
"""Compute the maximum post-merge token count achievable under *max_pixels*.

Aligns with vllm's ``get_image_size_with_most_features``:
1. ``max_seq_len = max_pixels // unit^2`` where ``unit = patch * merge``
is the number of *merged* tokens that can fit.
2. Find the largest ``seq_len <= max_seq_len`` whose factor pair has
aspect ratio <= 200 (the Qwen2-VL processor rejects extreme ratios).
3. Token count = ``height_factor * width_factor`` = ``seq_len``.

Using ``closest_factor_pair`` guarantees we never undercount when
``max_pixels`` is not a perfect square of ``unit``.
"""
unit = patch_size * merge_size
max_seq_len = max_pixels // (unit * unit)
for n in range(max_seq_len, 0, -1):
h, w = DataProcessor._closest_factor_pair(n)
if w / h <= 200:
return n
return 1

def get_max_image_tokens(self, seq_len: int = None) -> int:
    """Maximum token count a single image can contribute.

    Uses the same algorithm as vllm's ``get_max_image_tokens`` in
    ``Qwen2VLProcessingInfo``: derive the token budget from ``max_pixels``
    and settle on the best non-extreme aspect ratio.

    Args:
        seq_len: Optional upper cap (model's max_model_len).

    Returns:
        Maximum number of image tokens per item.
    """
    ip = self.image_processor
    budget = self._max_tokens_for_pixels(ip.max_pixels, ip.patch_size, ip.merge_size)
    if seq_len is None:
        return budget
    return min(budget, seq_len)

def get_max_video_tokens(self, seq_len: int = None) -> int:
    """Maximum token count a single video item can contribute.

    Mirrors vllm's ``get_max_video_tokens``: spatial budget identical to
    the image path, and frames padded with vllm's formula
    ``frames + frames % temporal_patch_size`` before dividing into
    temporal groups.

    Args:
        seq_len: Optional upper cap (model's max_model_len).

    Returns:
        Maximum number of video tokens per item.
    """
    ip = self.image_processor
    spatial = self._max_tokens_for_pixels(ip.max_pixels, ip.patch_size, ip.merge_size)
    # NOTE(review): the vllm padding formula only rounds up to the next
    # multiple when temporal_patch_size == 2 — confirm for other sizes.
    t = ip.temporal_patch_size
    padded = self.max_frames + self.max_frames % t
    total = max(padded // t, 1) * spatial
    if seq_len is not None:
        total = min(total, seq_len)
    return total

def get_mm_max_tokens_per_item(self, seq_len: int = None):
    """Per-modality maximum token counts (image / video).

    Same interface as vllm's ``get_mm_max_tokens_per_item`` so that
    FastDeploy can size encoder budgets without running dummy inputs.

    Args:
        seq_len: Model's maximum sequence length (used as an upper cap).

    Returns:
        Dict mapping modality name to its max token count.
    """
    return dict(
        image=self.get_max_image_tokens(seq_len),
        video=self.get_max_video_tokens(seq_len),
    )

def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
"""
Convert text with image/video placeholders into model inputs.
Expand Down
15 changes: 15 additions & 0 deletions fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,3 +315,18 @@ def pack_outputs(self, outputs):

outputs["mm_num_token_func"] = self.processor.mm_num_tokens
return outputs

def get_mm_max_tokens_per_item(self, seq_len: int = None):
    """Delegate per-modality token maxima to the inner DataProcessor.

    Allows upper layers (common_engine, scheduler) to compute encoder
    budgets without running dummy inputs through the model.

    Args:
        seq_len: Model's maximum sequence length (optional cap).

    Returns:
        Dict[str, int], e.g. ``{"image": 1280, "video": 8192}``.
    """
    result = self.processor.get_mm_max_tokens_per_item(seq_len)
    return result
1 change: 0 additions & 1 deletion fastdeploy/scheduler/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,6 @@ def __init__(self, args):
"""
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `max_extra_num_batched_tokens` attribute was removed, but the test code in tests/scheduler/test_scheduler_config.py still uses it (lines 138 and 157), which will cause those tests to fail.

Please update the test code in the same change — remove or replace the affected assertions.

self.name = "local" # "local" for LocalScheduler or "global" for GlobalScheduler
self.max_num_batched_tokens = 2048 # base token_num for text inputs
self.max_extra_num_batched_tokens = 16384 # extra token_num for multimodal inputs
self.max_num_seqs = 34

This comment was marked as outdated.

self.splitwise_role = "mixed"
self.enable_overlap_schedule = False
Expand Down
3 changes: 2 additions & 1 deletion fastdeploy/worker/input_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,7 @@ def __init__(self, fd_config: FDConfig, target_model_input_batch: InputBatch) ->
self.cache_config: CacheConfig = fd_config.cache_config
self.speculative_config: SpeculativeConfig = fd_config.speculative_config
self.enable_pd_reorder: bool = False
self.max_chunk_tokens = fd_config.get_max_chunk_tokens(self.model_config.mm_max_tokens_per_item)

def init_share_inputs(self):
# share with targe model
Expand Down Expand Up @@ -752,7 +753,7 @@ def init_share_inputs(self):

self.target_hidden_states = paddle.full(
[
self.scheduler_config.max_num_batched_tokens + self.scheduler_config.max_extra_num_batched_tokens,
self.max_chunk_tokens,
self.model_config.hidden_size,
],
0,
Expand Down
Loading
Loading