@property
def is_bidirectional(self) -> bool:
    """Report whether the model runs with a bidirectional (non-causal) attention mask.

    Resolution order:
    1. An explicit ``is_causal`` attribute (read from the model's
       config.json) wins: ``is_causal: false`` means bidirectional.
    2. Otherwise, ERNIE5 (EB5) architectures are treated as bidirectional,
       since those models all use bidirectional mask attention.
    """
    _unset = object()
    explicit_causal = getattr(self, "is_causal", _unset)
    if explicit_causal is not _unset:
        return not bool(explicit_causal)
    return ErnieArchitectures.is_ernie5_arch(getattr(self, "architectures", []))
EB5) do not support chunked_mm_input self.cache_config.disable_chunked_mm_input = True self.postprocess_devices_and_ports() diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py index ab87c562176..e2f745a5149 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/process.py +++ b/fastdeploy/input/ernie4_5_vl_processor/process.py @@ -778,7 +778,9 @@ def get_max_video_tokens(self, seq_len: int) -> int: min_pixels=self.video_min_pixels, max_pixels=self.video_max_pixels, )[1] - num_video_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size) + num_video_tokens = (self.max_frames * patches_h * patches_w) // ( + self.spatial_conv_size**2 * self.temporal_conv_size + ) return min(num_video_tokens, seq_len) def get_mm_max_tokens_per_item( diff --git a/fastdeploy/input/qwen3_vl_processor/process.py b/fastdeploy/input/qwen3_vl_processor/process.py index 994ec512911..a66ab28e351 100644 --- a/fastdeploy/input/qwen3_vl_processor/process.py +++ b/fastdeploy/input/qwen3_vl_processor/process.py @@ -15,6 +15,7 @@ # limitations under the License. """ +import math import pickle from typing import Any, Dict, List, Optional, Tuple, Union @@ -143,6 +144,103 @@ def calc_one(thw): return calc_one(grid_thw) + @staticmethod + def _closest_factor_pair(n: int): + """Return (small, large) factor pair of n closest to a square. + + Mirrors vllm's ``closest_factor_pair`` in Qwen2VLProcessingInfo. + """ + for d in range(math.isqrt(n), 0, -1): + if n % d == 0: + return d, n // d + return 1, n + + @staticmethod + def _max_tokens_for_pixels(max_pixels: int, patch_size: int, merge_size: int) -> int: + """Compute the maximum post-merge token count achievable under *max_pixels*. + + Aligns with vllm's ``get_image_size_with_most_features``. + See qwen_vl_processor/process.py for full description. 
+ """ + unit = patch_size * merge_size + max_seq_len = max_pixels // (unit * unit) + for n in range(max_seq_len, 0, -1): + h, w = DataProcessor._closest_factor_pair(n) + if w / h <= 200: + return n + return 1 + + def get_max_image_tokens(self, seq_len: int = None) -> int: + """Return the maximum number of tokens a single image can produce. + + Uses the same algorithm as vllm's ``get_max_image_tokens`` in + ``Qwen2VLProcessingInfo``. + + Args: + seq_len: Optional upper cap (model's max_model_len). + + Returns: + Maximum number of image tokens per item. + """ + num_tokens = self._max_tokens_for_pixels( + self.image_processor.max_pixels, + self.image_processor.patch_size, + self.image_processor.merge_size, + ) + if seq_len is not None: + num_tokens = min(num_tokens, seq_len) + return num_tokens + + def get_max_video_tokens(self, seq_len: int = None) -> int: + """Return the maximum number of tokens a single video item can produce. + + For Qwen3-VL, video frames are constrained by VIDEO_MAX_PIXELS + (128*28*28 ~ 768*28*28) rather than the image max_pixels. + Temporal padding follows the processor: frames are padded *up* to the + next multiple of ``temporal_patch_size`` (same as vllm line 868). + + Args: + seq_len: Optional sequence length cap. + + Returns: + Maximum number of video tokens per item. 
+ """ + temporal_patch_size = self.image_processor.temporal_patch_size + + # Video uses its own (tighter) pixel bounds + spatial_tokens = self._max_tokens_for_pixels( + VIDEO_MAX_PIXELS, + self.image_processor.patch_size, + self.image_processor.merge_size, + ) + + # Pad frames UP (vllm: padded = frames + frames % temporal_patch_size) + padded_frames = self.max_frames + self.max_frames % temporal_patch_size + grid_t = max(padded_frames // temporal_patch_size, 1) + + num_tokens = grid_t * spatial_tokens + if seq_len is not None: + num_tokens = min(num_tokens, seq_len) + return num_tokens + + def get_mm_max_tokens_per_item(self, seq_len: int = None): + """Return max tokens per item for each active modality. + + Aligns with vllm's ``get_mm_max_tokens_per_item`` interface so that + FastDeploy can compute encoder budgets without running dummy inputs. + + Args: + seq_len: Model's maximum sequence length (used as an upper cap). + + Returns: + Dict mapping modality name to max tokens, e.g. + ``{"image": 1280, "video": 8192}``. + """ + return { + "image": self.get_max_image_tokens(seq_len), + "video": self.get_max_video_tokens(seq_len), + } + def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None): """ Convert text with image/video placeholders into model inputs. diff --git a/fastdeploy/input/qwen3_vl_processor/qwen3_vl_processor.py b/fastdeploy/input/qwen3_vl_processor/qwen3_vl_processor.py index cc0110e1e1d..17bbb9027bd 100644 --- a/fastdeploy/input/qwen3_vl_processor/qwen3_vl_processor.py +++ b/fastdeploy/input/qwen3_vl_processor/qwen3_vl_processor.py @@ -315,3 +315,18 @@ def pack_outputs(self, outputs): outputs["mm_num_token_func"] = self.processor.mm_num_tokens return outputs + + def get_mm_max_tokens_per_item(self, seq_len: int = None): + """Return max tokens per item for each modality (image / video). 
+ + Delegates to the inner DataProcessor so that upper layers + (common_engine, scheduler) can compute encoder budgets without + running dummy inputs through the model. + + Args: + seq_len: Model's maximum sequence length (optional cap). + + Returns: + Dict[str, int]: e.g. ``{"image": 1280, "video": 8192}``. + """ + return self.processor.get_mm_max_tokens_per_item(seq_len) diff --git a/fastdeploy/input/qwen_vl_processor/process.py b/fastdeploy/input/qwen_vl_processor/process.py index a84fac7854e..c3d762aab39 100644 --- a/fastdeploy/input/qwen_vl_processor/process.py +++ b/fastdeploy/input/qwen_vl_processor/process.py @@ -140,6 +140,114 @@ def calc_one(thw): return calc_one(grid_thw) + @staticmethod + def _closest_factor_pair(n: int): + """Return (small, large) factor pair of n closest to a square. + + Mirrors vllm's ``closest_factor_pair`` in Qwen2VLProcessingInfo. + """ + import math + + for d in range(math.isqrt(n), 0, -1): + if n % d == 0: + return d, n // d + return 1, n + + @staticmethod + def _max_tokens_for_pixels(max_pixels: int, patch_size: int, merge_size: int) -> int: + """Compute the maximum post-merge token count achievable under *max_pixels*. + + Aligns with vllm's ``get_image_size_with_most_features``: + 1. ``max_seq_len = max_pixels // unit^2`` where ``unit = patch * merge`` + is the number of *merged* tokens that can fit. + 2. Find the largest ``seq_len <= max_seq_len`` whose factor pair has + aspect ratio <= 200 (the Qwen2-VL processor rejects extreme ratios). + 3. Token count = ``height_factor * width_factor`` = ``seq_len``. + + Using ``closest_factor_pair`` guarantees we never undercount when + ``max_pixels`` is not a perfect square of ``unit``. 
+ """ + unit = patch_size * merge_size + max_seq_len = max_pixels // (unit * unit) + for n in range(max_seq_len, 0, -1): + h, w = DataProcessor._closest_factor_pair(n) + if w / h <= 200: + return n + return 1 + + def get_max_image_tokens(self, seq_len: int = None) -> int: + """Return the maximum number of tokens a single image can produce. + + Uses the same algorithm as vllm's ``get_max_image_tokens`` in + ``Qwen2VLProcessingInfo``: factorises the token budget from + ``max_pixels`` and finds the best non-extreme aspect ratio. + + Args: + seq_len: Optional upper cap (model's max_model_len). + + Returns: + Maximum number of image tokens per item. + """ + num_tokens = self._max_tokens_for_pixels( + self.image_processor.max_pixels, + self.image_processor.patch_size, + self.image_processor.merge_size, + ) + if seq_len is not None: + num_tokens = min(num_tokens, seq_len) + return num_tokens + + def get_max_video_tokens(self, seq_len: int = None) -> int: + """Return the maximum number of tokens a single video item can produce. + + Mirrors vllm's ``get_max_video_tokens``: + - Spatial token budget same as image (``_max_tokens_for_pixels``). + - Temporal dimension: frames are *padded up* to the next multiple of + ``temporal_patch_size`` (matching the processor behaviour in + qwen2_vl/image_processing_qwen2_vl.py line 294). + + Args: + seq_len: Optional upper cap (model's max_model_len). + + Returns: + Maximum number of video tokens per item. 
+ """ + temporal_patch_size = self.image_processor.temporal_patch_size + + spatial_tokens = self._max_tokens_for_pixels( + self.image_processor.max_pixels, + self.image_processor.patch_size, + self.image_processor.merge_size, + ) + + # Pad frames UP to next multiple of temporal_patch_size (vllm line 868): + # padded_num_frames = num_frames + num_frames % temporal_patch_size + padded_frames = self.max_frames + self.max_frames % temporal_patch_size + grid_t = max(padded_frames // temporal_patch_size, 1) + + num_tokens = grid_t * spatial_tokens + if seq_len is not None: + num_tokens = min(num_tokens, seq_len) + return num_tokens + + def get_mm_max_tokens_per_item(self, seq_len: int = None): + """Return max tokens per item for each active modality. + + Aligns with vllm's ``get_mm_max_tokens_per_item`` interface so that + FastDeploy can compute encoder budgets without running dummy inputs. + + Args: + seq_len: Model's maximum sequence length (used as an upper cap). + + Returns: + Dict mapping modality name to max tokens, e.g. + ``{"image": 1280, "video": 8192}``. + """ + return { + "image": self.get_max_image_tokens(seq_len), + "video": self.get_max_video_tokens(seq_len), + } + def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None): """ Convert text with image/video placeholders into model inputs. diff --git a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py index 88bc5c76938..06f0f597edb 100644 --- a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py +++ b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py @@ -315,3 +315,18 @@ def pack_outputs(self, outputs): outputs["mm_num_token_func"] = self.processor.mm_num_tokens return outputs + + def get_mm_max_tokens_per_item(self, seq_len: int = None): + """Return max tokens per item for each modality (image / video). 
+ + Delegates to the inner DataProcessor so that upper layers + (common_engine, scheduler) can compute encoder budgets without + running dummy inputs through the model. + + Args: + seq_len: Model's maximum sequence length (optional cap). + + Returns: + Dict[str, int]: e.g. ``{"image": 1280, "video": 8192}``. + """ + return self.processor.get_mm_max_tokens_per_item(seq_len) diff --git a/fastdeploy/scheduler/config.py b/fastdeploy/scheduler/config.py index 1422b2635f3..86bc504f482 100644 --- a/fastdeploy/scheduler/config.py +++ b/fastdeploy/scheduler/config.py @@ -269,7 +269,6 @@ def __init__(self, args): """ self.name = "local" # "local" for LocalScheduler or "global" for GlobalScheduler self.max_num_batched_tokens = 2048 # base token_num for text inputs - self.max_extra_num_batched_tokens = 16384 # extra token_num for multimodal inputs self.max_num_seqs = 34 self.splitwise_role = "mixed" self.enable_overlap_schedule = False diff --git a/fastdeploy/worker/input_batch.py b/fastdeploy/worker/input_batch.py index 55a3f39a2ee..9b819cb69b0 100644 --- a/fastdeploy/worker/input_batch.py +++ b/fastdeploy/worker/input_batch.py @@ -700,6 +700,7 @@ def __init__(self, fd_config: FDConfig, target_model_input_batch: InputBatch) -> self.cache_config: CacheConfig = fd_config.cache_config self.speculative_config: SpeculativeConfig = fd_config.speculative_config self.enable_pd_reorder: bool = False + self.max_chunk_tokens = fd_config.get_max_chunk_tokens(self.model_config.mm_max_tokens_per_item) def init_share_inputs(self): # share with targe model @@ -752,7 +753,7 @@ def init_share_inputs(self): self.target_hidden_states = paddle.full( [ - self.scheduler_config.max_num_batched_tokens + self.scheduler_config.max_extra_num_batched_tokens, + self.max_chunk_tokens, self.model_config.hidden_size, ], 0, diff --git a/tests/input/test_mm_max_tokens_per_item.py b/tests/input/test_mm_max_tokens_per_item.py new file mode 100644 index 00000000000..735dac0921f --- /dev/null +++ 
b/tests/input/test_mm_max_tokens_per_item.py @@ -0,0 +1,700 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +Unit tests for get_max_image_tokens / get_max_video_tokens / +get_mm_max_tokens_per_item across all multimodal processor DataProcessors and +their outer VLProcessor wrappers. + +Tests run without real model weights: the ImageProcessor and tokenizer are +mocked to avoid loading any files. +""" + +import unittest +from unittest.mock import MagicMock, patch + +# --------------------------------------------------------------------------- +# Helpers to build a minimal mock ImageProcessor matching the real interface +# --------------------------------------------------------------------------- + + +def _make_mock_image_processor( + min_pixels: int = 4 * 28 * 28, + max_pixels: int = 16384 * 28 * 28, + patch_size: int = 14, + merge_size: int = 2, + temporal_patch_size: int = 2, +): + """Return a MagicMock that behaves like + fastdeploy.input.image_processors.qwen_processor.ImageProcessor. 
+ """ + ip = MagicMock() + ip.min_pixels = min_pixels + ip.max_pixels = max_pixels + ip.patch_size = patch_size + ip.merge_size = merge_size + ip.temporal_patch_size = temporal_patch_size + return ip + + +# --------------------------------------------------------------------------- +# Reference formula (mirrors vllm's get_image_size_with_most_features) +# --------------------------------------------------------------------------- + +import math + + +def _closest_factor_pair(n): + for d in range(math.isqrt(n), 0, -1): + if n % d == 0: + return d, n // d + return 1, n + + +def _max_tokens_for_pixels(max_pixels, patch_size, merge_size): + """Mirrors DataProcessor._max_tokens_for_pixels (vllm-aligned).""" + unit = patch_size * merge_size + max_seq_len = max_pixels // (unit * unit) + for n in range(max_seq_len, 0, -1): + h, w = _closest_factor_pair(n) + if w / h <= 200: + return n + return 1 + + +def _expected_image_tokens(max_pixels, patch_size, merge_size, seq_len=None): + n = _max_tokens_for_pixels(max_pixels, patch_size, merge_size) + return min(n, seq_len) if seq_len is not None else n + + +def _expected_video_tokens(max_pixels, patch_size, merge_size, temporal_patch_size, max_frames, seq_len=None): + """Mirrors get_max_video_tokens: frames padded UP to temporal_patch_size.""" + spatial = _max_tokens_for_pixels(max_pixels, patch_size, merge_size) + padded_frames = max_frames + max_frames % temporal_patch_size + grid_t = max(padded_frames // temporal_patch_size, 1) + n = grid_t * spatial + return min(n, seq_len) if seq_len is not None else n + + +# =========================================================================== +# Tests for qwen_vl_processor DataProcessor +# =========================================================================== + + +class TestQwenVLDataProcessor(unittest.TestCase): + """Tests for fastdeploy.input.qwen_vl_processor.process.DataProcessor""" + + def _make_processor(self, max_pixels=16384 * 28 * 28, max_frames=10): + """Build a 
DataProcessor with mocked heavy dependencies.""" + from fastdeploy.input.qwen_vl_processor.process import DataProcessor + + ip = _make_mock_image_processor(max_pixels=max_pixels) + + with ( + patch("fastdeploy.input.qwen_vl_processor.process.ImageProcessor.from_pretrained", return_value=ip), + patch("paddleformers.transformers.AutoTokenizer.from_pretrained", return_value=MagicMock()), + ): + proc = DataProcessor.__new__(DataProcessor) + proc.image_processor = ip + proc.spatial_conv_size = ip.merge_size + proc.temporal_conv_size = ip.temporal_patch_size + proc.max_frames = max_frames + proc.min_frames = 1 + return proc + + def test_get_max_image_tokens_default(self): + """get_max_image_tokens returns a positive integer.""" + proc = self._make_processor() + result = proc.get_max_image_tokens() + self.assertIsInstance(result, int) + self.assertGreater(result, 0) + + def test_get_max_image_tokens_matches_formula(self): + """get_max_image_tokens matches the reference formula.""" + max_pixels = 16384 * 28 * 28 + proc = self._make_processor(max_pixels=max_pixels) + expected = _expected_image_tokens( + max_pixels=max_pixels, + patch_size=proc.image_processor.patch_size, + merge_size=proc.image_processor.merge_size, + ) + self.assertEqual(proc.get_max_image_tokens(), expected) + + def test_get_max_image_tokens_seq_len_cap(self): + """seq_len caps the result when smaller than the formula value.""" + proc = self._make_processor() + uncapped = proc.get_max_image_tokens() + cap = uncapped // 2 + self.assertEqual(proc.get_max_image_tokens(seq_len=cap), cap) + + def test_get_max_image_tokens_seq_len_larger(self): + """seq_len larger than formula value has no effect.""" + proc = self._make_processor() + uncapped = proc.get_max_image_tokens() + self.assertEqual(proc.get_max_image_tokens(seq_len=uncapped * 2), uncapped) + + def test_get_max_video_tokens_default(self): + """get_max_video_tokens returns a positive integer.""" + proc = self._make_processor() + result = 
proc.get_max_video_tokens() + self.assertIsInstance(result, int) + self.assertGreater(result, 0) + + def test_get_max_video_tokens_matches_formula(self): + """get_max_video_tokens matches the reference formula.""" + max_pixels = 16384 * 28 * 28 + max_frames = 10 + proc = self._make_processor(max_pixels=max_pixels, max_frames=max_frames) + expected = _expected_video_tokens( + max_pixels=max_pixels, + patch_size=proc.image_processor.patch_size, + merge_size=proc.image_processor.merge_size, + temporal_patch_size=proc.image_processor.temporal_patch_size, + max_frames=max_frames, + ) + self.assertEqual(proc.get_max_video_tokens(), expected) + + def test_get_max_video_tokens_seq_len_cap(self): + """seq_len caps video tokens correctly.""" + proc = self._make_processor() + uncapped = proc.get_max_video_tokens() + cap = uncapped // 3 + self.assertEqual(proc.get_max_video_tokens(seq_len=cap), cap) + + def test_video_tokens_ge_image_tokens(self): + """A multi-frame video produces at least as many tokens as a single image.""" + proc = self._make_processor(max_frames=4) + self.assertGreaterEqual(proc.get_max_video_tokens(), proc.get_max_image_tokens()) + + def test_get_mm_max_tokens_per_item_returns_dict(self): + """get_mm_max_tokens_per_item returns a dict with 'image' and 'video'.""" + proc = self._make_processor() + result = proc.get_mm_max_tokens_per_item() + self.assertIn("image", result) + self.assertIn("video", result) + + def test_get_mm_max_tokens_per_item_values_consistent(self): + """Dict values equal the individual get_max_*_tokens results.""" + proc = self._make_processor() + result = proc.get_mm_max_tokens_per_item() + self.assertEqual(result["image"], proc.get_max_image_tokens()) + self.assertEqual(result["video"], proc.get_max_video_tokens()) + + def test_get_mm_max_tokens_per_item_seq_len_propagated(self): + """seq_len is propagated to both sub-calls.""" + proc = self._make_processor() + seq_len = 512 + result = proc.get_mm_max_tokens_per_item(seq_len=seq_len) 
+ self.assertLessEqual(result["image"], seq_len) + self.assertLessEqual(result["video"], seq_len) + + +# =========================================================================== +# Tests for qwen3_vl_processor DataProcessor +# =========================================================================== + + +class TestQwen3VLDataProcessor(unittest.TestCase): + """Tests for fastdeploy.input.qwen3_vl_processor.process.DataProcessor""" + + # VIDEO_MAX_PIXELS from qwen3_vl_processor/process.py + VIDEO_MAX_PIXELS = 768 * 28 * 28 + + def _make_processor(self, max_pixels=16384 * 28 * 28, max_frames=10): + from fastdeploy.input.qwen3_vl_processor.process import DataProcessor + + ip = _make_mock_image_processor(max_pixels=max_pixels) + + proc = DataProcessor.__new__(DataProcessor) + proc.image_processor = ip + proc.spatial_conv_size = ip.merge_size + proc.temporal_conv_size = ip.temporal_patch_size + proc.max_frames = max_frames + proc.min_frames = 1 + return proc + + def test_get_max_image_tokens_matches_formula(self): + max_pixels = 16384 * 28 * 28 + proc = self._make_processor(max_pixels=max_pixels) + expected = _expected_image_tokens( + max_pixels=max_pixels, + patch_size=proc.image_processor.patch_size, + merge_size=proc.image_processor.merge_size, + ) + self.assertEqual(proc.get_max_image_tokens(), expected) + + def test_get_max_video_tokens_uses_video_max_pixels(self): + """Qwen3-VL video uses VIDEO_MAX_PIXELS, not image max_pixels.""" + proc = self._make_processor(max_pixels=16384 * 28 * 28, max_frames=10) + expected = _expected_video_tokens( + max_pixels=self.VIDEO_MAX_PIXELS, + patch_size=proc.image_processor.patch_size, + merge_size=proc.image_processor.merge_size, + temporal_patch_size=proc.image_processor.temporal_patch_size, + max_frames=proc.max_frames, + ) + self.assertEqual(proc.get_max_video_tokens(), expected) + + def test_get_max_video_tokens_lt_image_with_large_image_pixels(self): + """When image max_pixels >> VIDEO_MAX_PIXELS, video tokens < image 
tokens.""" + proc = self._make_processor(max_pixels=16384 * 28 * 28, max_frames=1) + # single frame video — still uses VIDEO_MAX_PIXELS so should be less + self.assertLessEqual(proc.get_max_video_tokens(), proc.get_max_image_tokens()) + + def test_seq_len_cap_image(self): + proc = self._make_processor() + cap = 100 + self.assertLessEqual(proc.get_max_image_tokens(seq_len=cap), cap) + + def test_seq_len_cap_video(self): + proc = self._make_processor() + cap = 100 + self.assertLessEqual(proc.get_max_video_tokens(seq_len=cap), cap) + + def test_get_mm_max_tokens_per_item_structure(self): + proc = self._make_processor() + result = proc.get_mm_max_tokens_per_item() + self.assertIn("image", result) + self.assertIn("video", result) + self.assertIsInstance(result["image"], int) + self.assertIsInstance(result["video"], int) + + def test_get_mm_max_tokens_per_item_consistency(self): + proc = self._make_processor() + result = proc.get_mm_max_tokens_per_item(seq_len=4096) + self.assertEqual(result["image"], proc.get_max_image_tokens(seq_len=4096)) + self.assertEqual(result["video"], proc.get_max_video_tokens(seq_len=4096)) + + +# =========================================================================== +# Tests for QwenVLProcessor wrapper (outer processor) +# =========================================================================== + + +class TestQwenVLProcessorWrapper(unittest.TestCase): + """Tests that QwenVLProcessor.get_mm_max_tokens_per_item delegates correctly.""" + + def _make_outer_processor(self, img_tokens=1280, vid_tokens=8192): + """Build a QwenVLProcessor whose inner DataProcessor is mocked.""" + from fastdeploy.input.qwen_vl_processor.qwen_vl_processor import QwenVLProcessor + + inner_proc = MagicMock() + inner_proc.get_mm_max_tokens_per_item.return_value = { + "image": img_tokens, + "video": vid_tokens, + } + outer = QwenVLProcessor.__new__(QwenVLProcessor) + outer.processor = inner_proc + return outer + + def test_delegates_to_inner_processor(self): + outer 
= self._make_outer_processor(img_tokens=1280, vid_tokens=8192) + result = outer.get_mm_max_tokens_per_item() + self.assertEqual(result["image"], 1280) + self.assertEqual(result["video"], 8192) + + def test_seq_len_forwarded(self): + outer = self._make_outer_processor() + outer.get_mm_max_tokens_per_item(seq_len=4096) + outer.processor.get_mm_max_tokens_per_item.assert_called_once_with(4096) + + def test_returns_dict(self): + outer = self._make_outer_processor() + result = outer.get_mm_max_tokens_per_item() + self.assertIsInstance(result, dict) + + +# =========================================================================== +# Tests for Qwen3VLProcessor wrapper (outer processor) +# =========================================================================== + + +class TestQwen3VLProcessorWrapper(unittest.TestCase): + """Tests that Qwen3VLProcessor.get_mm_max_tokens_per_item delegates correctly.""" + + def _make_outer_processor(self, img_tokens=2048, vid_tokens=4096): + from fastdeploy.input.qwen3_vl_processor.qwen3_vl_processor import ( + Qwen3VLProcessor, + ) + + inner_proc = MagicMock() + inner_proc.get_mm_max_tokens_per_item.return_value = { + "image": img_tokens, + "video": vid_tokens, + } + outer = Qwen3VLProcessor.__new__(Qwen3VLProcessor) + outer.processor = inner_proc + return outer + + def test_delegates_to_inner_processor(self): + outer = self._make_outer_processor(img_tokens=2048, vid_tokens=4096) + result = outer.get_mm_max_tokens_per_item() + self.assertEqual(result["image"], 2048) + self.assertEqual(result["video"], 4096) + + def test_seq_len_forwarded(self): + outer = self._make_outer_processor() + outer.get_mm_max_tokens_per_item(seq_len=8192) + outer.processor.get_mm_max_tokens_per_item.assert_called_once_with(8192) + + def test_returns_dict(self): + outer = self._make_outer_processor() + result = outer.get_mm_max_tokens_per_item() + self.assertIsInstance(result, dict) + + +# 
=========================================================================== +# Cross-processor sanity checks +# =========================================================================== + + +class TestCrossProcessorSanity(unittest.TestCase): + """Verify that QwenVL and Qwen3VL DataProcessors agree on the formula + when given identical ImageProcessor configs.""" + + def _make_qwen_vl(self, **kw): + from fastdeploy.input.qwen_vl_processor.process import DataProcessor as QDP + + ip = _make_mock_image_processor(**kw) + proc = QDP.__new__(QDP) + proc.image_processor = ip + proc.spatial_conv_size = ip.merge_size + proc.temporal_conv_size = ip.temporal_patch_size + proc.max_frames = 8 + proc.min_frames = 1 + return proc + + def _make_qwen3_vl(self, **kw): + from fastdeploy.input.qwen3_vl_processor.process import DataProcessor as Q3DP + + ip = _make_mock_image_processor(**kw) + proc = Q3DP.__new__(Q3DP) + proc.image_processor = ip + proc.spatial_conv_size = ip.merge_size + proc.temporal_conv_size = ip.temporal_patch_size + proc.max_frames = 8 + proc.min_frames = 1 + return proc + + def test_image_tokens_same_config(self): + """With identical image_processor config, image token counts match.""" + cfg = dict(max_pixels=4096 * 28 * 28, patch_size=14, merge_size=2) + qwen = self._make_qwen_vl(**cfg) + qwen3 = self._make_qwen3_vl(**cfg) + self.assertEqual(qwen.get_max_image_tokens(), qwen3.get_max_image_tokens()) + + def test_qwen3_video_uses_tighter_pixel_bound(self): + """Qwen3VL video tokens <= QwenVL video tokens (tighter pixel bound).""" + cfg = dict(max_pixels=16384 * 28 * 28, patch_size=14, merge_size=2, temporal_patch_size=2) + qwen = self._make_qwen_vl(**cfg) + qwen3 = self._make_qwen3_vl(**cfg) + # Qwen3 uses VIDEO_MAX_PIXELS=768*28*28 vs QwenVL's image max_pixels + self.assertLessEqual(qwen3.get_max_video_tokens(), qwen.get_max_video_tokens()) + + +# =========================================================================== +# Alignment tests: verify our formula 
# ... matches vllm exactly
# ===========================================================================


class TestVllmAlignment(unittest.TestCase):
    """Verify that our implementation matches vllm's formula on known cases."""

    def _make_qwen_vl(self, max_pixels, max_frames):
        """Build a Qwen-VL DataProcessor with a mocked image processor (no __init__ I/O)."""
        from fastdeploy.input.qwen_vl_processor.process import DataProcessor as QDP

        ip = _make_mock_image_processor(max_pixels=max_pixels)
        proc = QDP.__new__(QDP)
        proc.image_processor = ip
        proc.max_frames = max_frames
        proc.min_frames = 1
        return proc

    def _make_qwen3_vl(self, max_pixels, max_frames):
        """Build a Qwen3-VL DataProcessor with a mocked image processor (no __init__ I/O)."""
        from fastdeploy.input.qwen3_vl_processor.process import DataProcessor as Q3DP

        ip = _make_mock_image_processor(max_pixels=max_pixels)
        proc = Q3DP.__new__(Q3DP)
        proc.image_processor = ip
        proc.max_frames = max_frames
        proc.min_frames = 1
        return proc

    # --- image token tests for several non-square max_pixels values ---

    def test_image_tokens_non_square_1280(self):
        """1280*28*28 is not a perfect square of units — old formula underestimated."""
        max_pixels = 1280 * 28 * 28
        proc = self._make_qwen_vl(max_pixels=max_pixels, max_frames=1)
        expected = _expected_image_tokens(max_pixels, patch_size=14, merge_size=2)
        self.assertEqual(proc.get_max_image_tokens(), expected)
        self.assertEqual(expected, 1280)  # vllm ground truth

    def test_image_tokens_non_square_2048(self):
        max_pixels = 2048 * 28 * 28
        proc = self._make_qwen_vl(max_pixels=max_pixels, max_frames=1)
        expected = _expected_image_tokens(max_pixels, patch_size=14, merge_size=2)
        self.assertEqual(proc.get_max_image_tokens(), expected)
        self.assertEqual(expected, 2048)

    def test_image_tokens_non_square_1000(self):
        max_pixels = 1000 * 28 * 28
        proc = self._make_qwen_vl(max_pixels=max_pixels, max_frames=1)
        expected = _expected_image_tokens(max_pixels, patch_size=14, merge_size=2)
        self.assertEqual(proc.get_max_image_tokens(), expected)
        self.assertEqual(expected, 1000)

    def test_image_tokens_default_16384(self):
        """Default max_pixels=16384*28*28 is a perfect square — both formulas agree."""
        max_pixels = 16384 * 28 * 28
        proc = self._make_qwen_vl(max_pixels=max_pixels, max_frames=1)
        self.assertEqual(proc.get_max_image_tokens(), 16384)

    # --- video temporal padding tests ---

    def test_video_odd_frames_padded_up(self):
        """Odd frame counts are padded UP (not truncated down) to temporal_patch_size."""
        # max_frames=9, temporal_patch_size=2 → padded=10, grid_t=5 (not 4)
        max_pixels = 1280 * 28 * 28
        proc = self._make_qwen_vl(max_pixels=max_pixels, max_frames=9)
        expected = _expected_video_tokens(
            max_pixels,
            patch_size=14,
            merge_size=2,
            temporal_patch_size=2,
            max_frames=9,
        )
        self.assertEqual(proc.get_max_video_tokens(), expected)
        # Verify padding direction: padded_frames=10, grid_t=5, not 4
        spatial = _max_tokens_for_pixels(max_pixels, 14, 2)
        self.assertEqual(expected, 5 * spatial)

    def test_video_exact_multiple_frames(self):
        """Even frame count (exact multiple) should give grid_t = max_frames / temporal."""
        max_pixels = 1280 * 28 * 28
        proc = self._make_qwen_vl(max_pixels=max_pixels, max_frames=10)
        expected = _expected_video_tokens(
            max_pixels,
            patch_size=14,
            merge_size=2,
            temporal_patch_size=2,
            max_frames=10,
        )
        self.assertEqual(proc.get_max_video_tokens(), expected)
        spatial = _max_tokens_for_pixels(max_pixels, 14, 2)
        self.assertEqual(expected, 5 * spatial)

    def test_qwen3_video_non_square_pixels(self):
        """Qwen3-VL video uses VIDEO_MAX_PIXELS=768*28*28, also non-square."""
        from fastdeploy.input.qwen3_vl_processor.process import VIDEO_MAX_PIXELS

        proc = self._make_qwen3_vl(max_pixels=16384 * 28 * 28, max_frames=10)
        expected = _expected_video_tokens(
            VIDEO_MAX_PIXELS,
            patch_size=14,
            merge_size=2,
            temporal_patch_size=2,
            max_frames=10,
        )
        self.assertEqual(proc.get_max_video_tokens(), expected)
        self.assertEqual(expected, 5 * _max_tokens_for_pixels(VIDEO_MAX_PIXELS, 14, 2))


# ===========================================================================
# Tests for ernie4_5_vl_processor DataProcessor
# ===========================================================================


def _make_mock_adaptive_image_preprocessor(
    patch_size: int = 28,
    merge_size: int = 2,
    temporal_patch_size: int = 2,
    image_max_pixels: int = 6177 * 28 * 28,
    video_max_pixels: int = 1196 * 28 * 28,
):
    """Return a mock AdaptiveImageProcessor matching ERNIE4.5-VL's interface.

    NOTE: ``image_max_pixels`` / ``video_max_pixels`` are accepted for caller
    convenience but unused here — the mocked resize reads the pixel budget
    from its own ``min_pixels`` / ``max_pixels`` call arguments.
    """
    ip = MagicMock()
    ip.patch_size = patch_size
    ip.merge_size = merge_size
    ip.temporal_patch_size = temporal_patch_size

    def _get_smarted_resize(height, width, min_pixels, max_pixels):
        """Minimal resize that preserves aspect ratio within pixel budget."""

        area = height * width
        if area > max_pixels:
            # Scale DOWN to fit the budget, snapping each edge to a patch multiple.
            scale = (max_pixels / area) ** 0.5
            height = int(height * scale // patch_size) * patch_size
            width = int(width * scale // patch_size) * patch_size
        elif area < min_pixels:
            # Scale UP to meet the minimum; clamp to at least one patch per edge.
            scale = (min_pixels / area) ** 0.5
            height = max(int(height * scale // patch_size) * patch_size, patch_size)
            width = max(int(width * scale // patch_size) * patch_size, patch_size)
        patches_h = height // patch_size
        patches_w = width // patch_size
        return (height, width), (patches_h, patches_w)

    ip.get_smarted_resize.side_effect = _get_smarted_resize
    return ip


def _expected_ernie45_image_tokens(image_preprocessor, spatial_conv_size, image_min_pixels, image_max_pixels, seq_len):
    """Reference: mirrors ERNIE4.5-VL DataProcessor.get_max_image_tokens."""
    from fastdeploy.input.utils import MAX_IMAGE_DIMENSION

    _, (patches_h, patches_w) = image_preprocessor.get_smarted_resize(
        height=MAX_IMAGE_DIMENSION,
        width=MAX_IMAGE_DIMENSION,
        min_pixels=image_min_pixels,
        max_pixels=image_max_pixels,
    )
    # Second call with same target
    _, (patches_h, patches_w) = image_preprocessor.get_smarted_resize(
        height=patches_h * image_preprocessor.patch_size,
        width=patches_w * image_preprocessor.patch_size,
        min_pixels=image_min_pixels,
        max_pixels=image_max_pixels,
    )
    num_tokens = (patches_h * patches_w) // (spatial_conv_size**2)
    return min(num_tokens, seq_len)


def _expected_ernie45_video_tokens(
    image_preprocessor,
    spatial_conv_size,
    temporal_conv_size,
    image_min_pixels,
    image_max_pixels,
    video_min_pixels,
    video_max_pixels,
    max_frames,
    seq_len,
):
    """Reference: mirrors ERNIE4.5-VL DataProcessor.get_max_video_tokens (fixed)."""
    from fastdeploy.input.utils import MAX_IMAGE_DIMENSION

    # get_image_size_with_most_features uses image pixels
    resized_h, resized_w = image_preprocessor.get_smarted_resize(
        height=MAX_IMAGE_DIMENSION,
        width=MAX_IMAGE_DIMENSION,
        min_pixels=image_min_pixels,
        max_pixels=image_max_pixels,
    )[0]
    _, (patches_h, patches_w) = image_preprocessor.get_smarted_resize(
        height=resized_h,
        width=resized_w,
        min_pixels=video_min_pixels,
        max_pixels=video_max_pixels,
    )
    # max_frames is the temporal dimension (whole video)
    num_tokens = (max_frames * patches_h * patches_w) // (spatial_conv_size**2 * temporal_conv_size)
    return min(num_tokens, seq_len)


class TestErnie45VLDataProcessor(unittest.TestCase):
    """Tests for fastdeploy.input.ernie4_5_vl_processor.process.DataProcessor"""

    DEFAULT_IMAGE_MAX_PIXELS = 6177 * 28 * 28
    DEFAULT_VIDEO_MAX_PIXELS = 1196 * 28 * 28
    DEFAULT_IMAGE_MIN_PIXELS = 4 * 28 * 28
    DEFAULT_VIDEO_MIN_PIXELS = 299 * 28 * 28

    def _make_processor(
        self, max_frames=180, spatial_conv_size=2, temporal_conv_size=2, image_max_pixels=None, video_max_pixels=None
    ):
        """Build an ERNIE4.5-VL DataProcessor with a mocked preprocessor (no __init__ I/O)."""
        from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor

        ip = _make_mock_adaptive_image_preprocessor(
            image_max_pixels=image_max_pixels or self.DEFAULT_IMAGE_MAX_PIXELS,
            video_max_pixels=video_max_pixels or self.DEFAULT_VIDEO_MAX_PIXELS,
            merge_size=spatial_conv_size,
            temporal_patch_size=temporal_conv_size,
        )

        proc = DataProcessor.__new__(DataProcessor)
        proc.image_preprocessor = ip
        proc.spatial_conv_size = spatial_conv_size
        proc.temporal_conv_size = temporal_conv_size
        proc.image_min_pixels = self.DEFAULT_IMAGE_MIN_PIXELS
        proc.image_max_pixels = image_max_pixels or self.DEFAULT_IMAGE_MAX_PIXELS
        proc.video_min_pixels = self.DEFAULT_VIDEO_MIN_PIXELS
        proc.video_max_pixels = video_max_pixels or self.DEFAULT_VIDEO_MAX_PIXELS
        proc.max_frames = max_frames
        return proc

    def test_get_max_image_tokens_positive(self):
        """get_max_image_tokens returns a positive integer."""
        proc = self._make_processor()
        result = proc.get_max_image_tokens(seq_len=32768)
        self.assertIsInstance(result, int)
        self.assertGreater(result, 0)

    def test_get_max_image_tokens_seq_len_cap(self):
        """seq_len caps the result."""
        proc = self._make_processor()
        result = proc.get_max_image_tokens(seq_len=10)
        self.assertLessEqual(result, 10)

    def test_get_max_video_tokens_includes_frames(self):
        """Video tokens must scale with max_frames (bug check: frames must be included)."""
        proc_few = self._make_processor(max_frames=1)
        proc_many = self._make_processor(max_frames=10)
        few = proc_few.get_max_video_tokens(seq_len=100000)
        many = proc_many.get_max_video_tokens(seq_len=100000)
        # With 10x more frames, token count must be larger
        self.assertGreater(many, few)

    def test_get_max_video_tokens_proportional_to_frames(self):
        """With same spatial budget, doubling frames doubles video tokens."""
        proc_10 = self._make_processor(max_frames=10)
        proc_20 = self._make_processor(max_frames=20)
        t10 = proc_10.get_max_video_tokens(seq_len=1000000)
        t20 = proc_20.get_max_video_tokens(seq_len=1000000)
        self.assertEqual(t20, t10 * 2)

    def test_get_max_video_tokens_seq_len_cap(self):
        """seq_len caps video tokens correctly."""
        proc = self._make_processor()
        result = proc.get_max_video_tokens(seq_len=100)
        self.assertLessEqual(result, 100)

    def test_get_mm_max_tokens_per_item_structure(self):
        """Returns dict with 'image' and 'video' keys."""
        proc = self._make_processor()
        result = proc.get_mm_max_tokens_per_item(seq_len=32768)
        self.assertIn("image", result)
        self.assertIn("video", result)

    def test_get_mm_max_tokens_per_item_consistency(self):
        """Dict values match individual get_max_*_tokens results."""
        proc = self._make_processor()
        seq_len = 32768
        result = proc.get_mm_max_tokens_per_item(seq_len=seq_len)
        self.assertEqual(result["image"], proc.get_max_image_tokens(seq_len=seq_len))
        self.assertEqual(result["video"], proc.get_max_video_tokens(seq_len=seq_len))

    def test_video_tokens_gt_image_tokens_with_many_frames(self):
        """Multi-frame video should produce more tokens than a single image
        when video and image share the same pixel budget."""
        # Use identical pixel budget for image and video so spatial resolution is equal,
        # then more frames means more tokens.
        shared_pixels = self.DEFAULT_VIDEO_MAX_PIXELS
        proc = self._make_processor(max_frames=10, image_max_pixels=shared_pixels, video_max_pixels=shared_pixels)
        img_tokens = proc.get_max_image_tokens(seq_len=1000000)
        vid_tokens = proc.get_max_video_tokens(seq_len=1000000)
        self.assertGreater(vid_tokens, img_tokens)


if __name__ == "__main__":
    unittest.main()