Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions fastdeploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ def _post_init(self):
self.override_name_from_config()
self.read_from_env()
self.read_model_config()
self.causal = not self.is_bidirectional

@property
def registry(self):
Expand Down Expand Up @@ -616,6 +617,17 @@ def _get_download_model(self, model_name, model_type="default"):
# TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
pass

@property
def is_bidirectional(self) -> bool:
    """Whether the model uses bidirectional (non-causal) attention.

    An explicit ``is_causal`` entry in the model's config.json takes
    precedence; otherwise EB5 (ERNIE5) architectures are treated as
    bidirectional by default.

    Returns:
        bool: True when the model uses a bidirectional attention mask.
    """
    # Honor an explicit `is_causal` flag from config.json first.
    if hasattr(self, "is_causal"):
        return not bool(self.is_causal)
    # Fall back to architecture detection: ERNIE5 models are bidirectional.
    return ErnieArchitectures.is_ernie5_arch(getattr(self, "architectures", []))

def print(self):
"""
Print all configuration information.
Expand Down Expand Up @@ -2176,8 +2188,8 @@ def postprocess(self):
# It will hang when real batch_size < tp_size
self.graph_opt_config.filter_capture_size(tp_size=self.parallel_config.tensor_parallel_size)

if ErnieArchitectures.is_ernie5_arch(self.model_config.architectures):
# ernie5 model not support chunked_mm_input
if self.model_config.is_bidirectional:
# bidirectional mask models (e.g. EB5) do not support chunked_mm_input
self.cache_config.disable_chunked_mm_input = True

self.postprocess_devices_and_ports()
Expand Down
4 changes: 3 additions & 1 deletion fastdeploy/input/ernie4_5_vl_processor/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,9 @@ def get_max_video_tokens(self, seq_len: int) -> int:
min_pixels=self.video_min_pixels,
max_pixels=self.video_max_pixels,
)[1]
num_video_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
num_video_tokens = (self.max_frames * patches_h * patches_w) // (
self.spatial_conv_size**2 * self.temporal_conv_size
)
return min(num_video_tokens, seq_len)

def get_mm_max_tokens_per_item(
Expand Down
98 changes: 98 additions & 0 deletions fastdeploy/input/qwen3_vl_processor/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# limitations under the License.
"""

import math
import pickle
from typing import Any, Dict, List, Optional, Tuple, Union

Expand Down Expand Up @@ -143,6 +144,103 @@ def calc_one(thw):

return calc_one(grid_thw)

@staticmethod
def _closest_factor_pair(n: int):
"""Return (small, large) factor pair of n closest to a square.

Mirrors vllm's ``closest_factor_pair`` in Qwen2VLProcessingInfo.
"""
for d in range(math.isqrt(n), 0, -1):
if n % d == 0:
return d, n // d
return 1, n

@staticmethod
def _max_tokens_for_pixels(max_pixels: int, patch_size: int, merge_size: int) -> int:
"""Compute the maximum post-merge token count achievable under *max_pixels*.

Aligns with vllm's ``get_image_size_with_most_features``.
See qwen_vl_processor/process.py for full description.
"""
unit = patch_size * merge_size
max_seq_len = max_pixels // (unit * unit)
for n in range(max_seq_len, 0, -1):
h, w = DataProcessor._closest_factor_pair(n)
if w / h <= 200:
return n
return 1

def get_max_image_tokens(self, seq_len: int = None) -> int:
    """Maximum token count a single image can contribute.

    Same algorithm as vllm's ``Qwen2VLProcessingInfo.get_max_image_tokens``.

    Args:
        seq_len: Optional upper cap (typically the model's max_model_len).

    Returns:
        Maximum number of image tokens per item.
    """
    proc = self.image_processor
    budget = self._max_tokens_for_pixels(proc.max_pixels, proc.patch_size, proc.merge_size)
    return budget if seq_len is None else min(budget, seq_len)

def get_max_video_tokens(self, seq_len: int = None) -> int:
    """Maximum token count a single video item can contribute.

    For Qwen3-VL, video frames are bounded by the module-level
    ``VIDEO_MAX_PIXELS`` budget rather than the image processor's
    ``max_pixels``. Temporal padding follows the processor / vllm.

    Args:
        seq_len: Optional upper cap (typically the model's max_model_len).

    Returns:
        Maximum number of video tokens per item.
    """
    proc = self.image_processor
    per_group = self._max_tokens_for_pixels(VIDEO_MAX_PIXELS, proc.patch_size, proc.merge_size)
    # vllm padding formula: padded = frames + frames % temporal_patch_size.
    # NOTE(review): this only rounds up to the next multiple when
    # temporal_patch_size == 2 — confirm against the processor.
    t = proc.temporal_patch_size
    grid_t = max((self.max_frames + self.max_frames % t) // t, 1)
    total = grid_t * per_group
    return total if seq_len is None else min(total, seq_len)

def get_mm_max_tokens_per_item(self, seq_len: int = None):
    """Per-modality maximum token counts.

    Matches vllm's ``get_mm_max_tokens_per_item`` interface so encoder
    budgets can be computed without running dummy inputs.

    Args:
        seq_len: Model's maximum sequence length (used as an upper cap).

    Returns:
        Dict mapping modality name ("image" / "video") to max tokens.
    """
    budgets = {}
    budgets["image"] = self.get_max_image_tokens(seq_len)
    budgets["video"] = self.get_max_video_tokens(seq_len)
    return budgets

def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
"""
Convert text with image/video placeholders into model inputs.
Expand Down
15 changes: 15 additions & 0 deletions fastdeploy/input/qwen3_vl_processor/qwen3_vl_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,3 +315,18 @@ def pack_outputs(self, outputs):
outputs["mm_num_token_func"] = self.processor.mm_num_tokens

return outputs

def get_mm_max_tokens_per_item(self, seq_len: int = None):
    """Expose the inner DataProcessor's per-modality token maxima.

    Lets upper layers (common_engine, scheduler) size encoder budgets
    without pushing dummy inputs through the model.

    Args:
        seq_len: Model's maximum sequence length (optional cap).

    Returns:
        Dict[str, int] keyed by modality, e.g. ``{"image": 1280, "video": 8192}``.
    """
    data_processor = self.processor
    return data_processor.get_mm_max_tokens_per_item(seq_len)
108 changes: 108 additions & 0 deletions fastdeploy/input/qwen_vl_processor/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,114 @@ def calc_one(thw):

return calc_one(grid_thw)

@staticmethod
def _closest_factor_pair(n: int):
"""Return (small, large) factor pair of n closest to a square.

Mirrors vllm's ``closest_factor_pair`` in Qwen2VLProcessingInfo.
"""
import math

This comment was marked as outdated.


for d in range(math.isqrt(n), 0, -1):
if n % d == 0:
return d, n // d
return 1, n

@staticmethod
def _max_tokens_for_pixels(max_pixels: int, patch_size: int, merge_size: int) -> int:
"""Compute the maximum post-merge token count achievable under *max_pixels*.

Aligns with vllm's ``get_image_size_with_most_features``:
1. ``max_seq_len = max_pixels // unit^2`` where ``unit = patch * merge``
is the number of *merged* tokens that can fit.
2. Find the largest ``seq_len <= max_seq_len`` whose factor pair has
aspect ratio <= 200 (the Qwen2-VL processor rejects extreme ratios).
3. Token count = ``height_factor * width_factor`` = ``seq_len``.

Using ``closest_factor_pair`` guarantees we never undercount when
``max_pixels`` is not a perfect square of ``unit``.
"""
unit = patch_size * merge_size
max_seq_len = max_pixels // (unit * unit)
for n in range(max_seq_len, 0, -1):
h, w = DataProcessor._closest_factor_pair(n)
if w / h <= 200:
return n
return 1

def get_max_image_tokens(self, seq_len: int = None) -> int:
    """Maximum token count a single image can contribute.

    Uses the same algorithm as vllm's ``get_max_image_tokens`` in
    ``Qwen2VLProcessingInfo``: derive the token budget from ``max_pixels``
    and settle on the best non-extreme aspect ratio.

    Args:
        seq_len: Optional upper cap (model's max_model_len).

    Returns:
        Maximum number of image tokens per item.
    """
    ip = self.image_processor
    budget = self._max_tokens_for_pixels(ip.max_pixels, ip.patch_size, ip.merge_size)
    if seq_len is None:
        return budget
    return min(budget, seq_len)

def get_max_video_tokens(self, seq_len: int = None) -> int:
    """Maximum token count a single video item can contribute.

    Mirrors vllm's ``get_max_video_tokens``: spatial budget identical to
    the image path, and frames padded with vllm's formula
    ``frames + frames % temporal_patch_size`` before dividing into
    temporal groups.

    Args:
        seq_len: Optional upper cap (model's max_model_len).

    Returns:
        Maximum number of video tokens per item.
    """
    ip = self.image_processor
    spatial = self._max_tokens_for_pixels(ip.max_pixels, ip.patch_size, ip.merge_size)
    # NOTE(review): the vllm padding formula only rounds up to the next
    # multiple when temporal_patch_size == 2 — confirm for other sizes.
    t = ip.temporal_patch_size
    padded = self.max_frames + self.max_frames % t
    total = max(padded // t, 1) * spatial
    if seq_len is not None:
        total = min(total, seq_len)
    return total

def get_mm_max_tokens_per_item(self, seq_len: int = None):
    """Per-modality maximum token counts (image / video).

    Same interface as vllm's ``get_mm_max_tokens_per_item`` so that
    FastDeploy can size encoder budgets without running dummy inputs.

    Args:
        seq_len: Model's maximum sequence length (used as an upper cap).

    Returns:
        Dict mapping modality name to its max token count.
    """
    return dict(
        image=self.get_max_image_tokens(seq_len),
        video=self.get_max_video_tokens(seq_len),
    )

def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
"""
Convert text with image/video placeholders into model inputs.
Expand Down
15 changes: 15 additions & 0 deletions fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,3 +315,18 @@ def pack_outputs(self, outputs):

outputs["mm_num_token_func"] = self.processor.mm_num_tokens
return outputs

def get_mm_max_tokens_per_item(self, seq_len: int = None):
    """Delegate per-modality token maxima to the inner DataProcessor.

    Allows upper layers (common_engine, scheduler) to compute encoder
    budgets without running dummy inputs through the model.

    Args:
        seq_len: Model's maximum sequence length (optional cap).

    Returns:
        Dict[str, int], e.g. ``{"image": 1280, "video": 8192}``.
    """
    result = self.processor.get_mm_max_tokens_per_item(seq_len)
    return result
1 change: 0 additions & 1 deletion fastdeploy/scheduler/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,6 @@ def __init__(self, args):
"""
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `max_extra_num_batched_tokens` attribute was removed, but the test code in tests/scheduler/test_scheduler_config.py still uses it (lines 138 and 157), which will cause those tests to fail.

Please update the test code in the same change — remove or replace the affected assertions.

self.name = "local" # "local" for LocalScheduler or "global" for GlobalScheduler
self.max_num_batched_tokens = 2048 # base token_num for text inputs
self.max_extra_num_batched_tokens = 16384 # extra token_num for multimodal inputs
self.max_num_seqs = 34

This comment was marked as outdated.

self.splitwise_role = "mixed"
self.enable_overlap_schedule = False
Expand Down
3 changes: 2 additions & 1 deletion fastdeploy/worker/input_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,7 @@ def __init__(self, fd_config: FDConfig, target_model_input_batch: InputBatch) ->
self.cache_config: CacheConfig = fd_config.cache_config
self.speculative_config: SpeculativeConfig = fd_config.speculative_config
self.enable_pd_reorder: bool = False
self.max_chunk_tokens = fd_config.get_max_chunk_tokens(self.model_config.mm_max_tokens_per_item)

def init_share_inputs(self):
# share with targe model
Expand Down Expand Up @@ -752,7 +753,7 @@ def init_share_inputs(self):

self.target_hidden_states = paddle.full(
[
self.scheduler_config.max_num_batched_tokens + self.scheduler_config.max_extra_num_batched_tokens,
self.max_chunk_tokens,
self.model_config.hidden_size,
],
0,
Expand Down
Loading
Loading