Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 59 additions & 2 deletions renderers/qwen3_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

from __future__ import annotations

import base64
from io import BytesIO
import json
from typing import Any

Expand Down Expand Up @@ -40,6 +42,53 @@
)


def _is_image_part(item: Any) -> bool:
if not isinstance(item, dict):
return False
part_type = item.get("type")
if part_type is not None:
return part_type in {"image", "image_url"}
return bool(item.get("image")) or bool(item.get("image_url"))


def _is_video_part(item: Any) -> bool:
if not isinstance(item, dict):
return False
part_type = item.get("type")
if part_type is not None:
return part_type in {"video", "video_url"}
return bool(item.get("video")) or bool(item.get("video_url"))


def _load_pil_image(item: Any) -> Any:
    """Load the image referenced by an image content part.

    The payload is taken from ``item["image"]`` or, when that is falsy, from
    ``item["image_url"]``; a dict payload is unwrapped through its ``"url"``
    key.  Supported payloads are:

    * an object exposing ``convert`` (e.g. an already-loaded PIL image),
      which is returned unchanged;
    * ``bytes`` holding an encoded image;
    * a ``data:image/...`` string whose text after the first comma is
      base64-encoded image data;
    * any other string, which is handed to ``PIL.Image.open`` as-is.

    Returns:
        The pre-loaded object as-is, or otherwise a new RGB-converted image.

    Raises:
        TypeError: If *item* is not an image part, has no payload, or the
            payload type is unsupported.
        ImportError: If Pillow is not installed.
        ValueError: If a ``data:image/`` string has no comma separating the
            header from the encoded data.
    """
    # _is_image_part already rejects non-dict values, so no separate
    # isinstance check is needed before calling item.get below.
    if not _is_image_part(item):
        raise TypeError(f"Expected image content part, got {item!r}")

    source = item.get("image") or item.get("image_url")
    if isinstance(source, dict):
        source = source.get("url")
    if source is None:
        raise TypeError(f"Image content part has no image payload: {item!r}")

    # Imported lazily so the module stays importable without Pillow.
    try:
        from PIL import Image
    except ImportError as exc:
        raise ImportError("Pillow is required to load image content parts") from exc

    if hasattr(source, "convert"):
        return source

    if isinstance(source, bytes):
        target = BytesIO(source)
    elif isinstance(source, str):
        if source.startswith("data:image/"):
            _, separator, encoded = source.partition(",")
            if not separator:
                raise ValueError(
                    "Malformed image data URL: missing ',' before the encoded data"
                )
            target = BytesIO(base64.b64decode(encoded))
        else:
            target = source
    else:
        raise TypeError(f"Unsupported image source {type(source).__name__!r}")

    # convert() returns a new image, so the opened one (and any file handle
    # Pillow opened for a path) can be released as soon as it is done.
    with Image.open(target) as opened:
        return opened.convert("RGB")


class Qwen3VLRenderer:
"""Deterministic message to token renderer for Qwen3-VL models (text-only)."""

Expand Down Expand Up @@ -90,8 +139,16 @@ def _render_text_content(content: Any) -> str:
for item in content:
if isinstance(item, str):
parts.append(item)
elif isinstance(item, dict) and "text" in item:
parts.append(item["text"])
elif isinstance(item, dict):
part_type = item.get("type")
if part_type == "text":
parts.append(item.get("text") or "")
elif _is_image_part(item) or _is_video_part(item):
continue
elif "text" in item:
parts.append(item.get("text") or "")
else:
raise ValueError(f"Unexpected content item: {item}")
else:
raise ValueError(f"Unexpected content item: {item}")
return "".join(parts)
Expand Down
74 changes: 74 additions & 0 deletions tests/test_qwen3_vl_content_parts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pytest

from renderers.qwen3_vl import (
Qwen3VLRenderer,
_is_image_part,
_is_video_part,
_load_pil_image,
)


class _FakeTokenizer:
unk_token_id = 0

_special_tokens = {
"<|im_start|>": 1,
"<|im_end|>": 2,
"<|endoftext|>": 3,
"<tool_call>": 4,
"</tool_call>": 5,
"<tool_response>": 6,
"</tool_response>": 7,
}

def convert_tokens_to_ids(self, token):
return self._special_tokens.get(token, self.unk_token_id)

def encode(self, text, add_special_tokens=False):
return [ord(ch) for ch in text]


def test_qwen3_vl_rejects_arrow_unified_none_media_keys():
    """Parts carrying every media key are classified by their ``type``.

    Both parts contain all of ``text``/``image``/``image_url``/``video``/
    ``video_url``, with the unused ones set to ``None``.  The text part must
    not be mistaken for media, the image part must be recognised, and
    rendering the message must not raise.
    """
    text_part = {
        "type": "text",
        "text": "hello",
        "image": None,
        "image_url": None,
        "video": None,
        "video_url": None,
    }
    png_data_url = (
        "data:image/png;base64,"
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/"
        "x8AAwMCAO+/p9sAAAAASUVORK5CYII="
    )
    image_part = {
        "type": "image_url",
        "text": None,
        "image": None,
        "image_url": {"url": png_data_url},
        "video": None,
        "video_url": None,
    }
    content = [text_part, image_part]

    renderer = Qwen3VLRenderer(_FakeTokenizer())

    assert _is_image_part(content[0]) is False
    assert _is_image_part(content[1]) is True
    assert _is_video_part(content[0]) is False
    # Only checks that rendering completes; the output is not inspected.
    message = {"role": "user", "content": content}
    renderer.render([message])


def test_qwen3_vl_untyped_media_fallback_requires_truthy_payload():
    """An explicit ``type`` wins; untyped parts need a truthy media payload."""
    image_cases = (
        ({"type": "text", "image_url": {"url": "x"}}, False),
        ({"image_url": None}, False),
        ({"image_url": {"url": "data:image/png;base64,abc"}}, True),
    )
    for part, expected in image_cases:
        assert _is_image_part(part) is expected

    video_cases = (
        ({"type": "text", "video_url": "file.mp4"}, False),
        ({"video_url": None}, False),
        ({"video_url": "file.mp4"}, True),
    )
    for part, expected in video_cases:
        assert _is_video_part(part) is expected


def test_qwen3_vl_load_pil_image_reports_non_image_part():
    """A text part with a ``None`` ``image_url`` key is rejected as a non-image."""
    not_an_image = {"type": "text", "text": "hello", "image_url": None}
    with pytest.raises(TypeError, match="Expected image content part"):
        _load_pil_image(not_an_image)