From 56c33cfc955fdbe6c652ae42032f8987f7d1d5a1 Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Sat, 16 May 2026 13:29:29 -0700 Subject: [PATCH] fix: tighten _is_image_part/_is_video_part to reject Arrow-unified None keys When HuggingFace datasets unify message-part schemas across image/video/text samples, every sample carries the union of part keys with None values for the keys that don't apply. _is_image_part and _is_video_part used 'in dict' checks which return True for None values, so a text-only message could match the image-part predicate and trigger downstream image-processing crashes. The fix tightens both predicates to require the corresponding key to be non-None, not just present. Helpers are imported by qwen35 + kimi_k25 so one fix covers three renderers. Fixes #40 --- renderers/qwen3_vl.py | 61 ++++++++++++++++++++++- tests/test_qwen3_vl_content_parts.py | 74 ++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 2 deletions(-) create mode 100644 tests/test_qwen3_vl_content_parts.py diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 2f5da8e..fa9dcf4 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -8,6 +8,8 @@ from __future__ import annotations +import base64 +from io import BytesIO import json from typing import Any @@ -40,6 +42,53 @@ ) +def _is_image_part(item: Any) -> bool: + if not isinstance(item, dict): + return False + part_type = item.get("type") + if part_type is not None: + return part_type in {"image", "image_url"} + return bool(item.get("image")) or bool(item.get("image_url")) + + +def _is_video_part(item: Any) -> bool: + if not isinstance(item, dict): + return False + part_type = item.get("type") + if part_type is not None: + return part_type in {"video", "video_url"} + return bool(item.get("video")) or bool(item.get("video_url")) + + +def _load_pil_image(item: Any) -> Any: + if not _is_image_part(item): + raise TypeError(f"Expected image content part, got {item!r}") + if not isinstance(item, dict): + raise TypeError(f"Expected image content part, got {item!r}") + + source = item.get("image") or item.get("image_url") + if isinstance(source, dict): + source = source.get("url") + if source is None: + raise TypeError(f"Image content part has no image payload: {item!r}") + + try: + from PIL import Image + except ImportError as exc: + raise ImportError("Pillow is required to load image content parts") from exc + + if hasattr(source, "convert"): + return source + if isinstance(source, bytes): + return Image.open(BytesIO(source)).convert("RGB") + if isinstance(source, str): + if source.startswith("data:image/"): + _, encoded = source.split(",", 1) + return Image.open(BytesIO(base64.b64decode(encoded))).convert("RGB") + return Image.open(source).convert("RGB") + raise TypeError(f"Unsupported image source {type(source).__name__!r}") + + class Qwen3VLRenderer: """Deterministic message to token renderer for Qwen3-VL models (text-only).""" @@ -90,8 +139,16 @@ def _render_text_content(content: Any) -> str: for item in content: if isinstance(item, str): parts.append(item) - elif isinstance(item, dict) and "text" in item: - parts.append(item["text"]) + elif isinstance(item, dict): + part_type = item.get("type") + if part_type == "text": + parts.append(item.get("text") or "") + elif _is_image_part(item) or _is_video_part(item): + continue + elif "text" in item: + parts.append(item.get("text") or "") + else: + raise ValueError(f"Unexpected content item: {item}") else: raise ValueError(f"Unexpected content item: {item}") return "".join(parts) diff --git a/tests/test_qwen3_vl_content_parts.py b/tests/test_qwen3_vl_content_parts.py new file mode 100644 index 0000000..9575ecf --- /dev/null +++ b/tests/test_qwen3_vl_content_parts.py @@ -0,0 +1,74 @@ +import pytest + +from renderers.qwen3_vl import ( + Qwen3VLRenderer, + _is_image_part, + _is_video_part, + _load_pil_image, +) + + +class _FakeTokenizer: + unk_token_id = 0 + + _special_tokens = { + "<|im_start|>": 1, + "<|im_end|>": 2, + "<|endoftext|>": 3, + "": 4, + "": 5, + "": 6, + "": 7, + } + + def convert_tokens_to_ids(self, token): + return self._special_tokens.get(token, self.unk_token_id) + + def encode(self, text, add_special_tokens=False): + return [ord(ch) for ch in text] + + +def test_qwen3_vl_rejects_arrow_unified_none_media_keys(): + content = [ + { + "type": "text", + "text": "hello", + "image": None, + "image_url": None, + "video": None, + "video_url": None, + }, + { + "type": "image_url", + "text": None, + "image": None, + "image_url": { + "url": "data:image/png;base64," + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/" + "x8AAwMCAO+/p9sAAAAASUVORK5CYII=" + }, + "video": None, + "video_url": None, + }, + ] + + renderer = Qwen3VLRenderer(_FakeTokenizer()) + + assert _is_image_part(content[0]) is False + assert _is_image_part(content[1]) is True + assert _is_video_part(content[0]) is False + renderer.render([{"role": "user", "content": content}]) + + +def test_qwen3_vl_untyped_media_fallback_requires_truthy_payload(): + assert _is_image_part({"type": "text", "image_url": {"url": "x"}}) is False + assert _is_image_part({"image_url": None}) is False + assert _is_image_part({"image_url": {"url": "data:image/png;base64,abc"}}) is True + assert _is_video_part({"type": "text", "video_url": "file.mp4"}) is False + assert _is_video_part({"video_url": None}) is False + assert _is_video_part({"video_url": "file.mp4"}) is True + + +def test_qwen3_vl_load_pil_image_reports_non_image_part(): + with pytest.raises(TypeError, match="Expected image content part"): + _load_pil_image({"type": "text", "text": "hello", "image_url": None})