From 56c33cfc955fdbe6c652ae42032f8987f7d1d5a1 Mon Sep 17 00:00:00 2001
From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com>
Date: Sat, 16 May 2026 13:29:29 -0700
Subject: [PATCH] fix: tighten _is_image_part/_is_video_part to reject
Arrow-unified None keys
When HuggingFace datasets unify message-part schemas across image/video/text
samples, every sample carries the union of part keys with None values for
the keys that don't apply. _is_image_part and _is_video_part used 'in dict'
checks which return True for None values, so a text-only message could match
the image-part predicate and trigger downstream image-processing crashes.
The fix tightens both predicates to require the corresponding key to be
non-None, not just present. Helpers are imported by qwen35 + kimi_k25 so one
fix covers three renderers.
Fixes #40
---
renderers/qwen3_vl.py | 61 ++++++++++++++++++++++-
tests/test_qwen3_vl_content_parts.py | 74 ++++++++++++++++++++++++++++
2 files changed, 133 insertions(+), 2 deletions(-)
create mode 100644 tests/test_qwen3_vl_content_parts.py
diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py
index 2f5da8e..fa9dcf4 100644
--- a/renderers/qwen3_vl.py
+++ b/renderers/qwen3_vl.py
@@ -8,6 +8,8 @@
from __future__ import annotations
+import base64
+from io import BytesIO
import json
from typing import Any
@@ -40,6 +42,53 @@
)
+def _is_image_part(item: Any) -> bool:
+ if not isinstance(item, dict):
+ return False
+ part_type = item.get("type")
+ if part_type is not None:
+ return part_type in {"image", "image_url"}
+ return bool(item.get("image")) or bool(item.get("image_url"))
+
+
+def _is_video_part(item: Any) -> bool:
+ if not isinstance(item, dict):
+ return False
+ part_type = item.get("type")
+ if part_type is not None:
+ return part_type in {"video", "video_url"}
+ return bool(item.get("video")) or bool(item.get("video_url"))
+
+
+def _load_pil_image(item: Any) -> Any:
+ if not _is_image_part(item):
+ raise TypeError(f"Expected image content part, got {item!r}")
+ if not isinstance(item, dict):
+ raise TypeError(f"Expected image content part, got {item!r}")
+
+ source = item.get("image") or item.get("image_url")
+ if isinstance(source, dict):
+ source = source.get("url")
+ if source is None:
+ raise TypeError(f"Image content part has no image payload: {item!r}")
+
+ try:
+ from PIL import Image
+ except ImportError as exc:
+ raise ImportError("Pillow is required to load image content parts") from exc
+
+ if hasattr(source, "convert"):
+ return source
+ if isinstance(source, bytes):
+ return Image.open(BytesIO(source)).convert("RGB")
+ if isinstance(source, str):
+ if source.startswith("data:image/"):
+ _, encoded = source.split(",", 1)
+ return Image.open(BytesIO(base64.b64decode(encoded))).convert("RGB")
+ return Image.open(source).convert("RGB")
+ raise TypeError(f"Unsupported image source {type(source).__name__!r}")
+
+
class Qwen3VLRenderer:
"""Deterministic message to token renderer for Qwen3-VL models (text-only)."""
@@ -90,8 +139,16 @@ def _render_text_content(content: Any) -> str:
for item in content:
if isinstance(item, str):
parts.append(item)
- elif isinstance(item, dict) and "text" in item:
- parts.append(item["text"])
+ elif isinstance(item, dict):
+ part_type = item.get("type")
+ if part_type == "text":
+ parts.append(item.get("text") or "")
+ elif _is_image_part(item) or _is_video_part(item):
+ continue
+ elif "text" in item:
+ parts.append(item.get("text") or "")
+ else:
+ raise ValueError(f"Unexpected content item: {item}")
else:
raise ValueError(f"Unexpected content item: {item}")
return "".join(parts)
diff --git a/tests/test_qwen3_vl_content_parts.py b/tests/test_qwen3_vl_content_parts.py
new file mode 100644
index 0000000..9575ecf
--- /dev/null
+++ b/tests/test_qwen3_vl_content_parts.py
@@ -0,0 +1,74 @@
+import pytest
+
+from renderers.qwen3_vl import (
+ Qwen3VLRenderer,
+ _is_image_part,
+ _is_video_part,
+ _load_pil_image,
+)
+
+
+class _FakeTokenizer:
+ unk_token_id = 0
+
+ _special_tokens = {
+ "<|im_start|>": 1,
+ "<|im_end|>": 2,
+ "<|endoftext|>": 3,
+ "": 4,
+ "": 5,
+ "": 6,
+ "": 7,
+ }
+
+ def convert_tokens_to_ids(self, token):
+ return self._special_tokens.get(token, self.unk_token_id)
+
+ def encode(self, text, add_special_tokens=False):
+ return [ord(ch) for ch in text]
+
+
+def test_qwen3_vl_rejects_arrow_unified_none_media_keys():
+ content = [
+ {
+ "type": "text",
+ "text": "hello",
+ "image": None,
+ "image_url": None,
+ "video": None,
+ "video_url": None,
+ },
+ {
+ "type": "image_url",
+ "text": None,
+ "image": None,
+ "image_url": {
+ "url": "data:image/png;base64,"
+ "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/"
+ "x8AAwMCAO+/p9sAAAAASUVORK5CYII="
+ },
+ "video": None,
+ "video_url": None,
+ },
+ ]
+
+ renderer = Qwen3VLRenderer(_FakeTokenizer())
+
+ assert _is_image_part(content[0]) is False
+ assert _is_image_part(content[1]) is True
+ assert _is_video_part(content[0]) is False
+ renderer.render([{"role": "user", "content": content}])
+
+
+def test_qwen3_vl_untyped_media_fallback_requires_truthy_payload():
+ assert _is_image_part({"type": "text", "image_url": {"url": "x"}}) is False
+ assert _is_image_part({"image_url": None}) is False
+ assert _is_image_part({"image_url": {"url": "data:image/png;base64,abc"}}) is True
+ assert _is_video_part({"type": "text", "video_url": "file.mp4"}) is False
+ assert _is_video_part({"video_url": None}) is False
+ assert _is_video_part({"video_url": "file.mp4"}) is True
+
+
+def test_qwen3_vl_load_pil_image_reports_non_image_part():
+ with pytest.raises(TypeError, match="Expected image content part"):
+ _load_pil_image({"type": "text", "text": "hello", "image_url": None})