PrimeIntellect-ai · mvanhorn · May 16, 2026
diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py
@@ -8,6 +8,8 @@
 
 from __future__ import annotations
 
+import base64
+from io import BytesIO
 import json
 from typing import Any
 
@@ -40,6 +42,53 @@
 )
 
 
+def _is_image_part(item: Any) -> bool:
+    """Return True for an image part: typed image/image_url, or untyped with a payload."""
+    if not isinstance(item, dict):
+        return False
+    kind = item.get("type")
+    untyped_match = kind is None and bool(item.get("image") or item.get("image_url"))
+    return untyped_match or kind in {"image", "image_url"}
+
+
+def _is_video_part(item: Any) -> bool:
+    """Return True for a video part: typed video/video_url, or untyped with a payload."""
+    if not isinstance(item, dict):
+        return False
+    kind = item.get("type")
+    untyped_match = kind is None and bool(item.get("video") or item.get("video_url"))
+    return untyped_match or kind in {"video", "video_url"}
+
+
+def _load_pil_image(item: Any) -> Any:
+    """Return the payload of an image content part as an image.
+
+    Image-likes (``convert``) pass through; bytes, data URIs and paths open as RGB.
+    Raises TypeError, ValueError (malformed data URI) or ImportError (no Pillow).
+    """
+    if not _is_image_part(item):  # also rejects non-dict items
+        raise TypeError(f"Expected image content part, got {item!r}")
+    source = item.get("image") or item.get("image_url")
+    if isinstance(source, dict):
+        source = source.get("url")
+    if source is None:
+        raise TypeError(f"Image content part has no image payload: {item!r}")
+    if hasattr(source, "convert"):  # already an image: no Pillow import needed
+        return source
+    if not isinstance(source, (bytes, bytearray, str)):
+        raise TypeError(f"Unsupported image source {type(source).__name__!r}")
+    if isinstance(source, str) and source.startswith("data:image/"):
+        _, comma, encoded = source.partition(",")
+        if not comma:
+            raise ValueError(f"Malformed image data URI (no ',' separator): {source[:32]!r}")
+        source = base64.b64decode(encoded)
+    try:
+        from PIL import Image
+    except ImportError as exc:
+        raise ImportError("Pillow is required to load image content parts") from exc
+    return Image.open(source if isinstance(source, str) else BytesIO(source)).convert("RGB")
+
+
 class Qwen3VLRenderer:
     """Deterministic message to token renderer for Qwen3-VL models (text-only)."""
 
@@ -90,8 +139,16 @@ def _render_text_content(content: Any) -> str:
             for item in content:
                 if isinstance(item, str):
                     parts.append(item)
-                elif isinstance(item, dict) and "text" in item:
-                    parts.append(item["text"])
+                elif isinstance(item, dict):
+                    part_type = item.get("type")
+                    if part_type == "text":
+                        parts.append(item.get("text") or "")
+                    elif _is_image_part(item) or _is_video_part(item):
+                        continue
+                    elif "text" in item:
+                        parts.append(item.get("text") or "")
+                    else:
+                        raise ValueError(f"Unexpected content item: {item}")
                 else:
                     raise ValueError(f"Unexpected content item: {item}")
             return "".join(parts)

diff --git a/tests/test_qwen3_vl_content_parts.py b/tests/test_qwen3_vl_content_parts.py
@@ -0,0 +1,74 @@
+import pytest
+
+from renderers.qwen3_vl import (
+    Qwen3VLRenderer,
+    _is_image_part,
+    _is_video_part,
+    _load_pil_image,
+)
+
+
+class _FakeTokenizer:
+    """Tokenizer stub: fixed ids for special tokens, code points for plain text."""
+
+    unk_token_id = 0
+    _special_tokens = dict(
+        zip(
+            ("<|im_start|>", "<|im_end|>", "<|endoftext|>", "<tool_call>",
+             "</tool_call>", "<tool_response>", "</tool_response>"),
+            range(1, 8),
+        )
+    )
+
+    def convert_tokens_to_ids(self, token):
+        known = self._special_tokens  # anything outside the table maps to unk
+        return known[token] if token in known else self.unk_token_id
+
+    def encode(self, text, add_special_tokens=False):
+        return list(map(ord, text))  # add_special_tokens is accepted but ignored
+
+
+def test_qwen3_vl_rejects_arrow_unified_none_media_keys():
+    """Parts that carry every media key (unused ones None) are classified by type."""
+    png_data_uri = (
+        "data:image/png;base64,"
+        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/"
+        "x8AAwMCAO+/p9sAAAAASUVORK5CYII="
+    )
+    # Every part has the same key set; keys that do not apply are None.
+    text_part = dict(
+        type="text",
+        text="hello",
+        image=None,
+        image_url=None,
+        video=None,
+        video_url=None,
+    )
+    image_part = dict(
+        type="image_url",
+        text=None,
+        image=None,
+        image_url={"url": png_data_uri},
+        video=None,
+        video_url=None,
+    )
+
+    renderer = Qwen3VLRenderer(_FakeTokenizer())
+    assert _is_image_part(text_part) is False
+    assert _is_image_part(image_part) is True
+    assert _is_video_part(text_part) is False
+    renderer.render([{"role": "user", "content": [text_part, image_part]}])
+
+
+def test_qwen3_vl_untyped_media_fallback_requires_truthy_payload():
+    """Untyped parts match only on a truthy payload; an explicit type always wins."""
+    for part in ({"type": "text", "image_url": {"url": "x"}}, {"image_url": None}):
+        assert _is_image_part(part) is False
+    assert _is_image_part({"image_url": {"url": "data:image/png;base64,abc"}}) is True
+    for part in ({"type": "text", "video_url": "file.mp4"}, {"video_url": None}):
+        assert _is_video_part(part) is False
+    assert _is_video_part({"video_url": "file.mp4"}) is True
+
+
+def test_qwen3_vl_load_pil_image_reports_non_image_part():
+    """A text part is rejected even when it carries a (None) image_url key."""
+    text_part = {"type": "text", "text": "hello", "image_url": None}
+    with pytest.raises(TypeError, match="Expected image content part"):
+        _load_pil_image(text_part)