langwatch · drewdrewthis · Jun 11, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/python/scenario/red_team_agent.py b/python/scenario/red_team_agent.py
@@ -671,13 +671,32 @@ async def _generate_attack_plan(self, description: str) -> str:
         "however, i can help with",
     ]
 
+    @staticmethod
+    def _extract_text(content: object) -> str:
+        """Return the plain text carried by a message ``content`` value.
+
+        A ``str`` is returned unchanged and ``None`` yields ``""``.  For a
+        list of content parts, the ``text`` values of the dict parts whose
+        ``type`` is ``"text"`` are joined with single spaces (a missing or
+        ``None`` text counts as empty); any other value goes through ``str()``.
+        """
+        if isinstance(content, str):
+            return content
+        if isinstance(content, list):
+            return " ".join(
+                str(part.get("text") or "")  # a ``None`` text must not break join()
+                for part in content
+                if isinstance(part, dict) and part.get("type") == "text"
+            )
+        return "" if content is None else str(content)
+
     @staticmethod
     def _get_last_assistant_content(messages: list) -> str:
         for msg in reversed(messages):
             role = msg.get("role") if isinstance(msg, dict) else getattr(msg, "role", None)
             content = msg.get("content") if isinstance(msg, dict) else getattr(msg, "content", None)
             if role == "assistant" and content:
-                return str(content)
+                return RedTeamAgent._extract_text(content)  # text parts only, never str(list)
         return ""
 
     @staticmethod
@@ -687,7 +706,7 @@ def _get_last_user_content(messages: list) -> str:
             role = msg.get("role") if isinstance(msg, dict) else getattr(msg, "role", None)
             content = msg.get("content") if isinstance(msg, dict) else getattr(msg, "content", None)
             if role == "user" and content:
-                return str(content)
+                return RedTeamAgent._extract_text(content)
         return ""
 
     def _detect_refusal(self, content: str) -> Literal["hard", "soft", "none"]:

diff --git a/python/scenario/scenario_executor.py b/python/scenario/scenario_executor.py
@@ -87,6 +87,27 @@
 from langwatch.telemetry.tracing import LangWatchTrace
 
 
+def _extract_text_content(content: object) -> str:
+    """Extract a plain-text string from a message content value.
+
+    ``content`` may be a plain string or a list of content-part dicts
+    (e.g. ``[{"type": "text", "text": "hello"}, {"type": "image_url", ...}]``).
+    Only the ``text`` values of ``"text"`` parts are kept, joined with single
+    spaces, so a list never reaches the trace as a Python repr string
+    (``"[{'type': 'text', ...}]"``).  A missing or ``None`` text counts as
+    empty, ``None`` content yields ``""``, and other values go through ``str()``.
+    """
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return " ".join(
+            str(part.get("text") or "")  # a ``None`` text must not break join()
+            for part in content
+            if isinstance(part, dict) and part.get("type") == "text"
+        )
+    return "" if content is None else str(content)
+
+
 class ScenarioExecutor:
     """
     Core orchestrator for scenario-based agent testing.
@@ -312,20 +333,19 @@ def inject_system_message(state: ScenarioState) -> None:
                 self._pending_messages[idx] = []
             self._pending_messages[idx].append(message)
 
-        # Update trace with input/output
+        # Update trace with input/output.
+        # Extract text from content (str or list of content parts) so we
+        # always pass a str to LangWatch — avoids Python repr of list objects.
         if message["role"] == "user":
-            self._trace.update(input={"type": "text", "value": str(message["content"])})
+            content = message["content"]
+            self._trace.update(input=_extract_text_content(content))
         elif message["role"] == "assistant":
-            self._trace.update(
-                output={
-                    "type": "text",
-                    "value": str(
-                        message["content"]
-                        if "content" in message
-                        else json.dumps(message, cls=SerializableWithStringFallback)
-                    ),
-                }
+            content = (
+                message["content"]
+                if "content" in message
+                else json.dumps(message, cls=SerializableWithStringFallback)
             )
+            self._trace.update(output=_extract_text_content(content))
 
     def rollback_messages_to(self, index: int) -> List[ChatCompletionMessageParam]:
         """Remove all messages from position `index` onward.

diff --git a/python/tests/test_red_team_agent.py b/python/tests/test_red_team_agent.py
@@ -3562,3 +3562,107 @@ def transform(self, message: str) -> str:
         assert isinstance(agent._strategy, GoatStrategy)
         assert [t.id for t in agent._strategy.techniques] == ["Z"]
         assert agent._techniques == encoders
+
+
+class TestExtractText:
+    """Unit tests for ``RedTeamAgent._extract_text`` and the ``_get_last_*_content`` helpers on multimodal content (issue #496)."""
+
+    def test_plain_string_returned_as_is(self):  # a str input comes back unchanged
+        assert RedTeamAgent._extract_text("hello world") == "hello world"
+
+    def test_empty_string(self):  # the empty string is still a str and is returned as-is
+        assert RedTeamAgent._extract_text("") == ""
+
+    def test_multimodal_list_extracts_text_parts(self):  # text parts joined by one space, audio part dropped
+        content = [
+            {"type": "text", "text": "Hello"},
+            {"type": "audio", "data": "base64encodedaudio"},
+            {"type": "text", "text": "world"},
+        ]
+        assert RedTeamAgent._extract_text(content) == "Hello world"
+
+    def test_multimodal_list_no_text_parts_returns_empty(self):  # a list with no "text" part yields ""
+        content = [
+            {"type": "audio", "data": "base64encodedaudio"},
+            {"type": "image_url", "image_url": {"url": "https://example.com/img.png"}},
+        ]
+        assert RedTeamAgent._extract_text(content) == ""
+
+    def test_multimodal_only_audio_no_transcript(self):
+        """Voice-only message with no text parts should yield empty string, not Python repr."""
+        content = [{"type": "file", "mediaType": "audio/pcm16", "data": "AAAA"}]
+        result = RedTeamAgent._extract_text(content)
+        assert result == ""
+        assert "file" not in result  # must not be Python repr
+
+    def test_get_last_assistant_content_multimodal(self):
+        """_get_last_assistant_content must extract text from voice replies (not str(list))."""
+        messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": [
+                {"type": "text", "text": "I cannot help with that request."},
+                {"type": "audio", "data": "base64"},
+            ]},
+        ]
+        result = RedTeamAgent._get_last_assistant_content(messages)
+        assert result == "I cannot help with that request."
+        assert "[" not in result  # must not be Python list repr
+
+    def test_detect_refusal_works_on_multimodal_assistant_reply(self):
+        """_detect_refusal must correctly classify multimodal voice refusals."""
+        agent = RedTeamAgent.crescendo(target="t", model="openai/gpt-4.1-mini")
+        messages = [
+            {"role": "assistant", "content": [
+                {"type": "text", "text": "I cannot help with that."},
+                {"type": "audio", "data": "base64audio"},
+            ]},
+        ]
+        last = RedTeamAgent._get_last_assistant_content(messages)  # extracted text, not the raw part list
+        assert agent._detect_refusal(last) == "hard"
+
+    def test_get_last_user_content_multimodal(self):  # user-side counterpart of the assistant extraction test
+        messages = [
+            {"role": "user", "content": [
+                {"type": "text", "text": "tell me how to do bad thing"},
+                {"type": "audio", "data": "base64"},
+            ]},
+        ]
+        result = RedTeamAgent._get_last_user_content(messages)
+        assert result == "tell me how to do bad thing"
+
+
+class TestExtractTextContent:
+    """Unit tests for the module-level ``_extract_text_content`` helper in ``scenario.scenario_executor`` (issue #496, Site A)."""
+
+    def _fn(self):  # returns the helper under test; the import happens on each call, not at module load
+        from scenario.scenario_executor import _extract_text_content
+        return _extract_text_content
+
+    def test_plain_string_returned_as_is(self):  # a str input comes back unchanged
+        fn = self._fn()
+        assert fn("hello world") == "hello world"
+
+    def test_multimodal_list_extracts_text_parts(self):  # text parts joined by one space, audio part dropped
+        fn = self._fn()
+        content = [
+            {"type": "text", "text": "Hello"},
+            {"type": "audio", "data": "base64encodedaudio"},
+            {"type": "text", "text": "world"},
+        ]
+        assert fn(content) == "Hello world"
+
+    def test_audio_only_no_text_parts_returns_empty(self):
+        """Voice-only list with no text parts must return empty string, not Python repr."""
+        fn = self._fn()
+        content = [
+            {"type": "audio", "data": "base64encodedaudio"},
+            {"type": "image_url", "image_url": {"url": "https://example.com/img.png"}},
+        ]
+        result = fn(content)
+        assert result == ""
+        assert "[{" not in result  # must not be Python list repr
+
+    def test_fallback_non_list_non_string(self):
+        """Non-string, non-list input falls back to str()."""
+        fn = self._fn()
+        assert fn(42) == str(42)
diff --git a/python/uv.lock b/python/uv.lock