FIX: Treat OpenAI invalid_prompt safety blocks as content filter errors (#1463)

rlundeen2 · web-flow · commit 351d79046a2f · 2026-03-13T13:49:03.000-07:00
diff --git a/pyrit/prompt_target/openai/openai_error_handling.py b/pyrit/prompt_target/openai/openai_error_handling.py
@@ -77,6 +77,13 @@ def _is_content_filter_error(data: Union[dict[str, object], str]) -> bool:
         code = error_obj.get("code") if isinstance(error_obj, dict) else None
         if code in ["content_filter", "moderation_blocked"]:
             return True
+        # OpenAI uses "invalid_prompt" for model-level safety blocks (e.g. CBRN topics).
+        # Only treat it as a content filter when the message indicates a safety block,
+        # not for other invalid_prompt reasons (e.g. malformed schemas).
+        if code == "invalid_prompt":
+            message = error_obj.get("message", "") if isinstance(error_obj, dict) else ""
+            if "limited access" in str(message).lower() or "safety" in str(message).lower():
+                return True
         # Heuristic: Azure sometimes uses other codes with policy-related content
         return "content_filter" in json.dumps(data).lower()
     # String-based heuristic search
diff --git a/tests/unit/target/test_openai_error_handling.py b/tests/unit/target/test_openai_error_handling.py
@@ -20,6 +20,23 @@ def test_is_content_filter_error_with_string():
     assert _is_content_filter_error(error_str) is True
 
 
+def test_is_content_filter_error_invalid_prompt_safety_block():
+    """Test detection with invalid_prompt code and safety-related message (CBRN block)"""
+    data = {
+        "error": {
+            "code": "invalid_prompt",
+            "message": "Invalid prompt: we've limited access to this content for safety reasons.",
+        }
+    }
+    assert _is_content_filter_error(data) is True
+
+
+def test_is_content_filter_error_invalid_prompt_non_safety():
+    """Test that invalid_prompt without a safety message is NOT treated as a content filter"""
+    data = {"error": {"code": "invalid_prompt", "message": "Invalid prompt: schema validation failed."}}
+    assert _is_content_filter_error(data) is False
+
+
 def test_is_content_filter_error_no_filter():
     """Test detection returns False when no content_filter"""
     error_dict = {"error": {"code": "rate_limit", "message": "Too many requests"}}