Skip to content

Commit 351d790

Browse files
authored
FIX: Adding openai invalid_prompt safety blocks as content filters (#1463)
1 parent 8ad0d2b commit 351d790

2 files changed

Lines changed: 24 additions & 0 deletions

File tree

pyrit/prompt_target/openai/openai_error_handling.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,13 @@ def _is_content_filter_error(data: Union[dict[str, object], str]) -> bool:
7777
code = error_obj.get("code") if isinstance(error_obj, dict) else None
7878
if code in ["content_filter", "moderation_blocked"]:
7979
return True
80+
# OpenAI uses "invalid_prompt" for model-level safety blocks (e.g. CBRN topics).
81+
# Only treat it as a content filter when the message indicates a safety block,
82+
# not for other invalid_prompt reasons (e.g. malformed schemas).
83+
if code == "invalid_prompt":
84+
message = error_obj.get("message", "") if isinstance(error_obj, dict) else ""
85+
if "limited access" in str(message).lower() or "safety" in str(message).lower():
86+
return True
8087
# Heuristic: Azure sometimes uses other codes with policy-related content
8188
return "content_filter" in json.dumps(data).lower()
8289
# String-based heuristic search

tests/unit/target/test_openai_error_handling.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,23 @@ def test_is_content_filter_error_with_string():
2020
assert _is_content_filter_error(error_str) is True
2121

2222

23+
def test_is_content_filter_error_invalid_prompt_safety_block():
24+
"""Test detection with invalid_prompt code and safety-related message (CBRN block)"""
25+
data = {
26+
"error": {
27+
"code": "invalid_prompt",
28+
"message": "Invalid prompt: we've limited access to this content for safety reasons.",
29+
}
30+
}
31+
assert _is_content_filter_error(data) is True
32+
33+
34+
def test_is_content_filter_error_invalid_prompt_non_safety():
35+
"""Test that invalid_prompt without a safety message is NOT treated as a content filter"""
36+
data = {"error": {"code": "invalid_prompt", "message": "Invalid prompt: schema validation failed."}}
37+
assert _is_content_filter_error(data) is False
38+
39+
2340
def test_is_content_filter_error_no_filter():
2441
"""Test detection returns False when no content_filter"""
2542
error_dict = {"error": {"code": "rate_limit", "message": "Too many requests"}}

0 commit comments

Comments
 (0)