diff --git a/eval_protocol/benchmarks/test_glm_streaming_compliance.py b/eval_protocol/benchmarks/test_glm_streaming_compliance.py
index 26b46a9c..918fdfa9 100644
--- a/eval_protocol/benchmarks/test_glm_streaming_compliance.py
+++ b/eval_protocol/benchmarks/test_glm_streaming_compliance.py
@@ -5,6 +5,8 @@
 import re
 from typing import Any
 
+import pytest
+
 from eval_protocol.models import (
     EvaluateResult,
     EvaluationRow,
@@ -17,9 +19,15 @@
 from eval_protocol.pytest.evaluation_test import evaluation_test
 
 
-DEFAULT_MODEL_ID = "fireworks_ai/accounts/pyroworks/deployedModels/minimax-m2-zmi4qk9f"
+DEFAULT_MODEL_ID = "fireworks_ai/accounts/fireworks/models/glm-4p6"
 DEFAULT_MAX_TOKENS = 10000
 
+# Feature flags from environment variables
+# EP_SUPPORTS_MULTIPLE_TOOL_CALLS: "1" to include multiple tool call tests, "0" to skip
+SUPPORTS_MULTIPLE_TOOL_CALLS = os.getenv("EP_SUPPORTS_MULTIPLE_TOOL_CALLS", "1") == "1"
+# EP_SUPPORTS_REASONING: "1" to include reasoning tests and pass reasoning_effort, "0" to skip reasoning tests
+SUPPORTS_REASONING = os.getenv("EP_SUPPORTS_REASONING", "1") == "1"
+
 
 def _coerce_content_to_str(
     content: str | list[Any] | None,
@@ -509,15 +517,30 @@ def _build_completion_params_from_payload(payload: dict[str, Any]) -> dict[str,
         "model": DEFAULT_MODEL_ID,
         "stream": True,
         "return_reasoning_with_separate_field": True,
-        "reasoning_effort": "none",  # Default: no reasoning unless explicitly requested
+        "raw_output": True,  # Include raw model output for debugging
     }
-    passthrough_keys = {"temperature", "top_p", "max_tokens", "response_format", "reasoning_effort"}
+
+    # Only include reasoning_effort if model supports it
+    if SUPPORTS_REASONING:
+        params["reasoning_effort"] = "none"  # Default: no reasoning unless explicitly requested
+
+    passthrough_keys = {"temperature", "top_p", "max_tokens", "response_format"}
+    # Only passthrough reasoning_effort if model supports it
+    if SUPPORTS_REASONING:
+        passthrough_keys.add("reasoning_effort")
+
     for key in passthrough_keys:
         if key in payload:
             params[key] = payload[key]
     return params
 
 
+def _maybe_add_reasoning_effort(params: dict[str, Any], effort: str = "low") -> dict[str, Any]:
+    """Conditionally add reasoning_effort to params if model supports it."""
+    if SUPPORTS_REASONING:
+        params["reasoning_effort"] = effort
+    return params
+
+
 def _normalize_tool_call(tc: Any) -> tuple[str | None, dict[str, Any] | None]:
     """Convert LiteLLM tool call objects/dicts into (name, arguments dict)."""
 
@@ -688,15 +711,18 @@ def _debug_log_assistant_message(test_name: str, assistant_message: Message | No
 @evaluation_test(
     input_rows=[[STRUCTURED_OUTPUT_ROW]],
     completion_params=[
-        {
-            "model": DEFAULT_MODEL_ID,
-            "stream": True,
-            "temperature": 1.0,
-            "top_p": 1.0,
-            "max_tokens": DEFAULT_MAX_TOKENS,
-            "response_format": STRUCTURED_RESPONSE_FORMAT,
-            "reasoning_effort": "none",  # No reasoning expected for structured output
-        }
+        _maybe_add_reasoning_effort(
+            {
+                "model": DEFAULT_MODEL_ID,
+                "stream": True,
+                "temperature": 1.0,
+                "top_p": 1.0,
+                "max_tokens": DEFAULT_MAX_TOKENS,
+                "response_format": STRUCTURED_RESPONSE_FORMAT,
+                "raw_output": True,  # Include raw model output for debugging
+            },
+            "none",  # No reasoning expected for structured output
+        )
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
@@ -957,14 +983,17 @@ def test_streaming_json_preservation(row: EvaluationRow) -> EvaluationRow:
 @evaluation_test(
     input_rows=[[TOOL_CALL_ROW]],
     completion_params=[
-        {
-            "model": DEFAULT_MODEL_ID,
-            "stream": True,
-            "temperature": 1.0,
- "top_p": 1.0, - "max_tokens": DEFAULT_MAX_TOKENS, - "reasoning_effort": "none", # No reasoning expected for tool calls - } + _maybe_add_reasoning_effort( + { + "model": DEFAULT_MODEL_ID, + "stream": True, + "temperature": 1.0, + "top_p": 1.0, + "max_tokens": DEFAULT_MAX_TOKENS, + "raw_output": True, # Include raw model output for debugging + }, + "none", # No reasoning expected for tool calls + ) ], rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", @@ -1236,6 +1265,10 @@ def test_streaming_tool_complex_arguments(row: EvaluationRow) -> EvaluationRow: _MULTI_TOOL_CALLS_ROW = _build_row_from_payload("multi-tool-calls", MULTI_TOOL_CALLS_PAYLOAD) +@pytest.mark.skipif( + not SUPPORTS_MULTIPLE_TOOL_CALLS, + reason="Model does not support multiple tool calls (EP_SUPPORTS_MULTIPLE_TOOL_CALLS=0)", +) @evaluation_test( input_rows=[[_MULTI_TOOL_CALLS_ROW]], completion_params=[_build_completion_params_from_payload(MULTI_TOOL_CALLS_PAYLOAD)], @@ -1681,6 +1714,10 @@ def test_streaming_tool_parameter_types(row: EvaluationRow) -> EvaluationRow: } +@pytest.mark.skipif( + not SUPPORTS_REASONING, + reason="Model does not support reasoning_effort parameter (EP_SUPPORTS_REASONING=0)", +) @evaluation_test( input_rows=[[REASONING_DISABLED_ROW]], completion_params=[ @@ -1690,6 +1727,7 @@ def test_streaming_tool_parameter_types(row: EvaluationRow) -> EvaluationRow: "max_tokens": DEFAULT_MAX_TOKENS, "temperature": 0.0, "stream": True, + "raw_output": True, # Include raw model output for debugging } ], rollout_processor=SingleTurnRolloutProcessor(), @@ -1791,6 +1829,10 @@ def test_reasoning_effort_none_no_reasoning(row: EvaluationRow) -> EvaluationRow } +@pytest.mark.skipif( + not SUPPORTS_REASONING, + reason="Model does not support reasoning_effort parameter (EP_SUPPORTS_REASONING=0)", +) @evaluation_test( input_rows=[[REASONING_ENABLED_ROW]], completion_params=[ @@ -1800,6 +1842,7 @@ def test_reasoning_effort_none_no_reasoning(row: EvaluationRow) -> EvaluationRow "max_tokens": DEFAULT_MAX_TOKENS, "temperature": 0.0, "stream": True, + "raw_output": True, # Include raw model output for debugging } ], rollout_processor=SingleTurnRolloutProcessor(), @@ -1929,13 +1972,16 @@ def test_reasoning_effort_low_has_reasoning(row: EvaluationRow) -> EvaluationRow @evaluation_test( input_rows=[[TOOLS_WITH_REASONING_ROW]], completion_params=[ - { - "model": DEFAULT_MODEL_ID, # Reasoning-capable model - "reasoning_effort": "low", # Enable reasoning - "max_tokens": DEFAULT_MAX_TOKENS, - "temperature": 0.0, - "stream": True, - } + _maybe_add_reasoning_effort( + { + "model": DEFAULT_MODEL_ID, # Reasoning-capable model + "max_tokens": DEFAULT_MAX_TOKENS, + "temperature": 0.0, + "stream": True, + "raw_output": True, # Include raw model output for debugging + }, + "low", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -1946,7 +1992,7 @@ def test_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow: Verify that streaming works correctly when BOTH tools and reasoning are present. 
     Requirements:
-    - reasoning_content should be present
+    - reasoning_content should be present (if SUPPORTS_REASONING)
     - tool_calls should be present
     - finish_reason should be "tool_calls"
     - No XML tags or reasoning leakage
@@ -1973,12 +2019,6 @@ def test_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow:
             break
 
     metrics = {
-        "reasoning_present": MetricResult(
-            score=1.0 if reasoning_present else 0.0,
-            is_score_valid=True,
-            reason="reasoning_content present" if reasoning_present else "reasoning_content missing",
-            data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]},
-        ),
         "has_tool_calls": MetricResult(
             score=1.0 if has_tool_calls else 0.0,
             is_score_valid=True,
@@ -2000,13 +2040,22 @@ def test_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow:
         ),
     }
 
+    # Only add reasoning_present metric if model supports reasoning
+    if SUPPORTS_REASONING:
+        metrics["reasoning_present"] = MetricResult(
+            score=1.0 if reasoning_present else 0.0,
+            is_score_valid=True,
+            reason="reasoning_content present" if reasoning_present else "reasoning_content missing",
+            data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]},
+        )
+
     finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks(
         metrics, finish_reason, content_str, reasoning_str
     )
 
+    # Build pass criteria - reasoning check is conditional
     all_checks_passed = (
-        reasoning_present
-        and has_tool_calls
+        has_tool_calls
         and finish_reason_tool_calls
         and tool_call_valid
         and finish_reason_present
@@ -2014,10 +2063,13 @@ def test_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow:
         and no_xml_tags
         and no_reasoning_leakage
     )
+    # Only require reasoning if model supports it
+    if SUPPORTS_REASONING:
+        all_checks_passed = all_checks_passed and reasoning_present
 
     # Build detailed failure reason
     failure_reasons = []
-    if not reasoning_present:
+    if SUPPORTS_REASONING and not reasoning_present:
         failure_reasons.append("reasoning_content missing")
     if not has_tool_calls:
         failure_reasons.append("no tool calls")
@@ -2216,15 +2268,18 @@ async def test_streaming_output_consistency(row: EvaluationRow) -> EvaluationRow
 @evaluation_test(
     input_rows=[[STRUCTURED_OUTPUT_ROW]],
     completion_params=[
-        {
-            "model": DEFAULT_MODEL_ID,
-            "stream": False,  # Non-streaming
-            "temperature": 1.0,
-            "top_p": 1.0,
-            "max_tokens": DEFAULT_MAX_TOKENS,
-            "response_format": STRUCTURED_RESPONSE_FORMAT,
-            "reasoning_effort": "none",
-        }
+        _maybe_add_reasoning_effort(
+            {
+                "model": DEFAULT_MODEL_ID,
+                "stream": False,  # Non-streaming
+                "temperature": 1.0,
+                "top_p": 1.0,
+                "max_tokens": DEFAULT_MAX_TOKENS,
+                "response_format": STRUCTURED_RESPONSE_FORMAT,
+                "raw_output": True,  # Include raw model output for debugging
+            },
+            "none",
+        )
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
@@ -2424,14 +2479,17 @@ def test_non_streaming_simple_completion(row: EvaluationRow) -> EvaluationRow:
 @evaluation_test(
     input_rows=[[TOOL_CALL_NON_STREAM_ROW]],
     completion_params=[
-        {
-            "model": DEFAULT_MODEL_ID,
-            "stream": False,  # Non-streaming
-            "temperature": 1.0,
-            "top_p": 1.0,
-            "max_tokens": DEFAULT_MAX_TOKENS,
-            "reasoning_effort": "none",
-        }
+        _maybe_add_reasoning_effort(
+            {
+                "model": DEFAULT_MODEL_ID,
+                "stream": False,  # Non-streaming
+                "temperature": 1.0,
+                "top_p": 1.0,
+                "max_tokens": DEFAULT_MAX_TOKENS,
+                "raw_output": True,  # Include raw model output for debugging
+            },
+            "none",
+        )
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
@@ -2555,6 +2613,10 @@ def test_non_streaming_single_tool_call(row: EvaluationRow) -> EvaluationRow:
 _MULTI_TOOL_CALLS_NON_STREAM_ROW = _build_row_from_payload("multi-tool-calls-non-stream", MULTI_TOOL_CALLS_PAYLOAD)
 
 
+@pytest.mark.skipif(
+    not SUPPORTS_MULTIPLE_TOOL_CALLS,
+    reason="Model does not support multiple tool calls (EP_SUPPORTS_MULTIPLE_TOOL_CALLS=0)",
+)
 @evaluation_test(
     input_rows=[[_MULTI_TOOL_CALLS_NON_STREAM_ROW]],
     completion_params=[
@@ -2649,6 +2711,10 @@ def test_non_streaming_multiple_tool_calls(row: EvaluationRow) -> EvaluationRow:
 }
 
 
+@pytest.mark.skipif(
+    not SUPPORTS_REASONING,
+    reason="Model does not support reasoning_effort parameter (EP_SUPPORTS_REASONING=0)",
+)
 @evaluation_test(
     input_rows=[[REASONING_DISABLED_NON_STREAM_ROW]],
     completion_params=[
@@ -2658,6 +2724,7 @@ def test_non_streaming_multiple_tool_calls(row: EvaluationRow) -> EvaluationRow:
             "max_tokens": DEFAULT_MAX_TOKENS,
             "temperature": 0.0,
             "stream": False,  # Non-streaming
+            "raw_output": True,  # Include raw model output for debugging
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
@@ -2756,6 +2823,10 @@ def test_reasoning_effort_none_no_reasoning_non_stream(row: EvaluationRow) -> Ev
 }
 
 
+@pytest.mark.skipif(
+    not SUPPORTS_REASONING,
+    reason="Model does not support reasoning_effort parameter (EP_SUPPORTS_REASONING=0)",
+)
 @evaluation_test(
     input_rows=[[REASONING_ENABLED_NON_STREAM_ROW]],
     completion_params=[
@@ -2765,6 +2836,7 @@ def test_reasoning_effort_none_no_reasoning_non_stream(row: EvaluationRow) -> Ev
             "max_tokens": DEFAULT_MAX_TOKENS,
             "temperature": 0.0,
             "stream": False,  # Non-streaming
+            "raw_output": True,  # Include raw model output for debugging
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
@@ -2887,13 +2959,16 @@ def test_reasoning_effort_low_has_reasoning_non_stream(row: EvaluationRow) -> Ev
 @evaluation_test(
     input_rows=[[TOOLS_WITH_REASONING_NON_STREAM_ROW]],
     completion_params=[
-        {
-            "model": DEFAULT_MODEL_ID,
-            "reasoning_effort": "low",
-            "max_tokens": DEFAULT_MAX_TOKENS,
-            "temperature": 0.0,
-            "stream": False,  # Non-streaming
-        }
+        _maybe_add_reasoning_effort(
+            {
+                "model": DEFAULT_MODEL_ID,
+                "max_tokens": DEFAULT_MAX_TOKENS,
+                "temperature": 0.0,
+                "stream": False,  # Non-streaming
+                "raw_output": True,  # Include raw model output for debugging
+            },
+            "low",
+        )
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     passed_threshold=1.0,
@@ -2923,12 +2998,6 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow
             break
 
     metrics = {
-        "reasoning_present": MetricResult(
-            score=1.0 if reasoning_present else 0.0,
-            is_score_valid=True,
-            reason="reasoning_content present" if reasoning_present else "reasoning_content missing",
-            data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]},
-        ),
         "has_tool_calls": MetricResult(
             score=1.0 if has_tool_calls else 0.0,
             is_score_valid=True,
@@ -2950,13 +3019,22 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow
         ),
     }
 
+    # Only add reasoning_present metric if model supports reasoning
+    if SUPPORTS_REASONING:
+        metrics["reasoning_present"] = MetricResult(
+            score=1.0 if reasoning_present else 0.0,
+            is_score_valid=True,
+            reason="reasoning_content present" if reasoning_present else "reasoning_content missing",
+            data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]},
+        )
+
     finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks(
         metrics, finish_reason, content_str, reasoning_str
     )
 
+    # Build pass criteria - reasoning check is conditional
     all_checks_passed = (
-        reasoning_present
-        and has_tool_calls
+        has_tool_calls
         and finish_reason_tool_calls
         and tool_call_valid
         and finish_reason_present
@@ -2964,10 +3042,13 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow
         and no_xml_tags
         and no_reasoning_leakage
     )
+    # Only require reasoning if model supports it
+    if SUPPORTS_REASONING:
+        all_checks_passed = all_checks_passed and reasoning_present
 
     # Build detailed failure reason
     failure_reasons = []
-    if not reasoning_present:
+    if SUPPORTS_REASONING and not reasoning_present:
         failure_reasons.append("reasoning_content missing")
     if not has_tool_calls:
         failure_reasons.append("no tool calls")
@@ -3033,14 +3114,17 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow
 @evaluation_test(
     input_rows=[[STRUCTURED_OUTPUT_WITH_REASONING_ROW]],
     completion_params=[
-        {
-            "model": DEFAULT_MODEL_ID,
-            "stream": True,
-            "reasoning_effort": "low",
-            "response_format": STRUCTURED_JSON_SCHEMA,
-            "temperature": 0.0,
-            "max_tokens": DEFAULT_MAX_TOKENS,
-        }
+        _maybe_add_reasoning_effort(
+            {
+                "model": DEFAULT_MODEL_ID,
+                "stream": True,
+                "response_format": STRUCTURED_JSON_SCHEMA,
+                "temperature": 0.0,
+                "max_tokens": DEFAULT_MAX_TOKENS,
+                "raw_output": True,  # Include raw model output for debugging
+            },
+            "low",
+        )
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     passed_threshold=1.0,
@@ -3079,12 +3163,6 @@ def test_streaming_structured_output_with_reasoning(row: EvaluationRow) -> Evalu
             is_score_valid=content_is_json,
             reason="speed_kmh is numeric" if speed_is_number else "speed_kmh not numeric",
         ),
-        "reasoning_present": MetricResult(
-            score=1.0 if reasoning_present else 0.0,
-            is_score_valid=True,
-            reason="reasoning_content present" if reasoning_present else "reasoning_content missing",
-            data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]},
-        ),
         "finish_reason_stop": MetricResult(
             score=1.0 if finish_reason_stop else 0.0,
             is_score_valid=True,
@@ -3092,21 +3170,33 @@ def test_streaming_structured_output_with_reasoning(row: EvaluationRow) -> Evalu
         ),
     }
 
+    # Only add reasoning_present metric if model supports reasoning
+    if SUPPORTS_REASONING:
+        metrics["reasoning_present"] = MetricResult(
+            score=1.0 if reasoning_present else 0.0,
+            is_score_valid=True,
+            reason="reasoning_content present" if reasoning_present else "reasoning_content missing",
+            data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]},
+        )
+
     finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks(
         metrics, finish_reason, content_str, reasoning_str
     )
 
+    # Build pass criteria - reasoning check is conditional
     all_checks_passed = (
         content_is_json
         and has_required_keys
         and speed_is_number
-        and reasoning_present
         and finish_reason_stop
         and finish_reason_present
         and no_forbidden_tags
         and no_xml_tags
         and no_reasoning_leakage
     )
+    # Only require reasoning if model supports it
+    if SUPPORTS_REASONING:
+        all_checks_passed = all_checks_passed and reasoning_present
 
     row.evaluation_result = EvaluateResult(
         score=1.0 if all_checks_passed else 0.0,
@@ -3136,14 +3226,17 @@ def test_streaming_structured_output_with_reasoning(row: EvaluationRow) -> Evalu
 @evaluation_test(
     input_rows=[[STRUCTURED_OUTPUT_WITH_REASONING_NON_STREAM_ROW]],
     completion_params=[
- { - "model": DEFAULT_MODEL_ID, - "stream": False, - "reasoning_effort": "low", - "response_format": STRUCTURED_JSON_SCHEMA, - "temperature": 0.0, - "max_tokens": DEFAULT_MAX_TOKENS, - } + _maybe_add_reasoning_effort( + { + "model": DEFAULT_MODEL_ID, + "stream": False, + "response_format": STRUCTURED_JSON_SCHEMA, + "temperature": 0.0, + "max_tokens": DEFAULT_MAX_TOKENS, + "raw_output": True, # Include raw model output for debugging + }, + "low", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -3182,12 +3275,6 @@ def test_non_streaming_structured_output_with_reasoning(row: EvaluationRow) -> E is_score_valid=content_is_json, reason="speed_kmh is numeric" if speed_is_number else "speed_kmh not numeric", ), - "reasoning_present": MetricResult( - score=1.0 if reasoning_present else 0.0, - is_score_valid=True, - reason="reasoning_content present" if reasoning_present else "reasoning_content missing", - data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, - ), "finish_reason_stop": MetricResult( score=1.0 if finish_reason_stop else 0.0, is_score_valid=True, @@ -3195,21 +3282,33 @@ def test_non_streaming_structured_output_with_reasoning(row: EvaluationRow) -> E ), } + # Only add reasoning_present metric if model supports reasoning + if SUPPORTS_REASONING: + metrics["reasoning_present"] = MetricResult( + score=1.0 if reasoning_present else 0.0, + is_score_valid=True, + reason="reasoning_content present" if reasoning_present else "reasoning_content missing", + data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, + ) + finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks( metrics, finish_reason, content_str, reasoning_str ) + # Build pass criteria - reasoning check is conditional all_checks_passed = ( content_is_json and has_required_keys and speed_is_number - and reasoning_present and finish_reason_stop and finish_reason_present and no_forbidden_tags and no_xml_tags and no_reasoning_leakage ) + # Only require reasoning if model supports it + if SUPPORTS_REASONING: + all_checks_passed = all_checks_passed and reasoning_present row.evaluation_result = EvaluateResult( score=1.0 if all_checks_passed else 0.0, @@ -3256,16 +3355,23 @@ def test_non_streaming_structured_output_with_reasoning(row: EvaluationRow) -> E } +@pytest.mark.skipif( + not SUPPORTS_MULTIPLE_TOOL_CALLS, + reason="Model does not support multiple tool calls (EP_SUPPORTS_MULTIPLE_TOOL_CALLS=0)", +) @evaluation_test( input_rows=[[MULTIPLE_TOOLS_WITH_REASONING_ROW]], completion_params=[ - { - "model": DEFAULT_MODEL_ID, - "stream": True, - "reasoning_effort": "low", - "temperature": 0.0, - "max_tokens": DEFAULT_MAX_TOKENS, - } + _maybe_add_reasoning_effort( + { + "model": DEFAULT_MODEL_ID, + "stream": True, + "temperature": 0.0, + "max_tokens": DEFAULT_MAX_TOKENS, + "raw_output": True, # Include raw model output for debugging + }, + "low", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -3299,12 +3405,6 @@ def test_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Evaluati all_cities_covered = len(cities_covered) == 3 metrics = { - "reasoning_present": MetricResult( - score=1.0 if reasoning_present else 0.0, - is_score_valid=True, - reason="reasoning_content present" if reasoning_present else "reasoning_content missing", - data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, - ), 
"has_multiple_tools": MetricResult( score=1.0 if has_multiple_tools else 0.0, is_score_valid=True, @@ -3324,13 +3424,22 @@ def test_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Evaluati ), } + # Only add reasoning_present metric if model supports reasoning + if SUPPORTS_REASONING: + metrics["reasoning_present"] = MetricResult( + score=1.0 if reasoning_present else 0.0, + is_score_valid=True, + reason="reasoning_content present" if reasoning_present else "reasoning_content missing", + data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, + ) + finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks( metrics, finish_reason, content_str, reasoning_str ) + # Build pass criteria - reasoning check is conditional all_checks_passed = ( - reasoning_present - and has_multiple_tools + has_multiple_tools and all_cities_covered and finish_reason_tool_calls and finish_reason_present @@ -3338,6 +3447,9 @@ def test_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Evaluati and no_xml_tags and no_reasoning_leakage ) + # Only require reasoning if model supports it + if SUPPORTS_REASONING: + all_checks_passed = all_checks_passed and reasoning_present row.evaluation_result = EvaluateResult( score=1.0 if all_checks_passed else 0.0, @@ -3383,16 +3495,23 @@ def test_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Evaluati } +@pytest.mark.skipif( + not SUPPORTS_MULTIPLE_TOOL_CALLS, + reason="Model does not support multiple tool calls (EP_SUPPORTS_MULTIPLE_TOOL_CALLS=0)", +) @evaluation_test( input_rows=[[MULTIPLE_TOOLS_WITH_REASONING_NON_STREAM_ROW]], completion_params=[ - { - "model": DEFAULT_MODEL_ID, - "stream": False, - "reasoning_effort": "low", - "temperature": 0.0, - "max_tokens": DEFAULT_MAX_TOKENS, - } + _maybe_add_reasoning_effort( + { + "model": DEFAULT_MODEL_ID, + "stream": False, + "temperature": 0.0, + "max_tokens": DEFAULT_MAX_TOKENS, + "raw_output": True, # Include raw model output for debugging + }, + "low", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -3426,12 +3545,6 @@ def test_non_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Eval all_cities_covered = len(cities_covered) == 3 metrics = { - "reasoning_present": MetricResult( - score=1.0 if reasoning_present else 0.0, - is_score_valid=True, - reason="reasoning_content present" if reasoning_present else "reasoning_content missing", - data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, - ), "has_multiple_tools": MetricResult( score=1.0 if has_multiple_tools else 0.0, is_score_valid=True, @@ -3451,13 +3564,22 @@ def test_non_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Eval ), } + # Only add reasoning_present metric if model supports reasoning + if SUPPORTS_REASONING: + metrics["reasoning_present"] = MetricResult( + score=1.0 if reasoning_present else 0.0, + is_score_valid=True, + reason="reasoning_content present" if reasoning_present else "reasoning_content missing", + data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, + ) + finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks( metrics, finish_reason, content_str, reasoning_str ) + # Build pass criteria - reasoning check is conditional all_checks_passed = ( - reasoning_present - and has_multiple_tools + has_multiple_tools and all_cities_covered and 
finish_reason_tool_calls and finish_reason_present @@ -3465,6 +3587,9 @@ def test_non_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Eval and no_xml_tags and no_reasoning_leakage ) + # Only require reasoning if model supports it + if SUPPORTS_REASONING: + all_checks_passed = all_checks_passed and reasoning_present row.evaluation_result = EvaluateResult( score=1.0 if all_checks_passed else 0.0, diff --git a/eval_protocol/models.py b/eval_protocol/models.py index 4c2e217b..09298a19 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -849,6 +849,11 @@ class ExecutionMetadata(BaseModel): description="Number of tool calls returned in the assistant message for this row.", ) + raw_output: Optional[Dict[str, Any]] = Field( + default=None, + description="Raw model output including prompt_fragments and completion_token_ids when raw_output=True is passed to the API.", + ) + class EvaluationRow(BaseModel): """ diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index b8e4445d..fbaba596 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -84,6 +84,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: if "reasoning_effort" in request_params: request_params.pop("reasoning_effort", None) + # Handle raw_output - move to extra_body so LiteLLM forwards it + if "raw_output" in request_params: + request_params.setdefault("extra_body", {}) + request_params["extra_body"]["raw_output"] = request_params.pop("raw_output") + if row.tools is not None: request_params["tools"] = row.tools @@ -166,6 +171,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: row.execution_metadata.tool_call_count = ( len(converted_tool_calls) if converted_tool_calls is not None else 0 ) + + # Extract raw_output if present (when raw_output=True was passed to the API) + # Note: raw_output is only captured for non-streaming requests + # LiteLLM stores extra fields in model_extra for non-streaming responses + choice = response.choices[0] + raw_output = None + + # Check model_extra (where LiteLLM puts extra fields for non-streaming) + if hasattr(choice, "model_extra") and choice.model_extra: + raw_output = choice.model_extra.get("raw_output") + # Fallback: check as direct attribute + if raw_output is None: + raw_output = getattr(choice, "raw_output", None) + + if raw_output is not None and isinstance(raw_output, dict): + row.execution_metadata.raw_output = raw_output + usage = getattr(response, "usage", None) if usage: row.execution_metadata.usage = (