@@ -711,16 +711,18 @@ def _debug_log_assistant_message(test_name: str, assistant_message: Message | No
 @evaluation_test(
     input_rows=[[STRUCTURED_OUTPUT_ROW]],
     completion_params=[
-        {
-            "model": DEFAULT_MODEL_ID,
-            "stream": True,
-            "temperature": 1.0,
-            "top_p": 1.0,
-            "max_tokens": DEFAULT_MAX_TOKENS,
-            "response_format": STRUCTURED_RESPONSE_FORMAT,
-            "reasoning_effort": "none",  # No reasoning expected for structured output
-            "raw_output": True,  # Include raw model output for debugging
-        }
+        _maybe_add_reasoning_effort(
+            {
+                "model": DEFAULT_MODEL_ID,
+                "stream": True,
+                "temperature": 1.0,
+                "top_p": 1.0,
+                "max_tokens": DEFAULT_MAX_TOKENS,
+                "response_format": STRUCTURED_RESPONSE_FORMAT,
+                "raw_output": True,  # Include raw model output for debugging
+            },
+            "none",  # No reasoning expected for structured output
+        )
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
@@ -981,15 +983,17 @@ def test_streaming_json_preservation(row: EvaluationRow) -> EvaluationRow:
 @evaluation_test(
     input_rows=[[TOOL_CALL_ROW]],
     completion_params=[
-        {
-            "model": DEFAULT_MODEL_ID,
-            "stream": True,
-            "temperature": 1.0,
-            "top_p": 1.0,
-            "max_tokens": DEFAULT_MAX_TOKENS,
-            "reasoning_effort": "none",  # No reasoning expected for tool calls
-            "raw_output": True,  # Include raw model output for debugging
-        }
+        _maybe_add_reasoning_effort(
+            {
+                "model": DEFAULT_MODEL_ID,
+                "stream": True,
+                "temperature": 1.0,
+                "top_p": 1.0,
+                "max_tokens": DEFAULT_MAX_TOKENS,
+                "raw_output": True,  # Include raw model output for debugging
+            },
+            "none",  # No reasoning expected for tool calls
+        )
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
@@ -2264,16 +2268,18 @@ async def test_streaming_output_consistency(row: EvaluationRow) -> EvaluationRow
 @evaluation_test(
     input_rows=[[STRUCTURED_OUTPUT_ROW]],
     completion_params=[
-        {
-            "model": DEFAULT_MODEL_ID,
-            "stream": False,  # Non-streaming
-            "temperature": 1.0,
-            "top_p": 1.0,
-            "max_tokens": DEFAULT_MAX_TOKENS,
-            "response_format": STRUCTURED_RESPONSE_FORMAT,
-            "reasoning_effort": "none",
-            "raw_output": True,  # Include raw model output for debugging
-        }
+        _maybe_add_reasoning_effort(
+            {
+                "model": DEFAULT_MODEL_ID,
+                "stream": False,  # Non-streaming
+                "temperature": 1.0,
+                "top_p": 1.0,
+                "max_tokens": DEFAULT_MAX_TOKENS,
+                "response_format": STRUCTURED_RESPONSE_FORMAT,
+                "raw_output": True,  # Include raw model output for debugging
+            },
+            "none",
+        )
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
@@ -2473,15 +2479,17 @@ def test_non_streaming_simple_completion(row: EvaluationRow) -> EvaluationRow:
 @evaluation_test(
     input_rows=[[TOOL_CALL_NON_STREAM_ROW]],
     completion_params=[
-        {
-            "model": DEFAULT_MODEL_ID,
-            "stream": False,  # Non-streaming
-            "temperature": 1.0,
-            "top_p": 1.0,
-            "max_tokens": DEFAULT_MAX_TOKENS,
-            "reasoning_effort": "none",
-            "raw_output": True,  # Include raw model output for debugging
-        }
+        _maybe_add_reasoning_effort(
+            {
+                "model": DEFAULT_MODEL_ID,
+                "stream": False,  # Non-streaming
+                "temperature": 1.0,
+                "top_p": 1.0,
+                "max_tokens": DEFAULT_MAX_TOKENS,
+                "raw_output": True,  # Include raw model output for debugging
+            },
+            "none",
+        )
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
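The `_maybe_add_reasoning_effort` helper these hunks introduce is not defined in this section. Below is a minimal, hypothetical sketch, assuming the helper only attaches the `reasoning_effort` parameter when the target model is known to accept it and otherwise returns the completion params unchanged; the `REASONING_EFFORT_MODELS` set is a placeholder introduced here for illustration, not part of the diff.

# Hypothetical sketch of the helper used above (assumption, not the repository's code).
REASONING_EFFORT_MODELS: set[str] = set()  # placeholder: model IDs that accept reasoning_effort

def _maybe_add_reasoning_effort(params: dict, effort: str) -> dict:
    """Return a copy of params, adding reasoning_effort only when the model supports it."""
    if params.get("model") in REASONING_EFFORT_MODELS:
        return {**params, "reasoning_effort": effort}
    return dict(params)

# Usage matching the tests above:
# completion_params=[_maybe_add_reasoning_effort({...}, "none")]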