Skip to content

Commit 1e60a66

Browse files
committed
update2
1 parent ffadf69 commit 1e60a66

File tree

1 file changed

+46
-38
lines changed

1 file changed

+46
-38
lines changed

eval_protocol/benchmarks/test_glm_streaming_compliance.py

Lines changed: 46 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -711,16 +711,18 @@ def _debug_log_assistant_message(test_name: str, assistant_message: Message | No
711711
@evaluation_test(
712712
input_rows=[[STRUCTURED_OUTPUT_ROW]],
713713
completion_params=[
714-
{
715-
"model": DEFAULT_MODEL_ID,
716-
"stream": True,
717-
"temperature": 1.0,
718-
"top_p": 1.0,
719-
"max_tokens": DEFAULT_MAX_TOKENS,
720-
"response_format": STRUCTURED_RESPONSE_FORMAT,
721-
"reasoning_effort": "none", # No reasoning expected for structured output
722-
"raw_output": True, # Include raw model output for debugging
723-
}
714+
_maybe_add_reasoning_effort(
715+
{
716+
"model": DEFAULT_MODEL_ID,
717+
"stream": True,
718+
"temperature": 1.0,
719+
"top_p": 1.0,
720+
"max_tokens": DEFAULT_MAX_TOKENS,
721+
"response_format": STRUCTURED_RESPONSE_FORMAT,
722+
"raw_output": True, # Include raw model output for debugging
723+
},
724+
"none", # No reasoning expected for structured output
725+
)
724726
],
725727
rollout_processor=SingleTurnRolloutProcessor(),
726728
aggregation_method="mean",
@@ -981,15 +983,17 @@ def test_streaming_json_preservation(row: EvaluationRow) -> EvaluationRow:
981983
@evaluation_test(
982984
input_rows=[[TOOL_CALL_ROW]],
983985
completion_params=[
984-
{
985-
"model": DEFAULT_MODEL_ID,
986-
"stream": True,
987-
"temperature": 1.0,
988-
"top_p": 1.0,
989-
"max_tokens": DEFAULT_MAX_TOKENS,
990-
"reasoning_effort": "none", # No reasoning expected for tool calls
991-
"raw_output": True, # Include raw model output for debugging
992-
}
986+
_maybe_add_reasoning_effort(
987+
{
988+
"model": DEFAULT_MODEL_ID,
989+
"stream": True,
990+
"temperature": 1.0,
991+
"top_p": 1.0,
992+
"max_tokens": DEFAULT_MAX_TOKENS,
993+
"raw_output": True, # Include raw model output for debugging
994+
},
995+
"none", # No reasoning expected for tool calls
996+
)
993997
],
994998
rollout_processor=SingleTurnRolloutProcessor(),
995999
aggregation_method="mean",
@@ -2264,16 +2268,18 @@ async def test_streaming_output_consistency(row: EvaluationRow) -> EvaluationRow
22642268
@evaluation_test(
22652269
input_rows=[[STRUCTURED_OUTPUT_ROW]],
22662270
completion_params=[
2267-
{
2268-
"model": DEFAULT_MODEL_ID,
2269-
"stream": False, # Non-streaming
2270-
"temperature": 1.0,
2271-
"top_p": 1.0,
2272-
"max_tokens": DEFAULT_MAX_TOKENS,
2273-
"response_format": STRUCTURED_RESPONSE_FORMAT,
2274-
"reasoning_effort": "none",
2275-
"raw_output": True, # Include raw model output for debugging
2276-
}
2271+
_maybe_add_reasoning_effort(
2272+
{
2273+
"model": DEFAULT_MODEL_ID,
2274+
"stream": False, # Non-streaming
2275+
"temperature": 1.0,
2276+
"top_p": 1.0,
2277+
"max_tokens": DEFAULT_MAX_TOKENS,
2278+
"response_format": STRUCTURED_RESPONSE_FORMAT,
2279+
"raw_output": True, # Include raw model output for debugging
2280+
},
2281+
"none",
2282+
)
22772283
],
22782284
rollout_processor=SingleTurnRolloutProcessor(),
22792285
aggregation_method="mean",
@@ -2473,15 +2479,17 @@ def test_non_streaming_simple_completion(row: EvaluationRow) -> EvaluationRow:
24732479
@evaluation_test(
24742480
input_rows=[[TOOL_CALL_NON_STREAM_ROW]],
24752481
completion_params=[
2476-
{
2477-
"model": DEFAULT_MODEL_ID,
2478-
"stream": False, # Non-streaming
2479-
"temperature": 1.0,
2480-
"top_p": 1.0,
2481-
"max_tokens": DEFAULT_MAX_TOKENS,
2482-
"reasoning_effort": "none",
2483-
"raw_output": True, # Include raw model output for debugging
2484-
}
2482+
_maybe_add_reasoning_effort(
2483+
{
2484+
"model": DEFAULT_MODEL_ID,
2485+
"stream": False, # Non-streaming
2486+
"temperature": 1.0,
2487+
"top_p": 1.0,
2488+
"max_tokens": DEFAULT_MAX_TOKENS,
2489+
"raw_output": True, # Include raw model output for debugging
2490+
},
2491+
"none",
2492+
)
24852493
],
24862494
rollout_processor=SingleTurnRolloutProcessor(),
24872495
aggregation_method="mean",

0 commit comments

Comments
 (0)