Skip to content

Commit 5ea7e08

Browse files
committed
revert ep.make back
1 parent 00a12fb commit 5ea7e08

File tree

16 files changed: +67 -68 lines changed

16 files changed: +67 -68 lines changed

eval_protocol/mcp/execution/manager.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -478,9 +478,10 @@ async def _execute_rollout(
478478
logger.error(f"🚨 Error in rollout {rollout_idx}: {e}", exc_info=True)
479479
failure_reason = str(e)
480480
finally:
481-
trajectory.terminated = True
482-
trajectory.termination_reason = TerminationReason.ERROR
483-
trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
481+
if failure_reason:
482+
trajectory.terminated = True
483+
trajectory.termination_reason = TerminationReason.ERROR
484+
trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
484485
try:
485486
await envs.connection_manager.reset_session(session)
486487
except:

eval_protocol/mcp_env.py

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
policy = ep.FireworksPolicy(model_id="accounts/fireworks/models/qwen3-235b-a22b")
1818
1919
# Create environments with evaluation_rows configuration
20-
envs = await ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
20+
envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
2121
2222
# Execute tool-calling rollouts
2323
evaluation_rows = await ep.rollout(envs, policy=policy, steps=512)
@@ -89,15 +89,14 @@ async def reset_mcp_sessions(envs: GeneralMCPVectorEnv):
8989
await asyncio.gather(*tasks, return_exceptions=True)
9090

9191

92-
async def make(
92+
def make(
9393
env_spec: str,
9494
evaluation_rows: Optional[List[EvaluationRow]] = None,
9595
dataset: Optional[List[Dict]] = None,
9696
n: Optional[int] = None,
9797
seeds: Optional[List[int]] = None,
9898
model_id: str = "unknown",
9999
user_prompt_formatter: Optional[Callable] = None,
100-
reset_sessions: bool = False,
101100
) -> GeneralMCPVectorEnv:
102101
"""
103102
Create general MCP environments driven by evaluation_rows configuration.
@@ -110,20 +109,19 @@ async def make(
110109
seeds: List of seeds (for backward compatibility)
111110
model_id: Model identifier
112111
user_prompt_formatter: Optional callback for formatting user prompts
113-
reset_sessions: Whether to reset sessions before returning the environment
114112
115113
Returns:
116114
General MCP environment that works with any MCP server
117115
118116
Example:
119117
# EvaluationRow approach (preferred)
120-
envs = await ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
118+
envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
121119
122120
# Dataset approach (backward compatibility)
123-
envs = await ep.make("http://localhost:8000/mcp", dataset=dataset)
121+
envs = ep.make("http://localhost:8000/mcp", dataset=dataset)
124122
125123
# Legacy approach (backward compatibility)
126-
envs = await ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
124+
envs = ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
127125
"""
128126
# Parse environment specification - make sure URL format is correct
129127
base_url = env_spec

eval_protocol/pytest/default_mcp_gym_rollout_processor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -213,7 +213,7 @@ async def default_mcp_gym_rollout_processor(
213213
)
214214

215215
# Create MCP environments directly from evaluation_rows
216-
envs = await ep.make(
216+
envs = ep.make(
217217
"http://localhost:9700/mcp/",
218218
evaluation_rows=rows,
219219
model_id=policy.model_id,

examples/blackjack_mcp/tests/test_record_and_replay_e2e.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -215,7 +215,7 @@ async def test_production_server_record_and_replay(production_server, blackjack_
215215
assert playback_policy.is_playback_mode(), "Should be in playback mode in CI"
216216

217217
# Create environments for playback
218-
playback_envs = await ep.make(
218+
playback_envs = ep.make(
219219
"http://localhost:9500/mcp/",
220220
dataset=blackjack_dataset,
221221
model_id=playback_policy.model_id,
@@ -250,7 +250,7 @@ async def test_production_server_record_and_replay(production_server, blackjack_
250250
assert not policy.is_playback_mode(), "Should be in recording mode initially"
251251

252252
# Create environments
253-
envs = await ep.make(
253+
envs = ep.make(
254254
"http://localhost:9500/mcp/",
255255
dataset=blackjack_dataset,
256256
model_id=policy.model_id,
@@ -310,7 +310,7 @@ async def test_production_server_record_and_replay(production_server, blackjack_
310310
assert playback_policy.is_playback_mode(), "Should be in playback mode"
311311

312312
# Create new environments for playback
313-
playback_envs = await ep.make(
313+
playback_envs = ep.make(
314314
"http://localhost:9500/mcp/",
315315
dataset=blackjack_dataset,
316316
model_id=playback_policy.model_id,
@@ -462,7 +462,7 @@ async def test_blackjack_step_by_step(conda_isolation_recording_file):
462462
]
463463

464464
# Create environment pointing to conda-isolated server
465-
envs = await ep.make(
465+
envs = ep.make(
466466
f"http://localhost:{port}/mcp/",
467467
dataset=test_dataset,
468468
model_id=policy.model_id,
@@ -570,7 +570,7 @@ async def test_multi_environment_sessions(multi_env_dataset, multi_env_recording
570570
policy = create_blackjack_static_policy(action_sequence=["HIT", "HIT", "STICK"])
571571

572572
# Create multiple environments
573-
envs = await ep.make(
573+
envs = ep.make(
574574
f"http://localhost:{server.port}/mcp/",
575575
dataset=multi_env_dataset,
576576
model_id=policy.model_id,
@@ -992,7 +992,7 @@ async def test_fireworks_multi_environment_sessions(multi_env_dataset, fireworks
992992
assert playback_policy.is_playback_mode(), "Should be in playback mode in CI"
993993

994994
# Create environments for playback
995-
playback_envs = await ep.make(
995+
playback_envs = ep.make(
996996
"http://localhost:9500/mcp/",
997997
dataset=multi_env_dataset,
998998
model_id=playback_policy.model_id,
@@ -1033,7 +1033,7 @@ async def test_fireworks_multi_environment_sessions(multi_env_dataset, fireworks
10331033
assert not policy.is_playback_mode(), "Should be in recording mode initially"
10341034

10351035
# Create multiple environments
1036-
envs = await ep.make(
1036+
envs = ep.make(
10371037
f"http://localhost:{server.port}/mcp/",
10381038
dataset=multi_env_dataset,
10391039
model_id=policy.model_id,
@@ -1149,7 +1149,7 @@ async def test_control_plane_state_querying(multi_env_dataset):
11491149
policy = create_blackjack_static_policy(action_sequence=["HIT", "STAND"])
11501150

11511151
# Create environments
1152-
envs = await ep.make(
1152+
envs = ep.make(
11531153
f"http://localhost:{server.port}/mcp/",
11541154
dataset=multi_env_dataset[:2], # Use only 2 environments for faster testing
11551155
model_id=policy.model_id,

examples/cliff_walking_mcp/tests/test_cliff_walking_e2e.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -224,7 +224,7 @@ async def test_production_server_record_and_replay(
224224
assert playback_policy.is_playback_mode(), "Should be in playback mode in CI"
225225

226226
# Create environments for playback
227-
playback_envs = await ep.make(
227+
playback_envs = ep.make(
228228
"http://localhost:9500/mcp/",
229229
dataset=cliff_walking_dataset,
230230
model_id=playback_policy.model_id,
@@ -259,7 +259,7 @@ async def test_production_server_record_and_replay(
259259
assert not policy.is_playback_mode(), "Should be in recording mode initially"
260260

261261
# Create environments
262-
envs = await ep.make(
262+
envs = ep.make(
263263
"http://localhost:9500/mcp/",
264264
dataset=cliff_walking_dataset,
265265
model_id=policy.model_id,
@@ -318,7 +318,7 @@ async def test_production_server_record_and_replay(
318318
assert playback_policy.is_playback_mode(), "Should be in playback mode"
319319

320320
# Create new environments for playback
321-
playback_envs = await ep.make(
321+
playback_envs = ep.make(
322322
"http://localhost:9500/mcp/",
323323
dataset=cliff_walking_dataset,
324324
model_id=playback_policy.model_id,
@@ -471,7 +471,7 @@ async def test_cliff_walking_step_by_step(conda_isolation_recording_file):
471471
]
472472

473473
# Create environment pointing to conda-isolated server
474-
envs = await ep.make(
474+
envs = ep.make(
475475
f"http://localhost:{port}/mcp/",
476476
dataset=test_dataset,
477477
model_id=policy.model_id,
@@ -589,7 +589,7 @@ async def test_multi_environment_sessions(multi_env_dataset, multi_env_recording
589589
)
590590

591591
# Create multiple environments
592-
envs = await ep.make(
592+
envs = ep.make(
593593
f"http://localhost:{server.port}/mcp/",
594594
dataset=multi_env_dataset,
595595
model_id=policy.model_id,
@@ -1018,7 +1018,7 @@ async def test_fireworks_multi_environment_sessions(multi_env_dataset, fireworks
10181018
assert playback_policy.is_playback_mode(), "Should be in playback mode in CI"
10191019

10201020
# Create environments for playback
1021-
playback_envs = await ep.make(
1021+
playback_envs = ep.make(
10221022
"http://localhost:9500/mcp/",
10231023
dataset=multi_env_dataset,
10241024
model_id=playback_policy.model_id,
@@ -1059,7 +1059,7 @@ async def test_fireworks_multi_environment_sessions(multi_env_dataset, fireworks
10591059
assert not policy.is_playback_mode(), "Should be in recording mode initially"
10601060

10611061
# Create multiple environments
1062-
envs = await ep.make(
1062+
envs = ep.make(
10631063
f"http://localhost:{server.port}/mcp/",
10641064
dataset=multi_env_dataset,
10651065
model_id=policy.model_id,
@@ -1178,7 +1178,7 @@ async def test_control_plane_state_querying(multi_env_dataset):
11781178
policy = create_cliff_walking_static_policy(action_sequence=["UP", "UP"])
11791179

11801180
# Create environments
1181-
envs = await ep.make(
1181+
envs = ep.make(
11821182
f"http://localhost:{server.port}/mcp/",
11831183
dataset=multi_env_dataset[:2], # Use only 2 environments for faster testing
11841184
model_id=policy.model_id,

examples/frozen_lake_mcp/test_basic_functionality.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ async def test_basic_server_functionality():
4646
policy = ep.FireworksPolicy(model_id="accounts/fireworks/models/qwen3-235b-a22b", temperature=0.2)
4747

4848
# Create environment pointing to local server
49-
envs = await ep.make("http://localhost:8000/mcp/", dataset=test_dataset, model_id=policy.model_id)
49+
envs = ep.make("http://localhost:8000/mcp/", dataset=test_dataset, model_id=policy.model_id)
5050
print("✅ Successfully connected to MCP server")
5151

5252
# Test 2: Try to make tool calls (we'll simulate this for now)

examples/frozen_lake_mcp/test_multi_session.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,7 @@ async def test_multi_session():
6060

6161
try:
6262
# Create environments (assumes server is running on localhost:8000)
63-
envs = await ep.make(
63+
envs = ep.make(
6464
"http://localhost:8000/mcp/",
6565
dataset=test_dataset,
6666
model_id=policy.model_id,

examples/frozen_lake_mcp/test_seed_logging.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ async def test_seed_logging():
3030
try:
3131
# Create environment pointing to our server
3232
print("🔌 Connecting to server...")
33-
envs = await ep.make("http://localhost:9600/mcp/", dataset=dataset, model_id="test")
33+
envs = ep.make("http://localhost:9600/mcp/", dataset=dataset, model_id="test")
3434
print(f"✅ Created envs: {len(envs.sessions)} sessions")
3535

3636
# Reset environments to trigger session creation

examples/frozen_lake_mcp/tests/test_frozen_lake_e2e.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -232,7 +232,7 @@ async def test_production_server_record_and_replay(production_server, frozen_lak
232232
assert playback_policy.is_playback_mode(), "Should be in playback mode in CI"
233233

234234
# Create environments for playback
235-
playback_envs = await ep.make(
235+
playback_envs = ep.make(
236236
"http://localhost:9500/mcp/",
237237
dataset=frozen_lake_dataset,
238238
model_id=playback_policy.model_id,
@@ -268,7 +268,7 @@ async def test_production_server_record_and_replay(production_server, frozen_lak
268268
assert not policy.is_playback_mode(), "Should be in recording mode initially"
269269

270270
# Create environments
271-
envs = await ep.make(
271+
envs = ep.make(
272272
"http://localhost:9500/mcp/",
273273
dataset=frozen_lake_dataset,
274274
model_id=policy.model_id,
@@ -335,7 +335,7 @@ async def test_production_server_record_and_replay(production_server, frozen_lak
335335
assert playback_policy.is_playback_mode(), "Should be in playback mode"
336336

337337
# Create new environments for playback
338-
playback_envs = await ep.make(
338+
playback_envs = ep.make(
339339
"http://localhost:9500/mcp/",
340340
dataset=frozen_lake_dataset,
341341
model_id=playback_policy.model_id,
@@ -488,7 +488,7 @@ async def test_frozen_lake_step_by_step(conda_isolation_recording_file):
488488
]
489489

490490
# Create environment pointing to conda-isolated server
491-
envs = await ep.make(
491+
envs = ep.make(
492492
f"http://localhost:{port}/mcp/",
493493
dataset=test_dataset,
494494
model_id=policy.model_id,
@@ -593,7 +593,7 @@ async def test_multi_environment_sessions(multi_env_dataset, multi_env_recording
593593
policy = create_frozen_lake_static_policy(action_sequence=["RIGHT", "RIGHT", "RIGHT", "DOWN", "DOWN", "DOWN"])
594594

595595
# Create multiple environments
596-
envs = await ep.make(
596+
envs = ep.make(
597597
f"http://localhost:{server.port}/mcp/",
598598
dataset=multi_env_dataset,
599599
model_id=policy.model_id,
@@ -1071,7 +1071,7 @@ async def test_fireworks_multi_environment_sessions(multi_env_dataset, fireworks
10711071
assert playback_policy.is_playback_mode(), "Should be in playback mode in CI"
10721072

10731073
# Create environments for playback
1074-
playback_envs = await ep.make(
1074+
playback_envs = ep.make(
10751075
"http://localhost:9500/mcp/",
10761076
dataset=multi_env_dataset,
10771077
model_id=playback_policy.model_id,
@@ -1113,7 +1113,7 @@ async def test_fireworks_multi_environment_sessions(multi_env_dataset, fireworks
11131113
assert not policy.is_playback_mode(), "Should be in recording mode initially"
11141114

11151115
# Create multiple environments
1116-
envs = await ep.make(
1116+
envs = ep.make(
11171117
f"http://localhost:{server.port}/mcp/",
11181118
dataset=multi_env_dataset,
11191119
model_id=policy.model_id,
@@ -1232,7 +1232,7 @@ async def test_control_plane_state_querying(multi_env_dataset):
12321232
policy = create_frozen_lake_static_policy(action_sequence=["RIGHT", "DOWN"])
12331233

12341234
# Create environments
1235-
envs = await ep.make(
1235+
envs = ep.make(
12361236
f"http://localhost:{server.port}/mcp/",
12371237
dataset=multi_env_dataset[:2], # Use only 2 environments for faster testing
12381238
model_id=policy.model_id,
@@ -1283,7 +1283,7 @@ async def _run_playback_only(recording_file: str, dataset: List[Dict], server_ur
12831283
assert playback_policy.is_playback_mode(), "Should be in playback mode in CI"
12841284

12851285
# Create environments for playback
1286-
playback_envs = await ep.make(
1286+
playback_envs = ep.make(
12871287
server_url,
12881288
dataset=dataset,
12891289
model_id=playback_policy.model_id,

examples/lunar_lander_mcp/test_lunar_lander_conda.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,7 @@ async def test_lunar_lander_with_conda_isolation():
119119
]
120120

121121
# Configure for MCP environment
122-
envs = await ep.make("http://localhost:9004/mcp", dataset=dataset)
122+
envs = ep.make("http://localhost:9004/mcp", dataset=dataset)
123123

124124
# Simple policy that takes random actions
125125
class RandomLunarLanderPolicy:

0 commit comments

Comments
 (0)