Skip to content

Commit 56c03e0

Browse files
author
zach
committed
docs: update readme
1 parent db55847 commit 56c03e0

File tree

2 files changed

+14
-8
lines changed

2 files changed

+14
-8
lines changed

README.md

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,16 +68,22 @@ Run an eval comparing all mcp.task runs for `my-task`:
6868
mcpx-eval test --task my-task --task-run all
6969
```
7070

71-
Run an mcp.run task locally with a different set of models:
71+
Only evaluate the latest task run:
7272

7373
```bash
74-
mcpx-eval test --model .. --model .. --task my-task --iter 10
74+
mcpx-eval test --task my-task --task-run latest
75+
```
76+
77+
Or trigger a new task run:
78+
79+
```bash
80+
mcpx-eval test --task my-task --task-run new
7581
```
7682

77-
Run the `my-test.toml` eval for 10 iterations:
83+
Run an mcp.run task locally with a different set of models:
7884

7985
```bash
80-
mcpx-eval test --model ... --model ... --config my-test.toml --iter 10
86+
mcpx-eval test --model .. --model .. --task my-task --iter 10
8187
```
8288

8389
Generate an HTML scoreboard for all evals:
@@ -92,7 +98,7 @@ A test file is a TOML file containing the following fields:
9298

9399
- `name` - name of the test
94100
- `task` - optional, the name of the mcp.run task to use
95-
- `task-run` - optional, the name or index of the task run to analyze
101+
- `task-run` - optional, one of `latest`, `new`, `all`, or the name/index of the task run to analyze
96102
- `prompt` - prompt to test, this is passed to the LLM under test, this can be left blank if `task` is set
97103
- `check` - prompt for the judge, this is used to determine the quality of the test output
98104
- `expected-tools` - list of tool names that might be used

mcpx_eval/judge.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -345,15 +345,15 @@ async def run(
345345
model_config = ModelApiConfig.get_model_config(self.model)
346346
if task is not None:
347347
client = mcp_run.Client(config=mcp_run.ClientConfig(profile=self.profile))
348-
if task_run == "all":
348+
if task_run.lower() == "all":
349349
for run in client.list_task_runs(task):
350350
scores.append(
351351
await self._evaluate_task_run(
352352
client, run, check, expected_tools, model_config
353353
)
354354
)
355355
elif is_int(task_run) or task_run == "latest":
356-
if task_run == "latest":
356+
if task_run.lower() == "latest":
357357
task_run = -1
358358
task_run = int(task_run or -1)
359359
run = task_run_index(client, task, index=task_run)
@@ -365,7 +365,7 @@ async def run(
365365
)
366366
else:
367367
logger.error(f"Unable to load {task_run} for task {task}")
368-
elif task_run is not None and task_run != "new":
368+
elif task_run is not None and task_run.lower() != "new":
369369
found = False
370370
for run in client.list_task_runs(task):
371371
if run.name == task_run:

0 commit comments

Comments
 (0)