Skip to content

Commit 56c03e0

Browse files
author
zach
committed
docs: update readme
1 parent db55847 commit 56c03e0

File tree

2 files changed

+14
-8
lines changed

2 files changed

+14
-8
lines changed

README.md

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,16 +68,22 @@ Run an eval comparing all mcp.task runs for `my-task`:
6868
mcpx-eval test --task my-task --task-run all
6969
```
7070

71-
Run an mcp.run task locally with a different set of models:
71+
Only evaluate the latest task run:
7272

7373
```bash
74-
mcpx-eval test --model .. --model .. --task my-task --iter 10
74+
mcpx-eval test --task my-task --task-run latest
75+
```
76+
77+
Or trigger a new task run:
78+
79+
```bash
80+
mcpx-eval test --task my-task --task-run new
7581
```
7682

77-
Run the `my-test.toml` eval for 10 iterations:
83+
Run an mcp.run task locally with a different set of models:
7884

7985
```bash
80-
mcpx-eval test --model ... --model ... --config my-test.toml --iter 10
86+
mcpx-eval test --model .. --model .. --task my-task --iter 10
8187
```
8288

8389
Generate an HTML scoreboard for all evals:
@@ -92,7 +98,7 @@ A test file is a TOML file containing the following fields:
9298

9399
- `name` - name of the test
94100
- `task` - optional, the name of the mcp.run task to use
95-
- `task-run` - optional, the name or index of the task run to analyze
101+
- `task-run` - optional, one of `latest`, `new`, `all`, or the name/index of the task run to analyze
96102
- `prompt` - prompt to test, this is passed to the LLM under test, this can be left blank if `task` is set
97103
- `check` - prompt for the judge, this is used to determine the quality of the test output
98104
- `expected-tools` - list of tool names that might be used

mcpx_eval/judge.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -345,15 +345,15 @@ async def run(
345345
model_config = ModelApiConfig.get_model_config(self.model)
346346
if task is not None:
347347
client = mcp_run.Client(config=mcp_run.ClientConfig(profile=self.profile))
348-
if task_run == "all":
348+
if task_run.lower() == "all":
349349
for run in client.list_task_runs(task):
350350
scores.append(
351351
await self._evaluate_task_run(
352352
client, run, check, expected_tools, model_config
353353
)
354354
)
355355
elif is_int(task_run) or task_run == "latest":
356-
if task_run == "latest":
356+
if task_run.lower() == "latest":
357357
task_run = -1
358358
task_run = int(task_run or -1)
359359
run = task_run_index(client, task, index=task_run)
@@ -365,7 +365,7 @@ async def run(
365365
)
366366
else:
367367
logger.error(f"Unable to load {task_run} for task {task}")
368-
elif task_run is not None and task_run != "new":
368+
elif task_run is not None and task_run.lower() != "new":
369369
found = False
370370
for run in client.list_task_runs(task):
371371
if run.name == task_run:

0 commit comments

Comments
 (0)