33 changes: 33 additions & 0 deletions eval_protocol/cli_commands/create_rft.py
@@ -271,6 +271,29 @@ def _validate_dataset(dataset_jsonl: Optional[str]) -> bool:
return _validate_dataset_jsonl(dataset_jsonl)


def _warn_if_large_dataset(dataset_jsonl: Optional[str], row_threshold: int = 200) -> None:
"""Best-effort warning when local validation will run over a large dataset.

This is primarily to help users of `ep create rft` understand why local validation
might be slow and to point them at --skip-validation when appropriate.
"""
if not dataset_jsonl:
return
try:
# Count non-empty lines in the JSONL; simple full pass for clarity.
with open(dataset_jsonl, "r", encoding="utf-8") as f:
count = sum(1 for line in f if line.strip())
if count > row_threshold:
print(
f"Warning: Local evaluator validation will run over more than {row_threshold} rows from dataset JSONL at {dataset_jsonl}.\n"
" This may take a while. You can pass --skip-validation to `ep create rft` to skip local pytest-based validation "
"if you are confident in your evaluator."
)
except Exception:
# Best-effort hint only; do not block RFT creation if counting fails.
return


def _validate_evaluator_locally(
project_root: str,
selected_test_file: Optional[str],
@@ -791,6 +814,16 @@ def create_rft_command(args) -> int:

# 3) Optional local validation
if not skip_validation:
# Best-effort hint if the JSONL dataset is large; helps users decide to use --skip-validation.
if dataset_jsonl:
# Resolve dataset_jsonl relative to project_root if it is not absolute (mirrors the upload logic).
jsonl_path_for_warning = (
dataset_jsonl
if os.path.isabs(dataset_jsonl)
else os.path.abspath(os.path.join(project_root, dataset_jsonl))
)
_warn_if_large_dataset(jsonl_path_for_warning)

# Dataset validation (JSONL must be EvaluationRow-compatible when present)
if not _validate_dataset(dataset_jsonl):
return 1
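
For reference, a minimal sketch of how the new helper behaves when called directly (not part of this diff; the import path is assumed from the file location, and the dataset path is illustrative):

    from eval_protocol.cli_commands import create_rft as cr

    # Write 250 non-empty JSONL rows, which exceeds the default row_threshold of 200.
    with open("/tmp/dataset.jsonl", "w", encoding="utf-8") as f:
        for i in range(250):
            f.write('{"row": %d}\n' % i)

    # Prints the --skip-validation hint; by design it never raises, even if the file cannot be read.
    cr._warn_if_large_dataset("/tmp/dataset.jsonl")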
14 changes: 12 additions & 2 deletions eval_protocol/cli_commands/local_test.py
@@ -36,7 +36,17 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List
def _run_pytest_host(pytest_target: str) -> int:
"""Run pytest against a target on the host and return its exit code."""
# Always enforce a small success threshold for evaluation_test-based suites so that runs with all-zero scores fail.
cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
cmd = [
sys.executable,
"-m",
"pytest",
"--ep-success-threshold",
"0.001",
"--ep-num-runs",
"1",
pytest_target,
"-vs",
]
# Print the exact command being executed for easier debugging.
print("Running locally:", " ".join(cmd))
proc = subprocess.run(cmd)
@@ -98,7 +108,7 @@ def _run_pytest_in_docker(

# Build pytest command, always enforcing the same small success threshold as
# the host runner so that all-zero score runs fail consistently.
pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", "--ep-num-runs", "1", pytest_target, "-vs"]

cmd += [image_tag] + pytest_cmd
print("Running in Docker:", " ".join(cmd))
25 changes: 25 additions & 0 deletions tests/test_cli_create_rft.py
@@ -45,6 +45,31 @@ def rft_test_harness(tmp_path, monkeypatch):
return project


def test_warn_if_large_dataset_silent_for_small(tmp_path, capsys):
# Dataset with fewer rows than the threshold should not emit a warning.
ds_path = tmp_path / "small.jsonl"
ds_path.write_text('{"row":1}\n{"row":2}\n', encoding="utf-8")

cr._warn_if_large_dataset(str(ds_path), row_threshold=5)

out, err = capsys.readouterr()
assert "Warning: Local evaluator validation will run over more than" not in out
assert "Warning: Local evaluator validation will run over more than" not in err


def test_warn_if_large_dataset_emits_warning_for_large(tmp_path, capsys):
# Dataset with more rows than the threshold should emit a warning.
ds_path = tmp_path / "large.jsonl"
# 3 non-empty lines, threshold=2 -> should warn
ds_path.write_text('{"row":1}\n{"row":2}\n{"row":3}\n', encoding="utf-8")

cr._warn_if_large_dataset(str(ds_path), row_threshold=2)

out, err = capsys.readouterr()
combined = out + err
assert "Warning: Local evaluator validation will run over more than 2 rows" in combined


def test_create_rft_passes_all_flags_into_request_body(rft_test_harness, monkeypatch):
project = rft_test_harness

Expand Down
Loading