diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py
index c490f374..f0f35719 100644
--- a/eval_protocol/cli_commands/create_rft.py
+++ b/eval_protocol/cli_commands/create_rft.py
@@ -271,6 +271,29 @@ def _validate_dataset(dataset_jsonl: Optional[str]) -> bool:
     return _validate_dataset_jsonl(dataset_jsonl)
 
 
+def _warn_if_large_dataset(dataset_jsonl: Optional[str], row_threshold: int = 200) -> None:
+    """Best-effort warning when local validation will run over a large dataset.
+
+    This is primarily to help users of `ep create rft` understand why local validation
+    might be slow and to point them at --skip-validation when appropriate.
+    """
+    if not dataset_jsonl:
+        return
+    try:
+        # Count non-empty lines in the JSONL; simple full pass for clarity.
+        with open(dataset_jsonl, "r", encoding="utf-8") as f:
+            count = sum(1 for line in f if line.strip())
+        if count > row_threshold:
+            print(
+                f"Warning: Local evaluator validation will run over more than {row_threshold} rows from dataset JSONL at {dataset_jsonl}.\n"
+                "  This may take a while. You can pass --skip-validation to `ep create rft` to skip local pytest-based validation "
+                "if you are confident in your evaluator."
+            )
+    except Exception:
+        # Best-effort hint only; do not block RFT creation if counting fails.
+        return
+
+
 def _validate_evaluator_locally(
     project_root: str,
     selected_test_file: Optional[str],
@@ -791,6 +814,16 @@ def create_rft_command(args) -> int:
 
     # 3) Optional local validation
     if not skip_validation:
+        # Best-effort hint if the JSONL dataset is large; helps users decide to use --skip-validation.
+        if dataset_jsonl:
+            # Resolve dataset_jsonl path relative to CWD if needed (mirror upload logic).
+            jsonl_path_for_warning = (
+                dataset_jsonl
+                if os.path.isabs(dataset_jsonl)
+                else os.path.abspath(os.path.join(project_root, dataset_jsonl))
+            )
+            _warn_if_large_dataset(jsonl_path_for_warning)
+
         # Dataset validation (JSONL must be EvaluationRow-compatible when present)
         if not _validate_dataset(dataset_jsonl):
             return 1
diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
index 97e02e9f..6e082f64 100644
--- a/eval_protocol/cli_commands/local_test.py
+++ b/eval_protocol/cli_commands/local_test.py
@@ -36,7 +36,17 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List
 def _run_pytest_host(pytest_target: str) -> int:
     """Run pytest against a target on the host and return its exit code."""
     # Always enforce a small success threshold for evaluation_test-based suites so that runs with all-zero scores fail.
-    cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
+    cmd = [
+        sys.executable,
+        "-m",
+        "pytest",
+        "--ep-success-threshold",
+        "0.001",
+        "--ep-num-runs",
+        "1",
+        pytest_target,
+        "-vs",
+    ]
     # Print the exact command being executed for easier debugging.
     print("Running locally:", " ".join(cmd))
     proc = subprocess.run(cmd)
@@ -98,7 +108,7 @@ def _run_pytest_in_docker(
 
     # Build pytest command, always enforcing the same small success threshold as
     # the host runner so that all-zero score runs fail consistently.
-    pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
+    pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", "--ep-num-runs", "1", pytest_target, "-vs"]
     cmd += [image_tag] + pytest_cmd
 
     print("Running in Docker:", " ".join(cmd))
diff --git a/tests/test_cli_create_rft.py b/tests/test_cli_create_rft.py
index 71f2a064..31e7f7df 100644
--- a/tests/test_cli_create_rft.py
+++ b/tests/test_cli_create_rft.py
@@ -45,6 +45,31 @@ def rft_test_harness(tmp_path, monkeypatch):
     return project
 
 
+def test_warn_if_large_dataset_silent_for_small(tmp_path, capsys):
+    # Dataset with fewer rows than the threshold should not emit a warning.
+    ds_path = tmp_path / "small.jsonl"
+    ds_path.write_text('{"row":1}\n{"row":2}\n', encoding="utf-8")
+
+    cr._warn_if_large_dataset(str(ds_path), row_threshold=5)
+
+    out, err = capsys.readouterr()
+    assert "Warning: Local evaluator validation will run over more than" not in out
+    assert "Warning: Local evaluator validation will run over more than" not in err
+
+
+def test_warn_if_large_dataset_emits_warning_for_large(tmp_path, capsys):
+    # Dataset with more rows than the threshold should emit a warning.
+    ds_path = tmp_path / "large.jsonl"
+    # 3 non-empty lines, threshold=2 -> should warn
+    ds_path.write_text('{"row":1}\n{"row":2}\n{"row":3}\n', encoding="utf-8")
+
+    cr._warn_if_large_dataset(str(ds_path), row_threshold=2)
+
+    out, err = capsys.readouterr()
+    combined = out + err
+    assert "Warning: Local evaluator validation will run over more than 2 rows" in combined
+
+
 def test_create_rft_passes_all_flags_into_request_body(rft_test_harness, monkeypatch):
     project = rft_test_harness