33 changes: 33 additions & 0 deletions eval_protocol/cli_commands/create_rft.py
@@ -271,6 +271,29 @@ def _validate_dataset(dataset_jsonl: Optional[str]) -> bool:
return _validate_dataset_jsonl(dataset_jsonl)


def _warn_if_large_dataset(dataset_jsonl: Optional[str], row_threshold: int = 200) -> None:
"""Best-effort warning when local validation will run over a large dataset.

This is primarily to help users of `ep create rft` understand why local validation
might be slow and to point them at --skip-validation when appropriate.
"""
if not dataset_jsonl:
return
try:
# Count non-empty lines in the JSONL; simple full pass for clarity.
with open(dataset_jsonl, "r", encoding="utf-8") as f:
count = sum(1 for line in f if line.strip())
if count > row_threshold:
print(
f"Warning: Local evaluator validation will run over more than {row_threshold} rows from dataset JSONL at {dataset_jsonl}.\n"
" This may take a while. You can pass --skip-validation to `ep create rft` to skip local pytest-based validation "
"if you are confident in your evaluator."
)
except Exception:
# Best-effort hint only; do not block RFT creation if counting fails.
return


def _validate_evaluator_locally(
project_root: str,
selected_test_file: Optional[str],
@@ -791,6 +814,16 @@ def create_rft_command(args) -> int:

# 3) Optional local validation
if not skip_validation:
# Best-effort hint if the JSONL dataset is large; helps users decide to use --skip-validation.
if dataset_jsonl:
# Resolve dataset_jsonl relative to project_root if it is not absolute (mirrors the upload logic).
jsonl_path_for_warning = (
dataset_jsonl
if os.path.isabs(dataset_jsonl)
else os.path.abspath(os.path.join(project_root, dataset_jsonl))
)
_warn_if_large_dataset(jsonl_path_for_warning)

# Dataset validation (JSONL must be EvaluationRow-compatible when present)
if not _validate_dataset(dataset_jsonl):
return 1
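
For reference, a minimal sketch of how the new helper behaves when called directly (not part of this diff; the import path is assumed from the file location, and the dataset path is illustrative):

    from eval_protocol.cli_commands import create_rft as cr

    # Write 250 non-empty JSONL rows, which exceeds the default row_threshold of 200.
    with open("/tmp/dataset.jsonl", "w", encoding="utf-8") as f:
        for i in range(250):
            f.write('{"row": %d}\n' % i)

    # Prints the --skip-validation hint; by design it never raises, even if the file cannot be read.
    cr._warn_if_large_dataset("/tmp/dataset.jsonl")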
14 changes: 12 additions & 2 deletions eval_protocol/cli_commands/local_test.py
@@ -36,7 +36,17 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List
def _run_pytest_host(pytest_target: str) -> int:
"""Run pytest against a target on the host and return its exit code."""
# Always enforce a small success threshold for evaluation_test-based suites so that runs with all-zero scores fail.
cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
cmd = [
sys.executable,
"-m",
"pytest",
"--ep-success-threshold",
"0.001",
"--ep-num-runs",
"1",
pytest_target,
"-vs",
]
# Print the exact command being executed for easier debugging.
print("Running locally:", " ".join(cmd))
proc = subprocess.run(cmd)
@@ -98,7 +108,7 @@ def _run_pytest_in_docker(

# Build pytest command, always enforcing the same small success threshold as
# the host runner so that all-zero score runs fail consistently.
pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", "--ep-num-runs", "1", pytest_target, "-vs"]

cmd += [image_tag] + pytest_cmd
print("Running in Docker:", " ".join(cmd))
25 changes: 25 additions & 0 deletions tests/test_cli_create_rft.py
@@ -45,6 +45,31 @@ def rft_test_harness(tmp_path, monkeypatch):
return project


def test_warn_if_large_dataset_silent_for_small(tmp_path, capsys):
# Dataset with fewer rows than the threshold should not emit a warning.
ds_path = tmp_path / "small.jsonl"
ds_path.write_text('{"row":1}\n{"row":2}\n', encoding="utf-8")

cr._warn_if_large_dataset(str(ds_path), row_threshold=5)

out, err = capsys.readouterr()
assert "Warning: Local evaluator validation will run over more than" not in out
assert "Warning: Local evaluator validation will run over more than" not in err


def test_warn_if_large_dataset_emits_warning_for_large(tmp_path, capsys):
# Dataset with more rows than the threshold should emit a warning.
ds_path = tmp_path / "large.jsonl"
# 3 non-empty lines, threshold=2 -> should warn
ds_path.write_text('{"row":1}\n{"row":2}\n{"row":3}\n', encoding="utf-8")

cr._warn_if_large_dataset(str(ds_path), row_threshold=2)

out, err = capsys.readouterr()
combined = out + err
assert "Warning: Local evaluator validation will run over more than 2 rows" in combined


def test_create_rft_passes_all_flags_into_request_body(rft_test_harness, monkeypatch):
project = rft_test_harness

Expand Down
Loading