diff --git a/weco/api.py b/weco/api.py index af7c7cd..cc1e2f1 100644 --- a/weco/api.py +++ b/weco/api.py @@ -90,12 +90,17 @@ def resume_optimization_run( run_id: str, auth_headers: dict = {}, api_keys: Optional[Dict[str, str]] = None, + steps: Optional[int] = None, timeout: Union[int, Tuple[int, int]] = (5, 10), ) -> Optional[Dict[str, Any]]: - """Request the backend to resume an interrupted run.""" + """Request the backend to resume an interrupted or completed run. + + When ``steps`` is provided, the backend resets the run's budget to + ``last_step + steps``. Required for runs already in the ``completed`` state. + """ with console.status("[bold green]Resuming run..."): try: - return WecoClient(auth_headers).resume_run(run_id, api_keys=api_keys) + return WecoClient(auth_headers).resume_run(run_id, api_keys=api_keys, steps=steps) except requests.exceptions.HTTPError as e: handle_api_error(e, console) return None diff --git a/weco/cli.py b/weco/cli.py index 722d808..13babbd 100644 --- a/weco/cli.py +++ b/weco/cli.py @@ -353,6 +353,17 @@ def configure_resume_parser(resume_parser: argparse.ArgumentParser) -> None: action="store_true", help="Automatically apply the best solution to the source file without prompting", ) + resume_parser.add_argument( + "-n", + "--steps", + type=int, + default=None, + help=( + "Run this many more evaluations from the last node. " + "Required when resuming a completed run; optional for terminated/error runs " + "(omit to keep the original budget)." + ), + ) default_api_keys = " ".join([f"{provider}=xxx" for provider, _ in DEFAULT_MODELS]) supported_providers = ", ".join([provider for provider, _ in DEFAULT_MODELS]) @@ -575,6 +586,7 @@ def execute_resume_command(args: argparse.Namespace) -> None: output_mode=args.output, submit_timeout=getattr(args, "submit_timeout", None), auto_resume_policy=auto_resume_policy, + additional_steps=args.steps, ) sys.exit(0 if success else 1) diff --git a/weco/core/api.py b/weco/core/api.py index 8d8703a..f43b9e5 100644 --- a/weco/core/api.py +++ b/weco/core/api.py @@ -341,8 +341,12 @@ def get_lineage(self, lineage_id: str) -> dict: resp.raise_for_status() return resp.json() - def resume_run(self, run_id: str, *, api_keys: dict[str, str] | None = None) -> dict: - """``POST /runs/{run_id}/resume`` — resume an interrupted run. + def resume_run(self, run_id: str, *, api_keys: dict[str, str] | None = None, steps: int | None = None) -> dict: + """``POST /runs/{run_id}/resume`` — resume an interrupted or completed run. + + When ``steps`` is provided, the backend resets the run's budget to + ``last_step + steps`` and produces the next candidate. Required for + runs already in the ``completed`` state. Raises: requests.exceptions.HTTPError: On non-2xx responses. @@ -350,6 +354,8 @@ def resume_run(self, run_id: str, *, api_keys: dict[str, str] | None = None) -> body: dict[str, Any] = {"metadata": {"client_name": "cli", "client_version": __pkg_version__}} if api_keys: body["api_keys"] = api_keys + if steps is not None: + body["steps"] = steps resp = self._post(f"/runs/{run_id}/resume", json=body, timeout=(5, 10)) resp.raise_for_status() return resp.json() diff --git a/weco/optimizer.py b/weco/optimizer.py index a63ab35..6114f4a 100644 --- a/weco/optimizer.py +++ b/weco/optimizer.py @@ -406,9 +406,10 @@ def resume_optimization( output_mode: str = "rich", submit_timeout: Optional[int] = None, auto_resume_policy: Optional[AutoResumePolicy] = None, + additional_steps: Optional[int] = None, ) -> bool: """ - Resume an interrupted run using the queue-based optimization loop. + Resume an interrupted or completed run using the queue-based optimization loop. Polls for execution tasks, executes locally, and submits results. Uses the execution queue flow instead of the legacy direct flow. @@ -419,6 +420,8 @@ def resume_optimization( poll_interval: Seconds between polling attempts. apply_change: If True, automatically apply best solution; if False, prompt user. output_mode: "rich" for interactive terminal UI, "plain" for machine-readable output. + additional_steps: If set, run this many more evaluations from the last node. + Required when resuming a completed run; optional for terminated/error. Returns: True if optimization completed successfully, False otherwise. @@ -438,10 +441,14 @@ def resume_optimization( return False run_status_val = status.get("status") - if run_status_val not in ("error", "terminated"): + if run_status_val == "completed": + if additional_steps is None: + console.print(f"[yellow]Run {run_id} is already complete. Pass --steps N to resume with N more evaluations.[/]") + return False + elif run_status_val not in ("error", "terminated"): console.print( f"[yellow]Run {run_id} cannot be resumed (status: {run_status_val}). " - f"Only 'error' or 'terminated' runs can be resumed.[/]" + f"Only 'error', 'terminated', or 'completed' runs can be resumed.[/]" ) return False @@ -467,7 +474,17 @@ def resume_optimization( console.print(f" Objective: {metric_name} ({'maximize' if maximize else 'minimize'})") console.print(f" Model: {model_name}") console.print(f" Eval Command: {eval_command}") - console.print(f" Total Steps: {total_steps} | Current Step: {current_step} | Steps Remaining: {steps_remaining}") + if additional_steps is not None: + new_total = current_step + additional_steps + plural = "s" if additional_steps != 1 else "" + console.print( + f" Total Steps: {total_steps} -> [bold]{new_total}[/] | Current Step: {current_step} | " + f"Will run: [bold]{additional_steps}[/] more evaluation{plural} (--steps)" + ) + if new_total < total_steps: + console.print(f" [yellow]Note: this shrinks the original budget ({total_steps}) to {new_total}.[/]") + else: + console.print(f" Total Steps: {total_steps} | Current Step: {current_step} | Steps Remaining: {steps_remaining}") console.print(f" Last Updated: {status.get('updated_at', 'N/A')}") unchanged = Confirm.ask( @@ -477,11 +494,20 @@ def resume_optimization( console.print("[yellow]Resume cancelled. Please start a new run if the environment changed.[/]") return False - # Call backend to prepare resume (this sets status to 'running') - resume_resp = resume_optimization_run(console=console, run_id=run_id, auth_headers=auth_headers, api_keys=api_keys) + # Call backend to prepare resume (this sets status to 'running' and, when + # additional_steps is provided, resets run.steps to last_step + additional_steps) + resume_resp = resume_optimization_run( + console=console, run_id=run_id, auth_headers=auth_headers, api_keys=api_keys, steps=additional_steps + ) if resume_resp is None: return False + # Refresh total_steps from the resume response — when the budget was + # extended, the response carries the new total. + response_steps = resume_resp.get("steps") + if response_steps is not None: + total_steps = int(response_steps) + log_dir = resume_resp.get("log_dir", ".runs") save_logs = bool(resume_resp.get("save_logs", False)) eval_timeout = resume_resp.get("eval_timeout")