From b5fcba5ce0fe2cf6a4fee7272b13b95c93e62f6c Mon Sep 17 00:00:00 2001 From: shfunc Date: Mon, 9 Feb 2026 20:59:10 +0100 Subject: [PATCH 1/5] surface remote eval errors instead of silently swallowing --- hud/cli/eval.py | 6 +++++- hud/eval/manager.py | 17 ++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 26db25ed..65487b12 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -703,6 +703,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: taskset=cfg.taskset, tasks=tasks_data, hud_eval_config=eval_cfg_dict, + strict=True, ) if cfg.taskset and ids: @@ -714,7 +715,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: for task_obj, task_version_id in zip(tasks_to_create, ids, strict=False): task_obj.id = task_version_id - await submit_rollouts( + trace_ids = await submit_rollouts( tasks=tasks, job_id=job_id, agent_type=cfg.agent_type, @@ -724,6 +725,9 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: use_byok=cfg.byok, ) + if not trace_ids: + raise ValueError("No tasks were accepted for execution. Check errors above.") + hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}") return [], tasks diff --git a/hud/eval/manager.py b/hud/eval/manager.py index 1a4635cb..37f1ba31 100644 --- a/hud/eval/manager.py +++ b/hud/eval/manager.py @@ -74,8 +74,14 @@ async def _send_job_enter( taskset: str | None = None, tasks: list[dict[str, Any]] | None = None, hud_eval_config: dict[str, Any] | None = None, + strict: bool = False, ) -> list[str] | None: - """Send job enter payload (async request before traces start).""" + """Send job enter payload (async request before traces start). + + Args: + strict: If True, raise ValueError on failure instead of returning None. + Use for remote execution where job registration is required. + """ import httpx from hud.eval.types import JobEnterPayload @@ -110,7 +116,16 @@ async def _send_job_enter( ids = data.get("task_version_ids") if isinstance(ids, list) and all(isinstance(x, str) for x in ids): return ids + else: + error_detail = resp.text[:500] if resp.text else f"HTTP {resp.status_code}" + if strict: + raise ValueError(f"Job registration failed: {error_detail}") + logger.warning("Job enter failed (%d): %s", resp.status_code, error_detail) + except ValueError: + raise except Exception as e: + if strict: + raise ValueError(f"Job registration failed: {e}") from e logger.warning("Failed to send job enter: %s", e) return None From 112afffed58e4be54d2a1bab5a860c2af25a77e8 Mon Sep 17 00:00:00 2001 From: shfunc Date: Mon, 9 Feb 2026 21:07:06 +0100 Subject: [PATCH 2/5] strict fix --- hud/eval/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hud/eval/manager.py b/hud/eval/manager.py index 37f1ba31..59edeed2 100644 --- a/hud/eval/manager.py +++ b/hud/eval/manager.py @@ -88,7 +88,7 @@ async def _send_job_enter( from hud.settings import settings api_key = api_key or settings.api_key - if not settings.telemetry_enabled or not api_key: + if not strict and (not settings.telemetry_enabled or not api_key): return None payload = JobEnterPayload( From bd7f86d94070bb9b1327eb484fcd8cf4ce3cb6a3 Mon Sep 17 00:00:00 2001 From: shfunc Date: Mon, 9 Feb 2026 21:15:56 +0100 Subject: [PATCH 3/5] fix --- hud/eval/manager.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hud/eval/manager.py b/hud/eval/manager.py index 59edeed2..bd1422fd 100644 --- a/hud/eval/manager.py +++ b/hud/eval/manager.py @@ -111,11 +111,15 @@ async def _send_job_enter( try: data = resp.json() except Exception: + if strict: + raise ValueError("Job registration failed: invalid response body") return None if isinstance(data, dict): ids = data.get("task_version_ids") if isinstance(ids, list) and all(isinstance(x, str) for x in ids): return ids + if strict: + raise ValueError("Job registration failed: missing task_version_ids in response") else: error_detail = resp.text[:500] if resp.text else f"HTTP {resp.status_code}" if strict: From 2f66d8923b5ca30c1018bc5c683936cb6e32d04e Mon Sep 17 00:00:00 2001 From: shfunc Date: Mon, 9 Feb 2026 21:24:21 +0100 Subject: [PATCH 4/5] ruff fix --- hud/eval/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hud/eval/manager.py b/hud/eval/manager.py index bd1422fd..19e874e8 100644 --- a/hud/eval/manager.py +++ b/hud/eval/manager.py @@ -112,7 +112,7 @@ async def _send_job_enter( data = resp.json() except Exception: if strict: - raise ValueError("Job registration failed: invalid response body") + raise ValueError("Job registration failed: invalid response body") from None return None if isinstance(data, dict): ids = data.get("task_version_ids") From 9980d62ff18308f44506e4eeaa7c371ec1fe8a71 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Tue, 10 Feb 2026 15:27:57 -0800 Subject: [PATCH 5/5] always raise, strict branching not needed --- hud/cli/eval.py | 1 - hud/eval/manager.py | 54 ++++++++++++++------------------------------- 2 files changed, 17 insertions(+), 38 deletions(-) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 65487b12..c2c6c906 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -703,7 +703,6 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: taskset=cfg.taskset, tasks=tasks_data, hud_eval_config=eval_cfg_dict, - strict=True, ) if cfg.taskset and ids: diff --git a/hud/eval/manager.py b/hud/eval/manager.py index 19e874e8..a3670db2 100644 --- a/hud/eval/manager.py +++ b/hud/eval/manager.py @@ -74,13 +74,11 @@ async def _send_job_enter( taskset: str | None = None, tasks: list[dict[str, Any]] | None = None, hud_eval_config: dict[str, Any] | None = None, - strict: bool = False, ) -> list[str] | None: """Send job enter payload (async request before traces start). - Args: - strict: If True, raise ValueError on failure instead of returning None. - Use for remote execution where job registration is required. + Returns task_version_ids on success, None if telemetry is disabled. + Raises on any failure (network, bad response, etc). """ import httpx @@ -88,7 +86,7 @@ async def _send_job_enter( from hud.settings import settings api_key = api_key or settings.api_key - if not strict and (not settings.telemetry_enabled or not api_key): + if not settings.telemetry_enabled or not api_key: return None payload = JobEnterPayload( @@ -100,38 +98,20 @@ async def _send_job_enter( hud_eval_config=hud_eval_config, ) - try: - async with httpx.AsyncClient(timeout=10.0) as client: - resp = await client.post( - f"{settings.hud_api_url}/trace/job/{job_id}/enter", - json=payload.model_dump(exclude_none=True), - headers={"Authorization": f"Bearer {api_key}"}, - ) - if resp.is_success: - try: - data = resp.json() - except Exception: - if strict: - raise ValueError("Job registration failed: invalid response body") from None - return None - if isinstance(data, dict): - ids = data.get("task_version_ids") - if isinstance(ids, list) and all(isinstance(x, str) for x in ids): - return ids - if strict: - raise ValueError("Job registration failed: missing task_version_ids in response") - else: - error_detail = resp.text[:500] if resp.text else f"HTTP {resp.status_code}" - if strict: - raise ValueError(f"Job registration failed: {error_detail}") - logger.warning("Job enter failed (%d): %s", resp.status_code, error_detail) - except ValueError: - raise - except Exception as e: - if strict: - raise ValueError(f"Job registration failed: {e}") from e - logger.warning("Failed to send job enter: %s", e) - return None + async with httpx.AsyncClient(timeout=10.0) as client: + resp = await client.post( + f"{settings.hud_api_url}/trace/job/{job_id}/enter", + json=payload.model_dump(exclude_none=True), + headers={"Authorization": f"Bearer {api_key}"}, + ) + + resp.raise_for_status() + data = resp.json() + if isinstance(data, dict): + ids = data.get("task_version_ids") + if isinstance(ids, list) and all(isinstance(x, str) for x in ids): + return ids + raise ValueError(f"Job registration failed: unexpected response: {data}") @asynccontextmanager