11 changes: 7 additions & 4 deletions .test/src/skill_test/agent/executor.py
@@ -197,8 +197,7 @@ def _load_mcp_config() -> dict[str, Any]:
             resolved_cfg[key] = val.replace("${CLAUDE_PLUGIN_ROOT}", str(repo_root))
         elif isinstance(val, list):
             resolved_cfg[key] = [
-                v.replace("${CLAUDE_PLUGIN_ROOT}", str(repo_root)) if isinstance(v, str) else v
-                for v in val
+                v.replace("${CLAUDE_PLUGIN_ROOT}", str(repo_root)) if isinstance(v, str) else v for v in val
             ]
         else:
             resolved_cfg[key] = val
@@ -283,7 +282,11 @@ def _get_agent_env() -> dict[str, str]:
 
     # 2. Env vars with known prefixes override settings file values
     # Skip internal Claude Code vars that would confuse the subprocess
-    _skip_keys = {"CLAUDE_CODE_SSE_PORT", "CLAUDE_CODE_ENTRYPOINT", "CLAUDE_CODE_DISABLE_FEEDBACK_SURVEY"}
+    _skip_keys = {
+        "CLAUDE_CODE_SSE_PORT",
+        "CLAUDE_CODE_ENTRYPOINT",
+        "CLAUDE_CODE_DISABLE_FEEDBACK_SURVEY",
+    }
     for key, value in os.environ.items():
         if key in _skip_keys:
             continue
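Reviewer note: the hunk above only re-wraps the skip set; the filtering semantics are unchanged. A minimal sketch of the surrounding behavior, where the prefix allowlist and function name are illustrative assumptions rather than the module's actual definitions:

import os

_ALLOWED_PREFIXES = ("DATABRICKS_", "ANTHROPIC_")  # assumed for illustration

_skip_keys = {
    "CLAUDE_CODE_SSE_PORT",
    "CLAUDE_CODE_ENTRYPOINT",
    "CLAUDE_CODE_DISABLE_FEEDBACK_SURVEY",
}

def build_agent_env(settings: dict[str, str]) -> dict[str, str]:
    """Start from settings-file values, then let prefixed process env vars override."""
    env = dict(settings)
    for key, value in os.environ.items():
        if key in _skip_keys:
            continue  # internal Claude Code vars would confuse the subprocess
        if key.startswith(_ALLOWED_PREFIXES):
            env[key] = value
    return env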
@@ -510,7 +513,7 @@ async def run_agent(
     # Pass Databricks auth env vars to MCP server processes
     if mcp_config:
         mcp_env = {k: v for k, v in env.items() if k.startswith(("DATABRICKS_",))}
-        for server_name, server_cfg in mcp_config.items():
+        for _server_name, server_cfg in mcp_config.items():
             if "env" not in server_cfg and mcp_env:
                 server_cfg["env"] = mcp_env
 
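The underscore rename just silences an unused-variable lint warning; behavior is identical. A self-contained sketch of what this loop does, with the config shape assumed from the diff:

def inject_mcp_env(mcp_config: dict[str, dict], env: dict[str, str]) -> None:
    """Give each MCP server the Databricks auth vars unless it declares its own env."""
    mcp_env = {k: v for k, v in env.items() if k.startswith(("DATABRICKS_",))}
    for _server_name, server_cfg in mcp_config.items():
        if "env" not in server_cfg and mcp_env:
            server_cfg["env"] = mcp_env

cfg = {"uc": {"command": "uv", "args": ["run", "server"]}}
inject_mcp_env(cfg, {"DATABRICKS_HOST": "https://example.cloud", "PATH": "/usr/bin"})
assert cfg["uc"]["env"] == {"DATABRICKS_HOST": "https://example.cloud"}

Note that a server that already sets "env" keeps it; the injected vars never overwrite an explicit per-server environment.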
67 changes: 38 additions & 29 deletions .test/src/skill_test/optimize/agent_evaluator.py
@@ -189,15 +189,9 @@ def __init__(
         )
 
         # --- Field-based judges (fallback — when mlflow_trace is None) ---
-        self._field_correctness_judge = create_correctness_judge(
-            skill_guidelines, judge_model=judge_model
-        )
-        self._field_completeness_judge = create_completeness_judge(
-            judge_model=judge_model
-        )
-        self._field_guideline_judge = create_guideline_adherence_judge(
-            skill_guidelines, judge_model=judge_model
-        )
+        self._field_correctness_judge = create_correctness_judge(skill_guidelines, judge_model=judge_model)
+        self._field_completeness_judge = create_completeness_judge(judge_model=judge_model)
+        self._field_guideline_judge = create_guideline_adherence_judge(skill_guidelines, judge_model=judge_model)
 
         self._regression_judge = create_regression_judge(judge_model=judge_model)
 
@@ -300,8 +294,7 @@ def _evaluate(
         facts_str = "\n".join(f"- {f}" for f in facts) if facts else "None specified"
         patterns_str = (
             "\n".join(
-                f"- {p}" if isinstance(p, str) else f"- {p.get('description', p.get('pattern', ''))}"
-                for p in patterns
+                f"- {p}" if isinstance(p, str) else f"- {p.get('description', p.get('pattern', ''))}" for p in patterns
             )
             if patterns
             else "None specified"
@@ -321,8 +314,12 @@
         # Circuit breaker: after first failure, skip trace judges entirely
         # to avoid wasting API calls on a model that can't handle them.
         def _judge_with_fallback(
-            trace_judge, field_judge, *,
-            mlflow_trace, response_text, judge_name,
+            trace_judge,
+            field_judge,
+            *,
+            mlflow_trace,
+            response_text,
+            judge_name,
         ) -> JudgeFeedback:
             """Try trace-based judge, fall back to field-based on failure."""
             with self._cache_lock:
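The comment above names the pattern: a circuit breaker around the trace judges. A minimal standalone sketch of the idea, assuming the failure state is a single lock-guarded boolean (the real class keeps per-judge caches and richer state):

import threading

class JudgeRunner:
    """Try the trace judge until it fails once; then go straight to the field judge."""

    def __init__(self):
        self._cache_lock = threading.Lock()
        self._trace_judges_broken = False  # assumed flag name, for illustration

    def judge_with_fallback(self, trace_judge, field_judge, **inputs):
        with self._cache_lock:
            skip_trace = self._trace_judges_broken
        if not skip_trace:
            try:
                return trace_judge(**inputs)
            except Exception:
                with self._cache_lock:
                    self._trace_judges_broken = True  # trip the breaker
        return field_judge(**inputs)  # field-based fallback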
@@ -357,7 +354,8 @@ def _judge_with_fallback(
 
         # Correctness: WITH + WITHOUT (WITHOUT cached)
         correctness_with_fb = _judge_with_fallback(
-            self._trace_correctness_judge, self._field_correctness_judge,
+            self._trace_correctness_judge,
+            self._field_correctness_judge,
             mlflow_trace=with_result.mlflow_trace,
             response_text=with_response,
             judge_name="correctness_with",
@@ -366,7 +364,8 @@
         need_correctness_baseline = baseline_key not in self._baseline_correctness_cache
         if need_correctness_baseline:
             fb = _judge_with_fallback(
-                self._trace_correctness_judge, self._field_correctness_judge,
+                self._trace_correctness_judge,
+                self._field_correctness_judge,
                 mlflow_trace=without_mlflow_trace,
                 response_text=without_response,
                 judge_name="correctness_without",
@@ -379,7 +378,8 @@
 
         # Completeness: WITH + WITHOUT (WITHOUT cached)
         completeness_with_fb = _judge_with_fallback(
-            self._trace_completeness_judge, self._field_completeness_judge,
+            self._trace_completeness_judge,
+            self._field_completeness_judge,
             mlflow_trace=with_result.mlflow_trace,
             response_text=with_response,
             judge_name="completeness_with",
@@ -388,7 +388,8 @@
         need_completeness_baseline = baseline_key not in self._baseline_completeness_cache
         if need_completeness_baseline:
             fb = _judge_with_fallback(
-                self._trace_completeness_judge, self._field_completeness_judge,
+                self._trace_completeness_judge,
+                self._field_completeness_judge,
                 mlflow_trace=without_mlflow_trace,
                 response_text=without_response,
                 judge_name="completeness_without",
@@ -401,7 +402,8 @@
 
         # Guideline adherence: WITH only
         guideline_adherence_fb = _judge_with_fallback(
-            self._trace_guideline_judge, self._field_guideline_judge,
+            self._trace_guideline_judge,
+            self._field_guideline_judge,
             mlflow_trace=with_result.mlflow_trace,
             response_text=with_response,
             judge_name="guideline_adherence",
@@ -444,7 +446,10 @@ def _judge_with_fallback(
         reg_val = regression_fb.value
         if isinstance(reg_val, bool):
             regression_penalty = 1.0 if reg_val else 0.0
-        elif isinstance(reg_val, str) and reg_val.strip().lower() in ("yes", "true"):
+        elif isinstance(reg_val, str) and reg_val.strip().lower() in (
+            "yes",
+            "true",
+        ):
             regression_penalty = 1.0
 
         # Phase 4: Deterministic fact/pattern assertions (zero LLM cost — static spine)
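For context, the regression verdict can arrive as a bool or as judge free-text; the hunk above only re-wraps the accepted-string tuple. A compact standalone illustration of the coercion (the function name is invented):

def regression_penalty_from(verdict) -> float:
    """Map a judge verdict (bool, or a 'yes'/'true' string) onto a 0/1 penalty."""
    if isinstance(verdict, bool):
        return 1.0 if verdict else 0.0
    if isinstance(verdict, str) and verdict.strip().lower() in ("yes", "true"):
        return 1.0
    return 0.0

assert regression_penalty_from(True) == 1.0
assert regression_penalty_from(" Yes ") == 1.0
assert regression_penalty_from("no") == 0.0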
@@ -481,16 +486,20 @@ def _judge_with_fallback(
         quality_composite = (correctness_with + completeness_with + guideline_adherence_score) / 3.0
         assertion_coverage = 0.5 * fact_score + 0.5 * pattern_score
 
-        final_score = max(0.0, min(1.0,
-            0.25 * effectiveness_delta
-            + 0.20 * correctness_with
-            + 0.15 * completeness_with
-            + 0.15 * guideline_adherence_score
-            + 0.10 * assertion_coverage
-            + 0.05 * execution_success
-            + 0.05 * token_efficiency
-            - 0.05 * regression_penalty
-        ))
+        final_score = max(
+            0.0,
+            min(
+                1.0,
+                0.25 * effectiveness_delta
+                + 0.20 * correctness_with
+                + 0.15 * completeness_with
+                + 0.15 * guideline_adherence_score
+                + 0.10 * assertion_coverage
+                + 0.05 * execution_success
+                + 0.05 * token_efficiency
+                - 0.05 * regression_penalty,
+            ),
+        )
 
         # Build rich side_info for GEPA reflection
         side_info: dict[str, Any] = {}
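To make the weighting concrete, here is a worked example with invented component scores (the weights are the ones in the diff; every input value is made up for illustration):

effectiveness_delta = 0.6
correctness_with = 0.9
completeness_with = 0.8
guideline_adherence_score = 1.0
assertion_coverage = 0.5
execution_success = 1.0
token_efficiency = 0.7
regression_penalty = 0.0

raw = (
    0.25 * effectiveness_delta          # 0.150
    + 0.20 * correctness_with           # 0.180
    + 0.15 * completeness_with          # 0.120
    + 0.15 * guideline_adherence_score  # 0.150
    + 0.10 * assertion_coverage         # 0.050
    + 0.05 * execution_success          # 0.050
    + 0.05 * token_efficiency           # 0.035
    - 0.05 * regression_penalty         # -0.000
)
final_score = max(0.0, min(1.0, raw))  # 0.735

Since the positive weights sum to 0.95, the min(1.0, ...) clamp only binds when some component exceeds 1.0, and the max(0.0, ...) guard matters only when the regression penalty outweighs a very low composite.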
13 changes: 3 additions & 10 deletions .test/src/skill_test/optimize/eval_criteria.py
@@ -92,11 +92,7 @@ def filter_by_modules(self, tool_modules: list[str]) -> "EvalCriteriaSet":
 
         Criteria with empty ``applies_to`` are always included (general-purpose).
         """
-        filtered = [
-            s
-            for s in self.skills
-            if not s.applies_to or any(m in s.applies_to for m in tool_modules)
-        ]
+        filtered = [s for s in self.skills if not s.applies_to or any(m in s.applies_to for m in tool_modules)]
         result = EvalCriteriaSet.__new__(EvalCriteriaSet)
         result.skills = filtered
         result._by_name = {s.name: s for s in filtered}
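The docstring's "always included" rule is easy to miss in the one-liner. A quick illustration of the selection predicate with stand-in records (the Criterion dataclass is invented for the example):

from dataclasses import dataclass, field

@dataclass
class Criterion:  # stand-in for the real per-skill criteria record
    name: str
    applies_to: list[str] = field(default_factory=list)

skills = [
    Criterion("general-style"),                   # empty applies_to -> always kept
    Criterion("sql-safety", applies_to=["sql"]),
    Criterion("plot-quality", applies_to=["plots"]),
]
tool_modules = ["sql"]

filtered = [s for s in skills if not s.applies_to or any(m in s.applies_to for m in tool_modules)]
assert [s.name for s in filtered] == ["general-style", "sql-safety"]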
@@ -134,8 +130,7 @@ def _to_markdown(self) -> str:
             lines.append(f"- **{s.name}**: {s.description}")
         lines.append("")
         lines.append(
-            "Use the read_eval_criteria tool to load relevant criteria. "
-            "Use read_eval_reference for detailed rubrics."
+            "Use the read_eval_criteria tool to load relevant criteria. Use read_eval_reference for detailed rubrics."
         )
         return "\n".join(lines)
 
@@ -184,9 +179,7 @@ def discover_eval_criteria(
     if not base.is_dir():
         logger.debug("Eval criteria directory not found: %s", base)
         return EvalCriteriaSet([])
-    paths = sorted(
-        d for d in base.iterdir() if d.is_dir() and (d / "SKILL.md").exists()
-    )
+    paths = sorted(d for d in base.iterdir() if d.is_dir() and (d / "SKILL.md").exists())
     if paths:
         logger.info(
             "Discovered %d eval criteria: %s",
43 changes: 13 additions & 30 deletions .test/src/skill_test/optimize/judges.py
@@ -87,6 +87,7 @@ def _is_rate_limit_error(exc: Exception) -> bool:
 # AI Gateway support
 # ---------------------------------------------------------------------------
 
+
 def _get_gateway_base_url() -> str | None:
     """Return the AI Gateway base URL if configured, else None.
 
@@ -645,7 +646,11 @@ def _call_judge(j):
                 fb = future.result(timeout=timeout)
             except concurrent.futures.TimeoutError:
                 fb_pool.shutdown(wait=False)
-                logger.warning("Fallback '%s' timed out after %ds, trying next", fallback_model, timeout)
+                logger.warning(
+                    "Fallback '%s' timed out after %ds, trying next",
+                    fallback_model,
+                    timeout,
+                )
                 continue
             finally:
                 fb_pool.shutdown(wait=False)
@@ -803,25 +808,15 @@ def create_trace_correctness_judge(
         CLI flag, ``GEPA_JUDGE_LM`` env var, or default.
     """
     criteria_block = eval_criteria.to_prompt(judge_model) if eval_criteria else ""
-    instructions = (
-        _TRACE_CORRECTNESS_INSTRUCTIONS_PREFIX
-        + criteria_block
-        + _TRACE_CORRECTNESS_INSTRUCTIONS_BODY
-    )
+    instructions = _TRACE_CORRECTNESS_INSTRUCTIONS_PREFIX + criteria_block + _TRACE_CORRECTNESS_INSTRUCTIONS_BODY
 
     if skill_guidelines:
-        filtered = [
-            g
-            for g in skill_guidelines
-            if any(kw in g.lower() for kw in _CORRECTNESS_KEYWORDS)
-        ]
+        filtered = [g for g in skill_guidelines if any(kw in g.lower() for kw in _CORRECTNESS_KEYWORDS)]
         if filtered:
             principles = "\n".join(f"- {g}" for g in filtered)
             instructions += f"\n\n## Domain Correctness Principles\n{principles}\n"
 
-    model_uri, inference_params = _to_judge_model_and_params(
-        judge_model or DEFAULT_JUDGE_LM
-    )
+    model_uri, inference_params = _to_judge_model_and_params(judge_model or DEFAULT_JUDGE_LM)
     return make_judge(
         name="trace_correctness",
         model=model_uri,
@@ -844,15 +839,9 @@ def create_trace_completeness_judge(
         judge_model: LLM model for the judge.
     """
     criteria_block = eval_criteria.to_prompt(judge_model) if eval_criteria else ""
-    instructions = (
-        _TRACE_COMPLETENESS_INSTRUCTIONS_PREFIX
-        + criteria_block
-        + _TRACE_COMPLETENESS_INSTRUCTIONS_BODY
-    )
+    instructions = _TRACE_COMPLETENESS_INSTRUCTIONS_PREFIX + criteria_block + _TRACE_COMPLETENESS_INSTRUCTIONS_BODY
 
-    model_uri, inference_params = _to_judge_model_and_params(
-        judge_model or DEFAULT_JUDGE_LM
-    )
+    model_uri, inference_params = _to_judge_model_and_params(judge_model or DEFAULT_JUDGE_LM)
     return make_judge(
         name="trace_completeness",
         model=model_uri,
@@ -879,19 +868,13 @@ def create_trace_guideline_judge(
         judge_model: LLM model for the judge.
     """
     criteria_block = eval_criteria.to_prompt(judge_model) if eval_criteria else ""
-    instructions = (
-        _TRACE_GUIDELINE_INSTRUCTIONS_PREFIX
-        + criteria_block
-        + _TRACE_GUIDELINE_INSTRUCTIONS_BODY
-    )
+    instructions = _TRACE_GUIDELINE_INSTRUCTIONS_PREFIX + criteria_block + _TRACE_GUIDELINE_INSTRUCTIONS_BODY
 
     if skill_guidelines:
         principles = "\n".join(f"- {g}" for g in skill_guidelines)
         instructions += f"\n\n## Required Guidelines\n{principles}\n"
 
-    model_uri, inference_params = _to_judge_model_and_params(
-        judge_model or DEFAULT_JUDGE_LM
-    )
+    model_uri, inference_params = _to_judge_model_and_params(judge_model or DEFAULT_JUDGE_LM)
     return make_judge(
         name="trace_guideline_adherence",
         model=model_uri,