diff --git a/.test/src/skill_test/agent/executor.py b/.test/src/skill_test/agent/executor.py
index 7726d6af..283174f4 100644
--- a/.test/src/skill_test/agent/executor.py
+++ b/.test/src/skill_test/agent/executor.py
@@ -197,8 +197,7 @@ def _load_mcp_config() -> dict[str, Any]:
             resolved_cfg[key] = val.replace("${CLAUDE_PLUGIN_ROOT}", str(repo_root))
         elif isinstance(val, list):
             resolved_cfg[key] = [
-                v.replace("${CLAUDE_PLUGIN_ROOT}", str(repo_root)) if isinstance(v, str) else v
-                for v in val
+                v.replace("${CLAUDE_PLUGIN_ROOT}", str(repo_root)) if isinstance(v, str) else v for v in val
             ]
         else:
             resolved_cfg[key] = val
@@ -283,7 +282,11 @@ def _get_agent_env() -> dict[str, str]:
 
     # 2. Env vars with known prefixes override settings file values
     # Skip internal Claude Code vars that would confuse the subprocess
-    _skip_keys = {"CLAUDE_CODE_SSE_PORT", "CLAUDE_CODE_ENTRYPOINT", "CLAUDE_CODE_DISABLE_FEEDBACK_SURVEY"}
+    _skip_keys = {
+        "CLAUDE_CODE_SSE_PORT",
+        "CLAUDE_CODE_ENTRYPOINT",
+        "CLAUDE_CODE_DISABLE_FEEDBACK_SURVEY",
+    }
     for key, value in os.environ.items():
         if key in _skip_keys:
             continue
@@ -510,7 +513,7 @@ async def run_agent(
 
     # Pass Databricks auth env vars to MCP server processes
     if mcp_config:
         mcp_env = {k: v for k, v in env.items() if k.startswith(("DATABRICKS_",))}
-        for server_name, server_cfg in mcp_config.items():
+        for _server_name, server_cfg in mcp_config.items():
             if "env" not in server_cfg and mcp_env:
                 server_cfg["env"] = mcp_env
diff --git a/.test/src/skill_test/optimize/agent_evaluator.py b/.test/src/skill_test/optimize/agent_evaluator.py
index 5cfa4feb..2651a4ab 100644
--- a/.test/src/skill_test/optimize/agent_evaluator.py
+++ b/.test/src/skill_test/optimize/agent_evaluator.py
@@ -189,15 +189,9 @@ def __init__(
         )
 
         # --- Field-based judges (fallback — when mlflow_trace is None) ---
-        self._field_correctness_judge = create_correctness_judge(
-            skill_guidelines, judge_model=judge_model
-        )
-        self._field_completeness_judge = create_completeness_judge(
-            judge_model=judge_model
-        )
-        self._field_guideline_judge = create_guideline_adherence_judge(
-            skill_guidelines, judge_model=judge_model
-        )
+        self._field_correctness_judge = create_correctness_judge(skill_guidelines, judge_model=judge_model)
+        self._field_completeness_judge = create_completeness_judge(judge_model=judge_model)
+        self._field_guideline_judge = create_guideline_adherence_judge(skill_guidelines, judge_model=judge_model)
 
         self._regression_judge = create_regression_judge(judge_model=judge_model)
 
@@ -300,8 +294,7 @@ def _evaluate(
         facts_str = "\n".join(f"- {f}" for f in facts) if facts else "None specified"
         patterns_str = (
             "\n".join(
-                f"- {p}" if isinstance(p, str) else f"- {p.get('description', p.get('pattern', ''))}"
-                for p in patterns
+                f"- {p}" if isinstance(p, str) else f"- {p.get('description', p.get('pattern', ''))}" for p in patterns
             )
             if patterns
             else "None specified"
@@ -321,8 +314,12 @@
         # Circuit breaker: after first failure, skip trace judges entirely
         # to avoid wasting API calls on a model that can't handle them.
         def _judge_with_fallback(
-            trace_judge, field_judge, *,
-            mlflow_trace, response_text, judge_name,
+            trace_judge,
+            field_judge,
+            *,
+            mlflow_trace,
+            response_text,
+            judge_name,
         ) -> JudgeFeedback:
             """Try trace-based judge, fall back to field-based on failure."""
             with self._cache_lock:
@@ -357,7 +354,8 @@ def _judge_with_fallback(
 
         # Correctness: WITH + WITHOUT (WITHOUT cached)
         correctness_with_fb = _judge_with_fallback(
-            self._trace_correctness_judge, self._field_correctness_judge,
+            self._trace_correctness_judge,
+            self._field_correctness_judge,
             mlflow_trace=with_result.mlflow_trace,
             response_text=with_response,
             judge_name="correctness_with",
@@ -366,7 +364,8 @@ def _judge_with_fallback(
         need_correctness_baseline = baseline_key not in self._baseline_correctness_cache
         if need_correctness_baseline:
             fb = _judge_with_fallback(
-                self._trace_correctness_judge, self._field_correctness_judge,
+                self._trace_correctness_judge,
+                self._field_correctness_judge,
                 mlflow_trace=without_mlflow_trace,
                 response_text=without_response,
                 judge_name="correctness_without",
@@ -379,7 +378,8 @@ def _judge_with_fallback(
 
         # Completeness: WITH + WITHOUT (WITHOUT cached)
         completeness_with_fb = _judge_with_fallback(
-            self._trace_completeness_judge, self._field_completeness_judge,
+            self._trace_completeness_judge,
+            self._field_completeness_judge,
             mlflow_trace=with_result.mlflow_trace,
             response_text=with_response,
             judge_name="completeness_with",
@@ -388,7 +388,8 @@ def _judge_with_fallback(
         need_completeness_baseline = baseline_key not in self._baseline_completeness_cache
         if need_completeness_baseline:
             fb = _judge_with_fallback(
-                self._trace_completeness_judge, self._field_completeness_judge,
+                self._trace_completeness_judge,
+                self._field_completeness_judge,
                 mlflow_trace=without_mlflow_trace,
                 response_text=without_response,
                 judge_name="completeness_without",
@@ -401,7 +402,8 @@ def _judge_with_fallback(
 
         # Guideline adherence: WITH only
         guideline_adherence_fb = _judge_with_fallback(
-            self._trace_guideline_judge, self._field_guideline_judge,
+            self._trace_guideline_judge,
+            self._field_guideline_judge,
             mlflow_trace=with_result.mlflow_trace,
             response_text=with_response,
             judge_name="guideline_adherence",
@@ -444,7 +446,10 @@ def _judge_with_fallback(
         reg_val = regression_fb.value
         if isinstance(reg_val, bool):
             regression_penalty = 1.0 if reg_val else 0.0
-        elif isinstance(reg_val, str) and reg_val.strip().lower() in ("yes", "true"):
+        elif isinstance(reg_val, str) and reg_val.strip().lower() in (
+            "yes",
+            "true",
+        ):
             regression_penalty = 1.0
 
         # Phase 4: Deterministic fact/pattern assertions (zero LLM cost — static spine)
@@ -481,16 +486,20 @@ def _judge_with_fallback(
         quality_composite = (correctness_with + completeness_with + guideline_adherence_score) / 3.0
         assertion_coverage = 0.5 * fact_score + 0.5 * pattern_score
 
-        final_score = max(0.0, min(1.0,
-            0.25 * effectiveness_delta
-            + 0.20 * correctness_with
-            + 0.15 * completeness_with
-            + 0.15 * guideline_adherence_score
-            + 0.10 * assertion_coverage
-            + 0.05 * execution_success
-            + 0.05 * token_efficiency
-            - 0.05 * regression_penalty
-        ))
+        final_score = max(
+            0.0,
+            min(
+                1.0,
+                0.25 * effectiveness_delta
+                + 0.20 * correctness_with
+                + 0.15 * completeness_with
+                + 0.15 * guideline_adherence_score
+                + 0.10 * assertion_coverage
+                + 0.05 * execution_success
+                + 0.05 * token_efficiency
+                - 0.05 * regression_penalty,
+            ),
+        )
 
         # Build rich side_info for GEPA reflection
         side_info: dict[str, Any] = {}
diff --git a/.test/src/skill_test/optimize/eval_criteria.py b/.test/src/skill_test/optimize/eval_criteria.py
index 050d33ad..04242501 100644
--- a/.test/src/skill_test/optimize/eval_criteria.py
+++ b/.test/src/skill_test/optimize/eval_criteria.py
@@ -92,11 +92,7 @@ def filter_by_modules(self, tool_modules: list[str]) -> "EvalCriteriaSet":
 
         Criteria with empty ``applies_to`` are always included (general-purpose).
         """
-        filtered = [
-            s
-            for s in self.skills
-            if not s.applies_to or any(m in s.applies_to for m in tool_modules)
-        ]
+        filtered = [s for s in self.skills if not s.applies_to or any(m in s.applies_to for m in tool_modules)]
         result = EvalCriteriaSet.__new__(EvalCriteriaSet)
         result.skills = filtered
         result._by_name = {s.name: s for s in filtered}
@@ -134,8 +130,7 @@ def _to_markdown(self) -> str:
             lines.append(f"- **{s.name}**: {s.description}")
         lines.append("")
         lines.append(
-            "Use the read_eval_criteria tool to load relevant criteria. "
-            "Use read_eval_reference for detailed rubrics."
+            "Use the read_eval_criteria tool to load relevant criteria. Use read_eval_reference for detailed rubrics."
         )
 
         return "\n".join(lines)
@@ -184,9 +179,7 @@ def discover_eval_criteria(
     if not base.is_dir():
         logger.debug("Eval criteria directory not found: %s", base)
         return EvalCriteriaSet([])
-    paths = sorted(
-        d for d in base.iterdir() if d.is_dir() and (d / "SKILL.md").exists()
-    )
+    paths = sorted(d for d in base.iterdir() if d.is_dir() and (d / "SKILL.md").exists())
     if paths:
         logger.info(
             "Discovered %d eval criteria: %s",
diff --git a/.test/src/skill_test/optimize/judges.py b/.test/src/skill_test/optimize/judges.py
index 18940d28..019bc16b 100644
--- a/.test/src/skill_test/optimize/judges.py
+++ b/.test/src/skill_test/optimize/judges.py
@@ -87,6 +87,7 @@ def _is_rate_limit_error(exc: Exception) -> bool:
 # AI Gateway support
 # ---------------------------------------------------------------------------
 
+
 def _get_gateway_base_url() -> str | None:
     """Return the AI Gateway base URL if configured, else None.
 
@@ -645,7 +646,11 @@ def _call_judge(j):
                 fb = future.result(timeout=timeout)
             except concurrent.futures.TimeoutError:
                 fb_pool.shutdown(wait=False)
-                logger.warning("Fallback '%s' timed out after %ds, trying next", fallback_model, timeout)
+                logger.warning(
+                    "Fallback '%s' timed out after %ds, trying next",
+                    fallback_model,
+                    timeout,
+                )
                 continue
             finally:
                 fb_pool.shutdown(wait=False)
@@ -803,25 +808,15 @@ def create_trace_correctness_judge(
         CLI flag, ``GEPA_JUDGE_LM`` env var, or default.
     """
     criteria_block = eval_criteria.to_prompt(judge_model) if eval_criteria else ""
-    instructions = (
-        _TRACE_CORRECTNESS_INSTRUCTIONS_PREFIX
-        + criteria_block
-        + _TRACE_CORRECTNESS_INSTRUCTIONS_BODY
-    )
+    instructions = _TRACE_CORRECTNESS_INSTRUCTIONS_PREFIX + criteria_block + _TRACE_CORRECTNESS_INSTRUCTIONS_BODY
 
     if skill_guidelines:
-        filtered = [
-            g
-            for g in skill_guidelines
-            if any(kw in g.lower() for kw in _CORRECTNESS_KEYWORDS)
-        ]
+        filtered = [g for g in skill_guidelines if any(kw in g.lower() for kw in _CORRECTNESS_KEYWORDS)]
         if filtered:
             principles = "\n".join(f"- {g}" for g in filtered)
             instructions += f"\n\n## Domain Correctness Principles\n{principles}\n"
 
-    model_uri, inference_params = _to_judge_model_and_params(
-        judge_model or DEFAULT_JUDGE_LM
-    )
+    model_uri, inference_params = _to_judge_model_and_params(judge_model or DEFAULT_JUDGE_LM)
     return make_judge(
         name="trace_correctness",
         model=model_uri,
@@ -844,15 +839,9 @@ def create_trace_completeness_judge(
         judge_model: LLM model for the judge.
""" criteria_block = eval_criteria.to_prompt(judge_model) if eval_criteria else "" - instructions = ( - _TRACE_COMPLETENESS_INSTRUCTIONS_PREFIX - + criteria_block - + _TRACE_COMPLETENESS_INSTRUCTIONS_BODY - ) + instructions = _TRACE_COMPLETENESS_INSTRUCTIONS_PREFIX + criteria_block + _TRACE_COMPLETENESS_INSTRUCTIONS_BODY - model_uri, inference_params = _to_judge_model_and_params( - judge_model or DEFAULT_JUDGE_LM - ) + model_uri, inference_params = _to_judge_model_and_params(judge_model or DEFAULT_JUDGE_LM) return make_judge( name="trace_completeness", model=model_uri, @@ -879,19 +868,13 @@ def create_trace_guideline_judge( judge_model: LLM model for the judge. """ criteria_block = eval_criteria.to_prompt(judge_model) if eval_criteria else "" - instructions = ( - _TRACE_GUIDELINE_INSTRUCTIONS_PREFIX - + criteria_block - + _TRACE_GUIDELINE_INSTRUCTIONS_BODY - ) + instructions = _TRACE_GUIDELINE_INSTRUCTIONS_PREFIX + criteria_block + _TRACE_GUIDELINE_INSTRUCTIONS_BODY if skill_guidelines: principles = "\n".join(f"- {g}" for g in skill_guidelines) instructions += f"\n\n## Required Guidelines\n{principles}\n" - model_uri, inference_params = _to_judge_model_and_params( - judge_model or DEFAULT_JUDGE_LM - ) + model_uri, inference_params = _to_judge_model_and_params(judge_model or DEFAULT_JUDGE_LM) return make_judge( name="trace_guideline_adherence", model=model_uri, diff --git a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py index 223e63cb..de97ad48 100644 --- a/.test/src/skill_test/optimize/runner.py +++ b/.test/src/skill_test/optimize/runner.py @@ -164,13 +164,20 @@ def _eval_task(idx, inst, task_id): idx = futures[future] task_id = tasks[idx].get("id", f"task_{idx}") inst = gepa_instances[idx] - score, side_info = 0.0, {"_error": str(e), "scores": {"final": 0.0}} + score, side_info = ( + 0.0, + {"_error": str(e), "scores": {"final": 0.0}}, + ) logger.warning("Evaluator failed for task %s: %s", task_id, e) per_task[task_id] = score side_info_by_id[task_id] = side_info side_info_by_input[inst.get("input", f"task_{idx}")] = side_info completed += 1 - print(f"\r {label}: {completed}/{total} ({task_id})...", end="", flush=True) + print( + f"\r {label}: {completed}/{total} ({task_id})...", + end="", + flush=True, + ) except TimeoutError: # as_completed timeout — score remaining tasks as 0.0 for future, idx in futures.items(): @@ -179,7 +186,11 @@ def _eval_task(idx, inst, task_id): inst = gepa_instances[idx] per_task.setdefault(task_id, 0.0) side_info_by_id.setdefault( - task_id, {"_error": "as_completed timeout (900s)", "scores": {"final": 0.0}} + task_id, + { + "_error": "as_completed timeout (900s)", + "scores": {"final": 0.0}, + }, ) side_info_by_input.setdefault(inst.get("input", f"task_{idx}"), side_info_by_id[task_id]) future.cancel() @@ -535,6 +546,7 @@ def optimize_skill( if _manifest_path.exists(): try: import yaml as _yaml + _manifest_data = _yaml.safe_load(_manifest_path.read_text()) or {} _manifest_tool_modules = _manifest_data.get("tool_modules") except Exception: @@ -643,7 +655,10 @@ def _refiner_lm_with_fallback(prompt): print(f"\nScoring baseline ({len(train)} tasks, ~5 LLM calls each)...") original_score, original_per_task, si_by_id, _ = _evaluate_on_tasks( - evaluator, seed_candidate, train, label="Baseline", + evaluator, + seed_candidate, + train, + label="Baseline", max_parallel=_eval_max_parallel, ) print(f"Current score: {original_score:.3f}") @@ -668,7 +683,10 @@ def _refiner_lm_with_fallback(prompt): if agent_evaluator: print(f"\nAgent 
baseline ({len(train)} tasks)...") dry_run_agent_score, agent_per_task, dry_run_agent_si, _ = _evaluate_on_tasks( - agent_evaluator, seed_candidate, train, label="Agent baseline", + agent_evaluator, + seed_candidate, + train, + label="Agent baseline", max_parallel=parallel_agents, ) print(f"Agent baseline score: {dry_run_agent_score:.3f}") @@ -704,13 +722,17 @@ def _refiner_lm_with_fallback(prompt): _eval_desc = "2 agent runs + judges" if agent_eval_full else "~5 LLM calls" print(f"\nScoring {_eval_label.lower()} ({len(train)} tasks, {_eval_desc} each)...") original_score, original_per_task, si_by_id, si_by_input = _evaluate_on_tasks( - evaluator, seed_candidate, train, label=_eval_label, + evaluator, + seed_candidate, + train, + label=_eval_label, max_parallel=_eval_max_parallel, ) # 6. Build background and objective if agent_eval_full: from .agent_evaluator import build_agent_eval_background + background = build_agent_eval_background( skill_name, total_original_tokens, @@ -764,7 +786,10 @@ def _refiner_lm_with_fallback(prompt): if agent_evaluator and not agent_eval_full: print(f"\n Agent baseline scoring ({len(train)} tasks)...") agent_baseline_score, agent_baseline_per_task, agent_baseline_si, _ = _evaluate_on_tasks( - agent_evaluator, seed_candidate, train, label="Agent baseline", + agent_evaluator, + seed_candidate, + train, + label="Agent baseline", max_parallel=parallel_agents, ) print(f" Agent baseline score: {agent_baseline_score:.3f}") @@ -826,7 +851,10 @@ def _refiner_lm_with_fallback(prompt): candidate = result.best_candidate pass_score, _, pass_si_by_id, _ = _evaluate_on_tasks( - evaluator, candidate, train, label=f"Pass {pass_num}", + evaluator, + candidate, + train, + label=f"Pass {pass_num}", max_parallel=_eval_max_parallel, ) improvement = pass_score - best_score @@ -859,7 +887,10 @@ def _refiner_lm_with_fallback(prompt): val_scores: dict[str, float] = {} if val: _, val_scores, _, _ = _evaluate_on_tasks( - evaluator, best, val, label="Validation", + evaluator, + best, + val, + label="Validation", max_parallel=_eval_max_parallel, ) @@ -878,7 +909,10 @@ def _refiner_lm_with_fallback(prompt): if agent_evaluator and not agent_eval_full: print(f"\n Agent validation scoring ({len(train)} tasks on best candidate)...") agent_validation_score, agent_val_per_task, agent_validation_si, _ = _evaluate_on_tasks( - agent_evaluator, best, train, label="Agent validation", + agent_evaluator, + best, + train, + label="Agent validation", max_parallel=parallel_agents, ) print(f" Agent validation score: {agent_validation_score:.3f}") diff --git a/databricks-mcp-server/tests/test_sql_output_format.py b/databricks-mcp-server/tests/test_sql_output_format.py index 5dadb5de..9b678abd 100644 --- a/databricks-mcp-server/tests/test_sql_output_format.py +++ b/databricks-mcp-server/tests/test_sql_output_format.py @@ -73,6 +73,4 @@ def test_markdown_smaller_than_json(self): md = _format_results_markdown(rows) js = json.dumps(rows) # Markdown should be at least 30% smaller - assert len(md) < len(js) * 0.7, ( - f"Markdown ({len(md)} chars) should be <70% of JSON ({len(js)} chars)" - ) + assert len(md) < len(js) * 0.7, f"Markdown ({len(md)} chars) should be <70% of JSON ({len(js)} chars)" diff --git a/install.sh b/install.sh index 1927b4ac..32953478 100755 --- a/install.sh +++ b/install.sh @@ -1086,7 +1086,7 @@ install_skills() { # Determine target directories (array so paths with spaces work) for tool in $TOOLS; do case $tool in - claude) dirs=("$base_dir/.claude/skills") ;; + claude) 
dirs+=("$base_dir/.claude/skills") ;; cursor) echo "$TOOLS" | grep -q claude || dirs+=("$base_dir/.cursor/skills") ;; copilot) dirs+=("$base_dir/.github/skills") ;; codex) dirs+=("$base_dir/.agents/skills") ;;