diff --git a/anton/chat.py b/anton/chat.py index 5ce72fd7..9cd2ef67 100644 --- a/anton/chat.py +++ b/anton/chat.py @@ -1271,6 +1271,7 @@ async def _chat_loop( history_store=history_store, session_id=current_session_id, proactive_dashboards=settings.proactive_dashboards, + act_first=settings.act_first, output_dir=settings.artifacts_dir, tools=[CONNECT_DATASOURCE_TOOL, PUBLISH_TOOL], web_search_enabled=settings.web_search_enabled, diff --git a/anton/chat_session.py b/anton/chat_session.py index 0f7ac11a..200bd641 100644 --- a/anton/chat_session.py +++ b/anton/chat_session.py @@ -116,6 +116,7 @@ def rebuild_session( history_store=history_store, session_id=session_id, proactive_dashboards=settings.proactive_dashboards, + act_first=settings.act_first, output_dir=settings.artifacts_dir, web_search_enabled=settings.web_search_enabled, web_fetch_enabled=settings.web_fetch_enabled, diff --git a/anton/config/settings.py b/anton/config/settings.py index 31257809..fc930403 100644 --- a/anton/config/settings.py +++ b/anton/config/settings.py @@ -85,6 +85,10 @@ class AntonSettings(CoreSettings): proactive_dashboards: bool = False # when True, build HTML dashboards; when False, CLI output only + # "Do first, ask later": act on reasonable defaults and surface assumptions + # inline instead of stopping to ask. False = cautious ask-first discipline. + act_first: bool = True + theme: str = "auto" disable_autoupdates: bool = False diff --git a/anton/core/backends/utils.py b/anton/core/backends/utils.py index 07cd1796..0d87b8ff 100644 --- a/anton/core/backends/utils.py +++ b/anton/core/backends/utils.py @@ -8,7 +8,23 @@ def compute_timeouts(estimated_seconds: int) -> tuple[float, float]: """ s = CoreSettings() if estimated_seconds <= 0: - return float(s.cell_timeout_default), float(s.cell_inactivity_timeout) - total = max(estimated_seconds * 2, estimated_seconds + 30) - inactivity = max(estimated_seconds * 0.5, 30) - return float(total), float(inactivity) \ No newline at end of file + total = float(s.cell_timeout_default) + inactivity = float(s.cell_inactivity_timeout) + else: + total = float(max(estimated_seconds * 2, estimated_seconds + 30)) + inactivity = float(max(estimated_seconds * 0.5, 30)) + # Clamp the silence window: a large estimate must not buy minutes of + # undetected silence (an est=600 cell would otherwise allow 300s of no + # output before being killed). A cell quiet for cell_inactivity_max + # seconds is killed regardless of its estimate. stdout/progress() reset + # this window, so legitimate long-but-active cells — e.g. a batch loop + # pinging progress() — are unaffected; only genuinely stuck cells die. + inactivity = min(inactivity, float(s.cell_inactivity_max)) + # The total is deliberately left scaling so long-but-active cells run to + # completion. cell_total_max (default 0 = off) is an optional absolute + # backstop for a runaway that keeps producing output forever (which the + # inactivity cap can't catch); set it only when that risk outweighs + # clipping a genuinely long batch job. + if s.cell_total_max > 0: + total = min(total, float(s.cell_total_max)) + return total, inactivity \ No newline at end of file diff --git a/anton/core/dispatch/local_runtime.py b/anton/core/dispatch/local_runtime.py index 5430f056..63e8d63a 100644 --- a/anton/core/dispatch/local_runtime.py +++ b/anton/core/dispatch/local_runtime.py @@ -442,6 +442,12 @@ def _safe_error_message(exc: Exception) -> str: """Render an exception as a user-facing error with API keys redacted.""" try: from anton.core.runtime import safe_redact_error + from anton.core.llm.provider import TokenLimitExceeded + # A spent token allowance isn't a crash — surface anton's + # already-friendly quota message as-is, without the + # `[agent error]` prefix that reads like something broke. + if isinstance(exc, TokenLimitExceeded): + return safe_redact_error(exc) return f"[agent error] {safe_redact_error(exc)}" except Exception: return f"[agent error] {exc!r}" diff --git a/anton/core/llm/anthropic.py b/anton/core/llm/anthropic.py index cc284d2d..7ca41555 100644 --- a/anton/core/llm/anthropic.py +++ b/anton/core/llm/anthropic.py @@ -127,7 +127,7 @@ async def complete( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or to top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or to top up your tokens." from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc @@ -274,7 +274,7 @@ async def stream( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or to top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or to top up your tokens." from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc diff --git a/anton/core/llm/openai.py b/anton/core/llm/openai.py index 89064e07..c2f0549c 100644 --- a/anton/core/llm/openai.py +++ b/anton/core/llm/openai.py @@ -683,7 +683,7 @@ async def complete( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or to top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or to top up your tokens." from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc @@ -852,7 +852,7 @@ async def stream( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or top up your tokens." from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc @@ -970,7 +970,7 @@ async def _complete_via_responses( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or to top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or to top up your tokens." from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc @@ -1099,7 +1099,7 @@ async def _stream_via_responses( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or top up your tokens." from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc diff --git a/anton/core/llm/prompt_builder.py b/anton/core/llm/prompt_builder.py index 9d50a80c..9a4f0c0e 100644 --- a/anton/core/llm/prompt_builder.py +++ b/anton/core/llm/prompt_builder.py @@ -8,6 +8,8 @@ BASE_VISUALIZATIONS_PROMPT, BACKEND_GENERATION_PROMPT, CHAT_SYSTEM_PROMPT, + CONVERSATION_DISCIPLINE_ACT_FIRST, + CONVERSATION_DISCIPLINE_ASK_FIRST, VISUALIZATIONS_MARKDOWN_OUTPUT_FORMAT_PROMPT, VISUALIZATIONS_HTML_OUTPUT_FORMAT_PROMPT, ) @@ -124,10 +126,12 @@ def _build_visualizations_section( def build( self, *, + conversation_started: str, current_datetime: str, system_prompt_context: SystemPromptContext, proactive_dashboards: bool, output_dir: str, + act_first: bool = True, tool_defs: list["ToolDef"] | None = None, memory_context: str = "", project_context: str = "", @@ -146,11 +150,17 @@ def build( if prefix: prompt += f"{prefix}\n\n" + conversation_discipline = ( + CONVERSATION_DISCIPLINE_ACT_FIRST if act_first + else CONVERSATION_DISCIPLINE_ASK_FIRST + ) + prompt += CHAT_SYSTEM_PROMPT.format( runtime_context=system_prompt_context.runtime_context, artifacts_section=ARTIFACTS_PROMPT, visualizations_section=visualizations_section, - current_datetime=current_datetime, + conversation_discipline=conversation_discipline, + conversation_started=conversation_started, ) prompt += "\n\n" + BACKEND_GENERATION_PROMPT.format(output_dir=output_dir) @@ -159,8 +169,8 @@ def build( if tool_prompts: prompt += tool_prompts - if memory_context: - prompt += memory_context + # Stable, per-session content goes before the volatile tail so the + # prefix stays cache-stable across turns. if project_context: prompt += project_context if self_awareness_context: @@ -176,6 +186,18 @@ def build( if suffix: prompt += f"\n\n{suffix}" + # Volatile tail — LAST so everything above can be cached. The live + # clock and the relevance-filtered memory snapshot both change every + # turn, so they sit after the cache-stable prefix and never invalidate + # it. (The prefix carries only the fixed "conversation started" stamp.) + prompt += ( + f"\n\nCurrent date and time: {current_datetime}\n" + "(Earlier messages are prefixed with the time they were sent; that " + "bracketed timestamp is metadata, not part of the message text.)" + ) + if memory_context: + prompt += memory_context + return prompt diff --git a/anton/core/llm/prompts.py b/anton/core/llm/prompts.py index 245a48c1..527642f2 100644 --- a/anton/core/llm/prompts.py +++ b/anton/core/llm/prompts.py @@ -7,7 +7,7 @@ solve problems. You are NOT a code assistant or chatbot. You are a coworker with a \ computer, and you use that computer to get things done. -Current date and time: {current_datetime} +Conversation started: {conversation_started} WHO YOU ARE: - You solve problems — not just write code. If someone needs emails classified, data \ @@ -160,15 +160,7 @@ {visualizations_section} -CONVERSATION DISCIPLINE (critical): -- If you ask the user a question, STOP and WAIT for their reply. Never ask a question \ -and then act in the same turn — that skips the user's answer. -- Only act when you have ALL the information you need. If you're unsure \ -about anything, ask first, then act in a LATER turn after receiving the answer. -- When the user gives a vague answer (like "yeah", "the current one", "sure"), interpret \ -it in context of what you just asked. Do not ask them to repeat themselves. -- Gather requirements incrementally through conversation. Do not front-load every \ -possible question at once — ask 1-3 at a time, then follow up. +{conversation_discipline} RUNTIME IDENTITY: {runtime_context} @@ -185,6 +177,8 @@ different data sources for the same information, caching/retrying with backoff, etc. - Exhaust at least 2-3 genuinely different approaches before involving the user. Each \ attempt should be a meaningfully different strategy — not just retrying the same thing. +- If a scratchpad cell errors the same way twice, change strategy — don't re-run the \ +same code expecting a different result. - Only ask the user for things that truly require them: credentials they haven't shared, \ ambiguous requirements you can't infer, access to private/internal systems, or a choice \ between equally valid options. @@ -192,6 +186,9 @@ so the user has full context and doesn't suggest things you've already done. GENERAL RULES: +- Validate your output before claiming the task is done — actually check the result \ +(inspect the data, run it, confirm the file/artifact exists and looks right) instead of \ +assuming it worked. Report what you verified, not what you intended. - Be conversational, concise, and direct. No filler. No bullet-point dumps unless asked. - Respond naturally to greetings, small talk, and follow-up questions. - When describing yourself, focus on problem-solving and collaboration — not listing \ @@ -210,6 +207,44 @@ Only encode genuinely reusable knowledge — not transient conversation details. """ +# --------------------------------------------------------------------------- +# Conversation discipline — two postures, selected by the `act_first` flag +# (ChatSessionConfig.act_first → AntonSettings.act_first; default True). +# Injected into CHAT_SYSTEM_PROMPT via {conversation_discipline}. +# --------------------------------------------------------------------------- +CONVERSATION_DISCIPLINE_ACT_FIRST = """CONVERSATION DISCIPLINE (critical): +- Bias toward ACTION. When a request has a reasonable default interpretation, act on it \ +now — do not stall the task with a clarifying question. A delivered result the user can \ +correct beats a question that makes them wait. +- STATE YOUR ASSUMPTIONS AS YOU MAKE THEM. Whenever you proceed on an assumption — a \ +default value, an interpretation of a vague request, a chosen approach, or a scope you \ +picked — say so plainly in the SAME response, right as you act, not buried at the end. \ +Phrase it like "Assuming you mean X (the common case), so I'll…" or "Going with monthly \ +granularity since you didn't specify." Surface each assumption as it happens so the user \ +can redirect mid-flight instead of being blocked up front. Acting silently is wrong; \ +acting out loud with your assumptions visible is right. +- Only STOP and ASK when acting on a guess would be costly to undo or is genuinely \ +unknowable: destructive or irreversible actions (deleting data, spending money, sending \ +messages on the user's behalf), credentials or access you can't obtain, or a fork where \ +the options lead to materially different results and you have no basis to choose. Then ask \ +ONE tight question — and when you ask, STOP and WAIT for the reply; never ask and act in \ +the same turn, that skips their answer. +- When the user gives a vague answer (like "yeah", "the current one", "sure"), interpret \ +it in context of what you just asked. Do not ask them to repeat themselves. +- Don't front-load a questionnaire. Prefer acting on sensible defaults (stated out loud) \ +over interrogating the user; if something truly gates the work, ask at most 1-2 things.""" + +CONVERSATION_DISCIPLINE_ASK_FIRST = """CONVERSATION DISCIPLINE (critical): +- If you ask the user a question, STOP and WAIT for their reply. Never ask a question \ +and then act in the same turn — that skips the user's answer. +- Only act when you have ALL the information you need. If you're unsure \ +about anything, ask first, then act in a LATER turn after receiving the answer. +- When the user gives a vague answer (like "yeah", "the current one", "sure"), interpret \ +it in context of what you just asked. Do not ask them to repeat themselves. +- Gather requirements incrementally through conversation. Do not front-load every \ +possible question at once — ask 1-3 at a time, then follow up.""" + + # --------------------------------------------------------------------------- # Artifact contract — universal entry point for any user-facing output # --------------------------------------------------------------------------- @@ -322,8 +357,8 @@ Do NOT build a single 20KB+ HTML string in memory and write it at the end. 3. CAP STRING SIZE PER CELL at ~5KB. Large-string scratchpad calls are the \ single biggest cause of silent failures (the tool occasionally drops the \ -`code` payload on oversized inputs and returns "No code provided", which still \ -counts against the round cap). If a section is too big, split it. +`code` payload on oversized inputs and the cell comes back with an empty-code \ +error, which still counts against the round cap). If a section is too big, split it. 4. NEVER re-emit the full HTML mid-build. Append deltas, don't re-print \ the world. Assembly is a one-line concat at the end, not a re-render of \ everything you've written so far. @@ -810,3 +845,22 @@ async def hello(): "a public API, archive.org, an alternate library, or a completely different data source. " "Only involve the user if the problem truly requires something only they can provide." ) + +# Scratchpad failures need different advice than the generic (scrape/fetch) +# RESILIENCE_NUDGE above — telling the model to "try a public API / archive.org" +# when a cell is too big or too slow just sends it renaming-and-retrying. These +# are chosen by failure type in ChatSession._apply_error_tracking. +SCRATCHPAD_SIZE_NUDGE = ( + "\n\nSYSTEM: This scratchpad cell keeps failing on its size, not its logic. " + "Stop retrying the same large cell. Write the output to disk incrementally — " + "open(path, 'w') once, then open(path, 'a') to append each chunk, keeping each " + "cell's string under ~5KB — or generate the content inside the cell instead of " + "passing a large literal. Reuse the SAME scratchpad; do not rename it." +) +SCRATCHPAD_TIMEOUT_NUDGE = ( + "\n\nSYSTEM: This scratchpad cell keeps timing out — the work is too heavy, not " + "the write. Make the next cell smaller: fewer rows/items per cell, split a long " + "loop across cells (process a batch, return, continue), or narrow the scope. Call " + "progress() inside long loops so active work isn't mistaken for a hang. Reuse the " + "SAME scratchpad; do not rename it." +) diff --git a/anton/core/memory/acc.py b/anton/core/memory/acc.py index 17c35e72..61b88cb1 100644 --- a/anton/core/memory/acc.py +++ b/anton/core/memory/acc.py @@ -446,26 +446,33 @@ def detect_reset_churn(events: Sequence[Event]) -> Lesson | None: def detect_kill_loop(events: Sequence[Event]) -> Lesson | None: - """The same scratchpad name had >= N cells killed (timeout/cancel/OOM). + """>= N scratchpad cells were killed (timeout/cancel/OOM) in one turn. + + Fires when a single scratchpad is killed >= N times (a per-pad loop) OR + when >= N cells are killed across the turn regardless of name. The + name-agnostic count is deliberate: renaming the scratchpad between failed + attempts (`build_pres` → `write_html` → …) used to split the kill count + across buckets and hide the loop. A kill is a kill, and the right lesson + (make the next cell smaller) is the same either way. Reads `kind == "scratchpad_killed"`; looks at `detail.name`. """ + killed = [e for e in events if e.kind == "scratchpad_killed"] by_name: defaultdict[str, int] = defaultdict(int) - for e in events: - if e.kind != "scratchpad_killed": - continue + for e in killed: n = e.detail.get("name") or "" if n: by_name[n] += 1 - if not by_name or max(by_name.values()) < _KILL_LOOP_THRESHOLD: + per_name_max = max(by_name.values()) if by_name else 0 + if per_name_max < _KILL_LOOP_THRESHOLD and len(killed) < _KILL_LOOP_THRESHOLD: return None return Lesson( rule=( - "When a scratchpad cell is killed (timeout, cancel, OOM), " - "the next cell on the same scratchpad needs to be smaller — " - "fewer rows, smaller batch, explicit timeout, narrower scope. " - "Two kills on the same scratchpad means the approach itself is " - "too heavy, not that the same cell needs another try." + "When a scratchpad cell is killed (timeout, cancel, OOM), the next " + "cell needs to be smaller — fewer rows, smaller batch, explicit " + "timeout, narrower scope — and stay on the SAME scratchpad. Two " + "kills in a turn (even across renamed scratchpads) mean the approach " + "is too heavy, not that the same cell needs another try." ), kind="when", triggers=("scratchpad_killed",), diff --git a/anton/core/runtime.py b/anton/core/runtime.py index 5f10f510..82783506 100644 --- a/anton/core/runtime.py +++ b/anton/core/runtime.py @@ -185,6 +185,7 @@ async def build_chat_session( history_store=history_store, session_id=session_id, proactive_dashboards=settings.proactive_dashboards, + act_first=settings.act_first, tools=list(extra_tools) if extra_tools else [], ) return ChatSession(config) diff --git a/anton/core/session.py b/anton/core/session.py index 58abf956..742ee377 100644 --- a/anton/core/session.py +++ b/anton/core/session.py @@ -3,6 +3,7 @@ import asyncio from collections.abc import AsyncIterator, Callable from dataclasses import asdict, dataclass, field +from datetime import datetime import json import re from typing import TYPE_CHECKING, List @@ -17,7 +18,11 @@ from anton.core.memory.cerebellum import Cerebellum from anton.core.memory.skills import SkillStore from anton.core.tools.recall_skill import RECALL_SKILL_TOOL -from anton.core.llm.prompts import RESILIENCE_NUDGE +from anton.core.llm.prompts import ( + RESILIENCE_NUDGE, + SCRATCHPAD_SIZE_NUDGE, + SCRATCHPAD_TIMEOUT_NUDGE, +) from anton.core.llm.provider import ( ContextOverflowError, StreamComplete, @@ -48,7 +53,11 @@ UPDATE_ARTIFACT_METADATA_TOOL, ToolDef, ) -from anton.core.utils.scratchpad import prepare_scratchpad_exec, format_cell_result +from anton.core.utils.scratchpad import ( + prepare_scratchpad_exec, + format_cell_result, + observe_scratchpad_cell, +) from anton.explainability import ExplainabilityCollector, ExplainabilityStore @@ -59,6 +68,11 @@ from anton.core.settings import CoreSettings +# Sentinel prefixing a compacted-history summary so later compactions can +# recognize and update it in place rather than summarize a summary. +_COMPACTED_MARKER = "[COMPACTED CONTEXT — REFERENCE ONLY]" + + if TYPE_CHECKING: from rich.console import Console from anton.context.self_awareness import SelfAwarenessContext @@ -112,6 +126,10 @@ class ChatSessionConfig: # host didn't identify itself. harness: str | None = None proactive_dashboards: bool = False + # When True (default), Anton acts on reasonable defaults and surfaces its + # assumptions inline instead of stopping to ask ("do first, ask later"). + # When False, it falls back to the cautious ask-first discipline. + act_first: bool = True tools: list[ToolDef] = field(default_factory=list) output_dir: str = ".anton/output" # Web tools — on by default. Each is independently resolved at session @@ -120,6 +138,13 @@ class ChatSessionConfig: # (registered on the tool registry). See ChatSession.__init__. web_search_enabled: bool = True web_fetch_enabled: bool = True + # When the task (conversation) was created. Rendered as a fixed + # "Conversation started: …" line in the cache-stable prompt prefix — it + # never changes across turns, so it doesn't bust the prefix cache. The + # LIVE current time goes in the volatile tail instead (see _build_system_prompt), + # so resuming a conversation days later still reports the real "now". + # None → fall back to today. + started_at: datetime | None = None class ChatSession: @@ -145,6 +170,8 @@ def __init__(self, config: ChatSessionConfig) -> None: self._system_prompt_context = config.system_prompt_context self._output_dir = config.output_dir self._proactive_dashboards = config.proactive_dashboards + self._act_first = config.act_first + self._started_at = config.started_at self._extra_tools = config.tools self._workspace = config.workspace self._data_vault = config.data_vault @@ -225,16 +252,17 @@ def _acc_has_similar(rule: str) -> bool: # turn. Mirrors ANTON_MEMORY_MODE for shape consistency: # "off" — ACC observes nothing (skipped at every emit site). # "passive" — Layer 1: lessons drain to memory at end-of-turn, - # next turn's system prompt picks them up. SAFE - # DEFAULT — adds no surface-area to the turn loop. - # "active" — Layer 2: ALSO inject lessons inline as text - # blocks in tool_results so the LLM sees them on - # the very next round. Stronger learning signal, - # but more invasive — the LLM has to handle the - # nudge gracefully without confusing it for a - # user instruction. - _mode_raw = os.environ.get("ANTON_ACC_MODE", "passive").strip().lower() - self._acc_mode = _mode_raw if _mode_raw in ("off", "passive", "active") else "passive" + # next turn's system prompt picks them up. No + # surface-area on the turn loop. + # "active" — Layer 2 (DEFAULT): ALSO inject lessons inline as + # text blocks in tool_results so the LLM sees them on + # the very next round and can self-correct mid-task. + # Stronger signal; the nudge is clearly labelled as an + # automatic self-check (not a user instruction). Set + # ANTON_ACC_MODE=passive to revert to learn-next-turn, + # or =off to disable, if it ever causes trouble. + _mode_raw = os.environ.get("ANTON_ACC_MODE", "active").strip().lower() + self._acc_mode = _mode_raw if _mode_raw in ("off", "passive", "active") else "active" # Scratchpad observers — list of objects with on_pre_execute / # on_post_execute. Fired by handle_scratchpad around pad.execute. # The runtime never sees this list; observation lives at the @@ -303,8 +331,10 @@ def _apply_error_tracking( streak = error_streak.get(tool_name, 0) if streak >= self._resilience_nudge_at and tool_name not in resilience_nudged: - result_text += RESILIENCE_NUDGE - resilience_nudged.add(tool_name) + nudge = self._select_resilience_nudge(tool_name, result_text) + if nudge: + result_text += nudge + resilience_nudged.add(tool_name) if streak >= self._max_consecutive_errors: result_text += ( @@ -315,6 +345,34 @@ def _apply_error_tracking( return result_text + @staticmethod + def _select_resilience_nudge(tool_name: str, result_text: str) -> str: + """Pick the right soft-nudge for a repeated failure. + + The generic RESILIENCE_NUDGE is scrape/fetch advice ("try a public + API / archive.org / different headers"). That actively misdirects a + scratchpad failure: a cell that's too big or too slow doesn't need a + different data source, it needs to be chunked or scoped down. Route + scratchpad failures to size/timeout-specific guidance by inspecting + the error text; a generic scratchpad error (e.g. a SyntaxError) and + every non-scratchpad tool keep the generic nudge. + """ + if tool_name != "scratchpad": + return RESILIENCE_NUDGE + low = result_text.lower() + if "timed out" in low or "inactivity" in low: + return SCRATCHPAD_TIMEOUT_NUDGE + # Match the empty-code dispatcher message specifically — generic + # phrases like "too large"/"truncated" appear in unrelated errors + # (e.g. a MySQL "Data truncated for column" warning) and would + # misfire the chunking advice. + if "argument was empty" in low: + return SCRATCHPAD_SIZE_NUDGE + # Other scratchpad failures (syntax/runtime errors): the generic + # "you've failed twice, change approach" nudge still applies — only + # the size/timeout cases get specialised advice. + return RESILIENCE_NUDGE + def repair_history(self) -> None: """Fix dangling tool_use blocks left by mid-stream cancellation. @@ -536,8 +594,16 @@ def _record_cell_explainability( async def _build_system_prompt(self, user_message: str = "") -> str: import datetime as _dt - _now = _dt.datetime.now() - _current_datetime = _now.strftime("%A, %B %d, %Y at %I:%M %p") + # Two stamps, deliberately split for cache-stability AND correctness: + # • conversation_started — the task's creation time (self._started_at), + # a FIXED fact rendered in the cache-stable prefix; identical every + # turn so it never busts the prefix cache. + # • current_datetime — the real wall clock, rendered in the VOLATILE + # tail (after the cached prefix) so it's always accurate even when a + # conversation is resumed days/weeks later, without touching the cache. + _started = self._started_at or _dt.datetime.now() + _conversation_started = _started.strftime("%A, %B %d, %Y") + _current_datetime = _dt.datetime.now().strftime("%A, %B %d, %Y at %I:%M %p") # Inject memory context (replaces old self_awareness) memory_section = "" @@ -562,9 +628,11 @@ async def _build_system_prompt(self, user_message: str = "") -> str: prompt_builder = ChatSystemPromptBuilder() prompt = prompt_builder.build( + conversation_started=_conversation_started, current_datetime=_current_datetime, system_prompt_context=self._system_prompt_context, proactive_dashboards=self._proactive_dashboards, + act_first=self._act_first, output_dir=self._output_dir, tool_defs=self.tool_registry.get_tool_defs(), memory_context=memory_section, @@ -763,12 +831,18 @@ async def _summarize_history(self) -> None: old_turns = self._history[:split] recent_turns = self._history[split:] - # Serialize old turns into text for summarization + # Serialize old turns. Pull out any prior compacted summary so we + # UPDATE it in place rather than summarize a summary (which compounds + # loss every compaction). + prior_summary = "" lines: list[str] = [] for msg in old_turns: role = msg.get("role", "unknown") content = msg.get("content", "") if isinstance(content, str): + if content.lstrip().startswith(_COMPACTED_MARKER): + prior_summary = content + continue lines.append(f"[{role}]: {content[:2000]}") elif isinstance(content, list): for block in content: @@ -789,17 +863,43 @@ async def _summarize_history(self) -> None: if len(old_text) > 8000: old_text = old_text[:8000] + "\n... (truncated)" + if prior_summary: + user_content = ( + "PREVIOUS SUMMARY (update this in place — merge the new turns into it, " + "don't restate it verbatim):\n" + f"{prior_summary}\n\n" + "NEW TURNS TO FOLD IN:\n" + f"{old_text}" + ) + else: + user_content = old_text + try: + # 3b-full: a structured, in-place-updated STATE RECORD rather than a + # freeform blob — so "Remaining" work survives compaction instead of + # being flattened into prose. summary_response = await self._llm.code( system=( - "Summarize this conversation history concisely. Preserve:\n" - "- Key decisions and conclusions\n" - "- Important data/results discovered\n" - "- Variable names and values that are still relevant\n" - "- Errors encountered and how they were resolved\n" - "Keep it under 2000 tokens. Use bullet points." + "You compact an agent's earlier conversation into a terse, factual " + "STATE RECORD (not prose). Output only these sections, omitting any " + "that are empty:\n" + "## Goal — what the user ultimately wants\n" + "## Constraints — explicit requirements / preferences / do-nots\n" + "## Completed — work already done, each as `action → outcome`\n" + "## Active state — variables, data, files/artifacts in play and their " + "current values or paths\n" + "## Blocked — anything stuck and why\n" + "## Decisions — choices made and the reason\n" + "## Remaining — what is still left to do\n\n" + "Preserve the date/time of key events when it matters (e.g. " + "`Completed (2026-06-05): …`) — the raw per-message timestamps are " + "gone after compaction, so keep the ones that anchor the timeline.\n" + "If a PREVIOUS SUMMARY is provided, update it with the new turns " + "instead of starting over. If the user changed direction, narrowed " + "scope, or cancelled something, reflect that — drop superseded items " + "from Remaining, don't keep them. Keep it under ~2000 tokens." ), - messages=[{"role": "user", "content": old_text}], + messages=[{"role": "user", "content": user_content}], max_tokens=2048, ) summary = summary_response.content or "(summary unavailable)" @@ -807,17 +907,26 @@ async def _summarize_history(self) -> None: # If summarization fails, just do a simple truncation summary = f"(Earlier conversation with {len(old_turns)} turns — summarization failed)" - summary_msg = { - "role": "user", - "content": f"[Context summary of earlier conversation]\n{summary}", - } + # 3b-light: reference-only framing so the model treats this as compacted + # history, not a fresh instruction, and never resumes superseded/cancelled + # work after a compaction (which Anton's auto-continue verifier would + # otherwise be nudged to do). + summary_body = ( + f"{_COMPACTED_MARKER}\n" + "Compacted record of earlier conversation, for REFERENCE ONLY — not a new " + "request. The most recent user message takes priority; if the user changed " + "direction, narrowed scope, or cancelled something, follow that and do NOT " + "resume superseded work described below.\n\n" + f"{summary}" + ) + summary_msg = {"role": "user", "content": summary_body} # If the recent portion starts with a user message, insert a minimal # assistant separator to avoid consecutive user messages (API error). if recent_turns and recent_turns[0].get("role") == "user": self._history = [ summary_msg, - {"role": "assistant", "content": "Understood."}, + {"role": "assistant", "content": "Understood — using that as reference."}, *recent_turns, ] else: @@ -1791,6 +1900,15 @@ async def _stream_and_handle_tools( description=description, cell=cell, ) + # Same post-execute ACC event as the CLI + # path (handle_scratchpad) — this inline + # streaming exec bypasses that handler, so + # without this scratchpad_killed/result + # would never fire here and detect_kill_loop + # would be blind in the streaming product. + observe_scratchpad_cell( + self, tc.input.get("name", ""), cell + ) yield StreamToolResult( name=tc.name, action="exec", diff --git a/anton/core/settings.py b/anton/core/settings.py index fb631a1b..46e6b07b 100644 --- a/anton/core/settings.py +++ b/anton/core/settings.py @@ -17,6 +17,8 @@ class CoreSettings(BaseSettings): cell_timeout_default: int = 120 # Total timeout when no estimate given (s) cell_inactivity_timeout: int = 30 # Max silence between output lines (s) cell_inactivity_after_progress: int = 60 # Grace window after progress() call (s) + cell_inactivity_max: int = 60 # Ceiling on the silence window even when a large estimate scales it up (s) + cell_total_max: int = 0 # Optional absolute ceiling on total cell runtime (s); 0 = off (let it scale) cell_install_timeout: int = 120 # pip/uv install timeout (s) cell_keep_recent: int = 5 # Recent cells preserved during compaction diff --git a/anton/core/tools/tool_defs.py b/anton/core/tools/tool_defs.py index 2b0b182c..c37a24b5 100644 --- a/anton/core/tools/tool_defs.py +++ b/anton/core/tools/tool_defs.py @@ -93,6 +93,10 @@ class ToolDef: "type": "integer", "description": "Estimated execution time in seconds. Drives the total timeout (roughly 2x estimate). Use progress() for long cells.", }, + "confirm_new_scratchpad": { + "type": "boolean", + "description": "Set true only to deliberately create a SECOND scratchpad while one is already in use this task. Normally reuse one scratchpad name for the whole task — each name is a separate isolated environment, so a new one loses all existing state. Leave unset/false unless you truly need isolation.", + }, }, "required": ["action", "name"], }, diff --git a/anton/core/tools/tool_handlers.py b/anton/core/tools/tool_handlers.py index c23ca94e..6c8625a2 100644 --- a/anton/core/tools/tool_handlers.py +++ b/anton/core/tools/tool_handlers.py @@ -4,7 +4,11 @@ from typing import TYPE_CHECKING from anton.core.backends.base import Cell -from anton.core.utils.scratchpad import prepare_scratchpad_exec, format_cell_result +from anton.core.utils.scratchpad import ( + prepare_scratchpad_exec, + format_cell_result, + observe_scratchpad_cell, +) if TYPE_CHECKING: from anton.chat_session import ChatSession @@ -408,25 +412,17 @@ def _acc_observe(kind: str, detail: dict, *, severity: int = 1) -> None: fn(kind, detail, severity=severity) if action == "exec": + # The single-scratchpad guard and the pre-execute ACC events + # (scratchpad_empty_code / scratchpad_call) live in + # prepare_scratchpad_exec — the SHARED entry point that the streaming + # path (ChatSession.turn_stream) also calls — so they fire on both + # paths. A str return is a message the call should not run past + # (empty code, single-scratchpad challenge, or install failure). result = await prepare_scratchpad_exec(session, tc_input) if isinstance(result, str): - # Empty / malformed code parameter — the dispatcher rejected - # it before reaching the runtime. This is exactly the - # "silent code-clip" failure mode the ACC's - # detect_oversized_cell watches for. - _acc_observe("scratchpad_empty_code", {"name": name}, severity=7) return result pad, code, description, estimated_time, estimated_seconds = result - _acc_observe( - "scratchpad_call", - { - "name": name, - "code_len": len(code or ""), - "one_line_description": description or "", - }, - ) - # Notify pre-execute observers (e.g. cerebellum). The runtime # never sees these — observation is an orchestration concern, # so it lives at the dispatcher layer where the data is most @@ -452,31 +448,9 @@ def _acc_observe(kind: str, detail: dict, *, severity: int = 1) -> None: pad_name=name, description=description, cell=cell, ) await _fire_post_execute(session, cell) - # ACC: distinguish "killed" (timeout/cancel/OOM) from a - # plain runtime error. The local backend sets cell.error - # to a string starting with "Cancelled" or matching the - # "Cell timed out"/"Cell killed" prefixes from the - # asyncio.TimeoutError path. Everything else (NameError, - # ImportError, …) is a regular result with success=False. - err = (cell.error or "").strip() - if err.startswith(("Cancelled", "Cell timed out", "Cell killed")): - _acc_observe( - "scratchpad_killed", - {"name": name, "reason": err[:120]}, - severity=6, - ) - else: - success = not err and not (cell.stderr or "").strip() - _acc_observe( - "scratchpad_result", - { - "name": name, - "success": success, - "stdout_len": len(cell.stdout or ""), - "error": err[:300] if err else "", - }, - severity=5 if not success else 1, - ) + # Post-execute ACC event (killed vs result) via the shared helper — + # the streaming path emits the same. + observe_scratchpad_cell(session, name, cell) return format_cell_result(cell) elif action == "view": diff --git a/anton/core/utils/scratchpad.py b/anton/core/utils/scratchpad.py index da518ff4..2cf15bc6 100644 --- a/anton/core/utils/scratchpad.py +++ b/anton/core/utils/scratchpad.py @@ -5,16 +5,107 @@ from anton.core.session import ChatSession +def _acc_observe(session, kind: str, detail: dict, *, severity: int = 1) -> None: + """Safe ACC emit — no-op if the session has no observer wired.""" + fn = getattr(session, "_acc_observe", None) + if fn is not None: + fn(kind, detail, severity=severity) + + +def observe_scratchpad_cell(session, name: str, cell) -> None: + """Emit the post-execute ACC event for a finished cell. + + Distinguishes a kill (timeout/cancel/OOM) from a plain runtime error so + detect_kill_loop sees `scratchpad_killed`. Shared by both exec paths — + `handle_scratchpad` (CLI `turn()`) and the inline streaming exec in + `ChatSession.turn_stream` — so the ACC instrumentation is identical + regardless of which path ran the cell. + """ + if cell is None: + return + err = (cell.error or "").strip() + if err.startswith(("Cancelled", "Cell timed out", "Cell killed")): + _acc_observe(session, "scratchpad_killed", {"name": name, "reason": err[:120]}, severity=6) + else: + success = not err and not (cell.stderr or "").strip() + _acc_observe( + session, + "scratchpad_result", + { + "name": name, + "success": success, + "stdout_len": len(cell.stdout or ""), + "error": err[:300] if err else "", + }, + severity=5 if not success else 1, + ) + + async def prepare_scratchpad_exec(session: ChatSession, tc_input: dict): """Validate and prepare a scratchpad exec call. Returns (pad, code, description, estimated_time, estimated_seconds) or - a str error message if validation fails. + a str message if the call should not run (empty code, a single-scratchpad + challenge, or a failed package install). + + This is the SHARED entry point for both exec paths — `handle_scratchpad` + (CLI) and the inline streaming exec in `ChatSession.turn_stream` (cowork) + both call it — so the single-scratchpad guard and the pre-execute ACC + events live here, not in `handle_scratchpad` (which the streaming path + bypasses). """ name = tc_input.get("name", "") code = tc_input.get("code", "") if not code or not code.strip(): - return "No code provided." + # An empty `code` on an exec call is almost never the model meaning + # to run nothing — it's the large-payload drop: an oversized `code` + # argument gets truncated to "" in transit. Returning a bare "no + # code" here used to read as a no-op, so the model would retry the + # same oversized cell. Make the failure self-correcting and ensure + # it reads as an error (note the word "failed") so the per-tool + # error streak in _apply_error_tracking counts it toward the + # circuit breaker instead of silently resetting. + _acc_observe(session, "scratchpad_empty_code", {"name": name}, severity=7) + return ( + "Scratchpad exec failed: the `code` argument was empty. This usually " + "means the code payload was too large and got truncated in transit. " + "Do NOT retry the same large cell — instead write the output to disk in " + "small append steps (open(path, 'a'), keep each cell's string under ~5KB), " + "or generate the content inside the cell rather than passing a big literal." + ) + + # Single-scratchpad guard: the agent should reuse ONE scratchpad per task. + # A new name spins up a separate, empty process — state from the existing + # pad isn't visible there — a common source of wasted rounds (re-import, + # re-fetch, shuffling state across pads). Challenge a new name when the + # agent already has a working scratchpad this session, unless it confirms + # it needs isolation. Tracked names are ones the agent has exec'd here — + # NOT session._scratchpads.pads, which also holds system-created pads + # (e.g. the artifact backend launcher's slug pad), which must never count + # against the agent. Challenge AT MOST ONCE per session: the challenge is + # not an error (it resets no streak), so re-challenging every new name + # could loop to the round cap with nothing to stop it; one firm nudge is + # the enforcement, then respect the model's choice. `is True` (not + # truthiness) so a MagicMock attr in tests doesn't read as "challenged". + seen = getattr(session, "_agent_scratchpad_names", None) + if not isinstance(seen, set): + seen = set() + session._agent_scratchpad_names = seen + confirm_new = bool(tc_input.get("confirm_new_scratchpad", False)) + challenged_before = getattr(session, "_scratchpad_challenged", False) is True + if name not in seen and seen and not confirm_new and not challenged_before: + session._scratchpad_challenged = True + existing = "', '".join(sorted(seen)) + return ( + f"You already have an active scratchpad ('{existing}') with live state " + f"(imports, variables, fetched data). Starting a new one named '{name}' " + "creates a SEPARATE, empty environment — nothing from the existing " + "scratchpad is available there, so you'd re-import and re-fetch. Reuse the " + "existing scratchpad for this task; it is stateful across cells. If you " + "genuinely need an isolated environment, call scratchpad exec again with " + "confirm_new_scratchpad=true." + ) + seen.add(name) pad = await session._scratchpads.get_or_create(name) @@ -34,6 +125,15 @@ async def prepare_scratchpad_exec(session: ChatSession, tc_input: dict): estimated_seconds = 0 estimated_time = f"{estimated_seconds}s" if estimated_seconds > 0 else "" + _acc_observe( + session, + "scratchpad_call", + { + "name": name, + "code_len": len(code or ""), + "one_line_description": description or "", + }, + ) return pad, code, description, estimated_time, estimated_seconds diff --git a/tests/e2e/scenarios/test_loop_safety.py b/tests/e2e/scenarios/test_loop_safety.py index 61d40deb..25fdf5a6 100644 --- a/tests/e2e/scenarios/test_loop_safety.py +++ b/tests/e2e/scenarios/test_loop_safety.py @@ -63,9 +63,12 @@ def test_session_exits_within_timeout(cfg, stub, tmp_path): @pytest.mark.stub_only def test_resilience_nudge_injected_after_two_errors(cfg, stub, tmp_path): + # Reuse ONE scratchpad name: a realistic retry loop is the same cell + # failing twice. (Distinct names would instead trip the single-scratchpad + # guard, which is exercised separately.) bad_code = "def oops(:\n pass" - stub.queue_tool_call("scratchpad", {"action": "exec", "name": "bad1", "code": bad_code}) - stub.queue_tool_call("scratchpad", {"action": "exec", "name": "bad2", "code": bad_code}) + stub.queue_tool_call("scratchpad", {"action": "exec", "name": "bad", "code": bad_code}) + stub.queue_tool_call("scratchpad", {"action": "exec", "name": "bad", "code": bad_code}) stub.queue_text("NUDGE_RECEIVED") stub.queue_verification_ok() result = run_anton(["--folder", str(tmp_path)], ["do bad stuff", "exit"], @@ -82,9 +85,12 @@ def test_resilience_nudge_injected_after_two_errors(cfg, stub, tmp_path): @pytest.mark.stub_only def test_circuit_breaker_fires_after_five_consecutive_errors(cfg, stub, tmp_path): + # Reuse ONE scratchpad name so this exercises the consecutive-error + # circuit breaker, not the single-scratchpad guard (distinct names would + # trigger a guard challenge that resets the streak). bad_code = "def bad(:\n pass" for i in range(5): - stub.queue_tool_call("scratchpad", {"action": "exec", "name": f"err_{i}", "code": bad_code}) + stub.queue_tool_call("scratchpad", {"action": "exec", "name": "err", "code": bad_code}) stub.queue_text("ERRORS_EXHAUSTED") stub.queue_verification_ok() result = run_anton(["--folder", str(tmp_path)], ["break everything", "exit"], diff --git a/tests/test_acc.py b/tests/test_acc.py index 2ed7f114..448b2133 100644 --- a/tests/test_acc.py +++ b/tests/test_acc.py @@ -244,12 +244,16 @@ def test_fires_on_two_kills_same_name(self): assert lesson is not None assert lesson.detector == "detect_kill_loop" - def test_silent_when_kills_are_for_different_names(self): + def test_fires_on_kills_across_different_names(self): + # Renaming the scratchpad between failed attempts must NOT hide the + # loop — two kills in a turn fire regardless of name. events = [ Event("scratchpad_killed", 6, {"name": "a", "reason": "timeout"}, 1), Event("scratchpad_killed", 6, {"name": "b", "reason": "timeout"}, 2), ] - assert detect_kill_loop(events) is None + lesson = detect_kill_loop(events) + assert lesson is not None + assert lesson.detector == "detect_kill_loop" def test_silent_on_single_kill(self): events = [Event("scratchpad_killed", 6, {"name": "compute"}, 3)] diff --git a/tests/test_dispatch_error_message.py b/tests/test_dispatch_error_message.py new file mode 100644 index 00000000..04d7ea75 --- /dev/null +++ b/tests/test_dispatch_error_message.py @@ -0,0 +1,32 @@ +"""`_safe_error_message` framing policy for the local dispatch loop. + +A spent token allowance is a quota condition, not a crash, so it must +surface anton's already-friendly message verbatim — without the +`[agent error]` prefix that reads like something broke. Every other +failure keeps the prefix (and the API-key redaction it already applied). +""" + +from __future__ import annotations + +from anton.core.dispatch.local_runtime import LocalScratchpadOrchestrator +from anton.core.llm.provider import TokenLimitExceeded + + +_TOKEN_LIMIT_MESSAGE = ( + "Server returned 429 — Monthly limit exceeded for tokens: 5000000/5000000 " + "Visit https://console.mindshub.ai to upgrade or to top up your tokens." +) + + +def test_token_limit_message_has_no_agent_error_prefix(): + rendered = LocalScratchpadOrchestrator._safe_error_message( + TokenLimitExceeded(_TOKEN_LIMIT_MESSAGE) + ) + assert rendered == _TOKEN_LIMIT_MESSAGE + assert "[agent error]" not in rendered + + +def test_generic_error_keeps_agent_error_prefix(): + rendered = LocalScratchpadOrchestrator._safe_error_message(ValueError("boom")) + assert rendered.startswith("[agent error]") + assert "boom" in rendered diff --git a/tests/test_resilience_nudge.py b/tests/test_resilience_nudge.py new file mode 100644 index 00000000..7a62a959 --- /dev/null +++ b/tests/test_resilience_nudge.py @@ -0,0 +1,45 @@ +"""Tests for ChatSession._select_resilience_nudge — failure-type-aware nudging. + +The generic RESILIENCE_NUDGE is scrape/fetch advice and misdirects scratchpad +failures (a too-big or too-slow cell doesn't need a different data source). The +selector routes scratchpad size/timeout failures to specific guidance and keeps +the generic nudge for everything else. +""" + +from __future__ import annotations + +from anton.core.llm.prompts import ( + RESILIENCE_NUDGE, + SCRATCHPAD_SIZE_NUDGE, + SCRATCHPAD_TIMEOUT_NUDGE, +) +from anton.core.session import ChatSession + +_select = ChatSession._select_resilience_nudge + + +class TestSelectResilienceNudge: + def test_non_scratchpad_tool_gets_generic_nudge(self): + assert _select("web_fetch", "failed to fetch the page") == RESILIENCE_NUDGE + + def test_scratchpad_timeout_gets_timeout_nudge(self): + assert _select("scratchpad", "Cell timed out after 180s total") == SCRATCHPAD_TIMEOUT_NUDGE + + def test_scratchpad_inactivity_gets_timeout_nudge(self): + msg = "Cell killed after 60s of inactivity (no output or progress() calls)" + assert _select("scratchpad", msg) == SCRATCHPAD_TIMEOUT_NUDGE + + def test_scratchpad_empty_code_gets_size_nudge(self): + msg = "Scratchpad exec failed: the `code` argument was empty. ..." + assert _select("scratchpad", msg) == SCRATCHPAD_SIZE_NUDGE + + def test_scratchpad_generic_error_gets_generic_nudge(self): + # A NameError-style failure is neither size nor timeout; it still gets + # the generic "failed twice, change approach" nudge (only size/timeout + # get specialised scratchpad advice). + assert _select("scratchpad", "[error]\nNameError: name 'data' is not defined") == RESILIENCE_NUDGE + + def test_scratchpad_nudges_never_mention_scraping(self): + for nudge in (SCRATCHPAD_SIZE_NUDGE, SCRATCHPAD_TIMEOUT_NUDGE): + assert "archive.org" not in nudge + assert "data source" not in nudge diff --git a/tests/test_scratchpad.py b/tests/test_scratchpad.py index cd08d65a..ea9d580b 100644 --- a/tests/test_scratchpad.py +++ b/tests/test_scratchpad.py @@ -831,28 +831,53 @@ async def test_compute_timeouts_no_estimate(self): assert inactivity == 30.0 async def test_compute_timeouts_with_estimate(self): - """Estimate should scale total timeout and inactivity with no hard cap.""" + """Estimate scales the total with no cap; inactivity is clamped to cell_inactivity_max (default 60).""" from anton.core.backends.utils import compute_timeouts as _compute_timeouts # Small estimate: max(10*2, 10+30) = max(20, 40) = 40 total, inactivity = _compute_timeouts(10) assert total == 40.0 - assert inactivity == 30.0 # max(5, 30) = 30 + assert inactivity == 30.0 # max(5, 30) = 30, under the cap # Medium estimate: max(60*2, 60+30) = max(120, 90) = 120 total, inactivity = _compute_timeouts(60) assert total == 120.0 - assert inactivity == 30.0 # max(30, 30) = 30 + assert inactivity == 30.0 # max(30, 30) = 30, under the cap - # Large estimate: max(300*2, 300+30) = max(600, 330) = 600 + # Large estimate: total still scales, inactivity is capped at 60 total, inactivity = _compute_timeouts(300) assert total == 600.0 - assert inactivity == 150.0 # max(150, 30) = 150 + assert inactivity == 60.0 # min(max(150, 30), 60) = 60 - # Very large estimate: scales with estimate + # Very large estimate: total keeps scaling so long-but-active cells + # can run; the silence window stays capped. total, inactivity = _compute_timeouts(1000) assert total == 2000.0 - assert inactivity == 500.0 # max(500, 30) = 500 + assert inactivity == 60.0 # min(max(500, 30), 60) = 60 + + async def test_compute_timeouts_inactivity_cap_is_configurable(self): + """cell_inactivity_max bounds the silence window regardless of estimate.""" + from anton.core.backends import utils as _utils + from anton.core.settings import CoreSettings + + # est=300 would scale inactivity to 150s without the cap; with the + # default cap (60) it is clamped, and the cap is tunable via settings. + total, inactivity = _utils.compute_timeouts(300) + assert inactivity == float(CoreSettings().cell_inactivity_max) + assert total == 600.0 # total is intentionally left uncapped + + async def test_compute_timeouts_total_max_off_by_default(self): + """cell_total_max defaults to 0 — the total is uncapped out of the box.""" + from anton.core.settings import CoreSettings + assert CoreSettings().cell_total_max == 0 + + async def test_compute_timeouts_total_max_backstop(self, monkeypatch): + """When set, cell_total_max bounds the total; inactivity stays capped.""" + from anton.core.backends.utils import compute_timeouts as _compute_timeouts + monkeypatch.setenv("ANTON_CELL_TOTAL_MAX", "300") + total, inactivity = _compute_timeouts(1000) + assert total == 300.0 # min(2000, 300) + assert inactivity == 60.0 class TestSampleFunction: diff --git a/tests/test_scratchpad_observer_dispatch.py b/tests/test_scratchpad_observer_dispatch.py index 1a99c4a5..f80a2a13 100644 --- a/tests/test_scratchpad_observer_dispatch.py +++ b/tests/test_scratchpad_observer_dispatch.py @@ -23,6 +23,59 @@ _fire_pre_execute, handle_scratchpad, ) +from anton.core.utils.scratchpad import observe_scratchpad_cell + + +class _RecordingAccSession: + """Session stub that records ACC observations.""" + + def __init__(self): + self.events: list[tuple] = [] + + def _acc_observe(self, kind, detail, *, severity=1): + self.events.append((kind, detail, severity)) + + +class TestObserveScratchpadCell: + """observe_scratchpad_cell is the shared post-exec ACC emitter used by + BOTH the CLI (handle_scratchpad) and streaming (turn_stream) paths.""" + + def test_timeout_kill_emits_scratchpad_killed(self): + s = _RecordingAccSession() + cell = Cell(code="x", stdout="", stderr="", error="Cell timed out after 180s total. Process killed") + observe_scratchpad_cell(s, "dash", cell) + assert s.events[0][0] == "scratchpad_killed" + assert s.events[0][1]["name"] == "dash" + + def test_inactivity_kill_emits_scratchpad_killed(self): + s = _RecordingAccSession() + cell = Cell(code="x", stdout="", stderr="", error="Cell killed after 60s of inactivity") + observe_scratchpad_cell(s, "dash", cell) + assert s.events[0][0] == "scratchpad_killed" + + def test_runtime_error_emits_result_failure(self): + s = _RecordingAccSession() + cell = Cell(code="x", stdout="", stderr="", error="Traceback...\nNameError: x") + observe_scratchpad_cell(s, "dash", cell) + assert s.events[0][0] == "scratchpad_result" + assert s.events[0][1]["success"] is False + + def test_success_emits_result_success(self): + s = _RecordingAccSession() + cell = Cell(code="x", stdout="42", stderr="", error=None) + observe_scratchpad_cell(s, "dash", cell) + assert s.events[0][0] == "scratchpad_result" + assert s.events[0][1]["success"] is True + + def test_none_cell_emits_nothing(self): + s = _RecordingAccSession() + observe_scratchpad_cell(s, "dash", None) + assert s.events == [] + + def test_no_acc_observer_is_noop(self): + # A session without _acc_observe (e.g. ACC off) must not raise. + observe_scratchpad_cell(SimpleNamespace(), "dash", + Cell(code="x", stdout="", stderr="", error=None)) # ───────────────────────────────────────────────────────────────────────────── @@ -288,3 +341,82 @@ async def test_non_exec_actions_do_not_fire_observers(self): assert obs.pre_calls == [] assert obs.post_calls == [] + + +# ───────────────────────────────────────────────────────────────────────────── +# Single-scratchpad guard — challenge a second distinct scratchpad per task +# ───────────────────────────────────────────────────────────────────────────── + +_CHALLENGE_MARK = "confirm_new_scratchpad=true" + + +class TestSingleScratchpadGuard: + def _exec(self, name: str, **extra: object) -> dict: + tc = { + "action": "exec", + "name": name, + "code": "print(1)", + "one_line_description": "do a thing", + "estimated_execution_time_seconds": 5, + } + tc.update(extra) + return tc + + @pytest.mark.asyncio + async def test_first_scratchpad_not_challenged(self): + session, _ = _fake_session() + result = await handle_scratchpad(session, self._exec("dash")) + assert _CHALLENGE_MARK not in result + assert "dash" in session._agent_scratchpad_names + + @pytest.mark.asyncio + async def test_reusing_same_name_not_challenged(self): + session, _ = _fake_session() + session._agent_scratchpad_names = {"dash"} + result = await handle_scratchpad(session, self._exec("dash")) + assert _CHALLENGE_MARK not in result + + @pytest.mark.asyncio + async def test_second_distinct_name_is_challenged(self): + session, _ = _fake_session() + session._agent_scratchpad_names = {"dash"} + result = await handle_scratchpad(session, self._exec("report")) + assert _CHALLENGE_MARK in result + # The challenged name must NOT be recorded, so a later confirm works. + assert "report" not in session._agent_scratchpad_names + # A challenge is not a failure — it must not contain an error marker + # that would trip the per-tool circuit breaker. + assert "failed" not in result and "[error]" not in result + + @pytest.mark.asyncio + async def test_confirm_allows_second_scratchpad(self): + session, _ = _fake_session() + session._agent_scratchpad_names = {"dash"} + result = await handle_scratchpad( + session, self._exec("report", confirm_new_scratchpad=True) + ) + assert _CHALLENGE_MARK not in result + assert "report" in session._agent_scratchpad_names + + @pytest.mark.asyncio + async def test_challenge_fires_at_most_once_per_session(self): + # The challenge must not be able to induce its own loop: a model that + # keeps requesting new names without confirming is nudged once, then + # allowed (the challenge isn't an error, so nothing else would stop it). + session, _ = _fake_session() + session._agent_scratchpad_names = {"dash"} + first = await handle_scratchpad(session, self._exec("report")) + assert _CHALLENGE_MARK in first + second = await handle_scratchpad(session, self._exec("report2")) + assert _CHALLENGE_MARK not in second + assert "report2" in session._agent_scratchpad_names + + @pytest.mark.asyncio + async def test_system_pads_do_not_count_against_agent(self): + # A system-created pad (e.g. the artifact backend launcher's slug pad) + # lives in _scratchpads.pads but never in _agent_scratchpad_names, so + # the agent's first real scratchpad is not challenged by its presence. + session, _ = _fake_session() + session._scratchpads.pads = {"my-artifact-slug": MagicMock()} + result = await handle_scratchpad(session, self._exec("dash")) + assert _CHALLENGE_MARK not in result diff --git a/uv.lock b/uv.lock index 002d1055..08c2abc4 100644 --- a/uv.lock +++ b/uv.lock @@ -164,7 +164,7 @@ wheels = [ ] [[package]] -name = "anton" +name = "anton-agent" source = { editable = "." } dependencies = [ { name = "aiohttp" },