From f0f570819a3c4325c009c6cd3560cc240e88224e Mon Sep 17 00:00:00 2001 From: Konstantin Sivakov Date: Thu, 18 Jun 2026 12:57:49 +0200 Subject: [PATCH 1/9] return friendly message when tokens are exceeded --- anton/core/dispatch/local_runtime.py | 6 ++++++ anton/core/llm/anthropic.py | 4 ++-- anton/core/llm/openai.py | 8 ++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/anton/core/dispatch/local_runtime.py b/anton/core/dispatch/local_runtime.py index 5430f056..63e8d63a 100644 --- a/anton/core/dispatch/local_runtime.py +++ b/anton/core/dispatch/local_runtime.py @@ -442,6 +442,12 @@ def _safe_error_message(exc: Exception) -> str: """Render an exception as a user-facing error with API keys redacted.""" try: from anton.core.runtime import safe_redact_error + from anton.core.llm.provider import TokenLimitExceeded + # A spent token allowance isn't a crash — surface anton's + # already-friendly quota message as-is, without the + # `[agent error]` prefix that reads like something broke. + if isinstance(exc, TokenLimitExceeded): + return safe_redact_error(exc) return f"[agent error] {safe_redact_error(exc)}" except Exception: return f"[agent error] {exc!r}" diff --git a/anton/core/llm/anthropic.py b/anton/core/llm/anthropic.py index cc284d2d..7ca41555 100644 --- a/anton/core/llm/anthropic.py +++ b/anton/core/llm/anthropic.py @@ -127,7 +127,7 @@ async def complete( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or to top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or to top up your tokens." from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc @@ -274,7 +274,7 @@ async def stream( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or to top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or to top up your tokens." 
from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc diff --git a/anton/core/llm/openai.py b/anton/core/llm/openai.py index 89064e07..c2f0549c 100644 --- a/anton/core/llm/openai.py +++ b/anton/core/llm/openai.py @@ -683,7 +683,7 @@ async def complete( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or to top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or to top up your tokens." from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc @@ -852,7 +852,7 @@ async def stream( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or top up your tokens." from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc @@ -970,7 +970,7 @@ async def _complete_via_responses( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or to top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or to top up your tokens." from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc @@ -1099,7 +1099,7 @@ async def _stream_via_responses( and exc.body.get("detail") ): msg = f"Server returned 429 — {exc.body['detail']}" - msg += " Visit https://mdb.ai to upgrade or top up your tokens." + msg += " Visit https://console.mindshub.ai to upgrade or top up your tokens." 
from .provider import TokenLimitExceeded raise TokenLimitExceeded(msg) from exc From ea1cd9392138930b1004a692dbbf417c02dbc746 Mon Sep 17 00:00:00 2001 From: Konstantin Sivakov Date: Thu, 18 Jun 2026 12:57:49 +0200 Subject: [PATCH 2/9] return friendly message when tokens are exceeded From 97e04ae9b770c64ffa9f9faefbdc749becf04b73 Mon Sep 17 00:00:00 2001 From: Konstantin Sivakov Date: Thu, 18 Jun 2026 16:34:04 +0200 Subject: [PATCH 3/9] Add tests for checking the messages --- tests/test_dispatch_error_message.py | 32 ++++++++++++++++++++++++++++ uv.lock | 2 +- 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 tests/test_dispatch_error_message.py diff --git a/tests/test_dispatch_error_message.py b/tests/test_dispatch_error_message.py new file mode 100644 index 00000000..04d7ea75 --- /dev/null +++ b/tests/test_dispatch_error_message.py @@ -0,0 +1,32 @@ +"""`_safe_error_message` framing policy for the local dispatch loop. + +A spent token allowance is a quota condition, not a crash, so it must +surface anton's already-friendly message verbatim — without the +`[agent error]` prefix that reads like something broke. Every other +failure keeps the prefix (and the API-key redaction it already applied). +""" + +from __future__ import annotations + +from anton.core.dispatch.local_runtime import LocalScratchpadOrchestrator +from anton.core.llm.provider import TokenLimitExceeded + + +_TOKEN_LIMIT_MESSAGE = ( + "Server returned 429 — Monthly limit exceeded for tokens: 5000000/5000000 " + "Visit https://console.mindshub.ai to upgrade or to top up your tokens." 
+) + + +def test_token_limit_message_has_no_agent_error_prefix(): + rendered = LocalScratchpadOrchestrator._safe_error_message( + TokenLimitExceeded(_TOKEN_LIMIT_MESSAGE) + ) + assert rendered == _TOKEN_LIMIT_MESSAGE + assert "[agent error]" not in rendered + + +def test_generic_error_keeps_agent_error_prefix(): + rendered = LocalScratchpadOrchestrator._safe_error_message(ValueError("boom")) + assert rendered.startswith("[agent error]") + assert "boom" in rendered diff --git a/uv.lock b/uv.lock index 002d1055..08c2abc4 100644 --- a/uv.lock +++ b/uv.lock @@ -164,7 +164,7 @@ wheels = [ ] [[package]] -name = "anton" +name = "anton-agent" source = { editable = "." } dependencies = [ { name = "aiohttp" }, From fdd41ac9250fe7aa39162024dd0eb8d99cf2989d Mon Sep 17 00:00:00 2001 From: Alejandro Cantu Date: Thu, 18 Jun 2026 12:26:32 -0700 Subject: [PATCH 4/9] =?UTF-8?q?ENG-350:=20scratchpad=20guardrails=20?= =?UTF-8?q?=E2=80=94=20stop=20the=20large-output=20retry=20loop=20(#190)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ENG-350: cap scratchpad inactivity window regardless of estimate The inactivity timer scaled unbounded with the cell's estimate (inactivity = est*0.5), so an over-estimated cell (e.g. est=600) allowed 5 minutes of *silent* execution before being killed — a core cause of cells appearing to "run forever" with no output. Clamp the silence window to cell_inactivity_max (default 60s, tunable via ANTON_CELL_INACTIVITY_MAX). stdout/progress() still reset the window, so legitimate long-but-active cells (e.g. a batch loop pinging progress()) are unaffected. The total timeout is deliberately left scaling so those active cells can run to completion; only genuinely stuck/silent cells die fast now. Co-Authored-By: Claude Opus 4.8 (1M context) * ENG-350: self-correcting empty-code failure + name-agnostic kill-loop detector Two related fixes for the large-output retry loop: 1. 
Empty `code` on an exec is the large-payload drop (oversized arg truncated to "" in transit), not a no-op. Replace the bare "No code provided." with actionable recovery guidance (write to disk in small append steps, or generate in-cell) and phrase it as a failure so the per-tool error streak in _apply_error_tracking counts it toward the circuit breaker instead of silently resetting on every retry. 2. detect_kill_loop now fires on >= N kills across the turn regardless of scratchpad name, not just N kills on one name. Renaming the scratchpad between failed attempts used to split the count across buckets and hide the loop. Updated the corresponding test to assert the new behavior. Co-Authored-By: Claude Opus 4.8 (1M context) * ENG-350: make the resilience nudge failure-type-aware The generic RESILIENCE_NUDGE is scrape/fetch advice ("try a public API / archive.org / different headers"). Appended to a repeated *scratchpad* failure it misdirects — a cell that's too big or too slow doesn't need a different data source, it needs to be chunked or scoped down, which is what pushed the model toward rename-and-retry churn. Add SCRATCHPAD_SIZE_NUDGE and SCRATCHPAD_TIMEOUT_NUDGE and route by failure type in _select_resilience_nudge: scratchpad timeout -> "make the cell smaller / split the loop / use progress()"; scratchpad empty-code/too-big -> "write incrementally or generate in-cell"; generic scratchpad errors get no (misleading) nudge; all other tools keep the generic nudge. Co-Authored-By: Claude Opus 4.8 (1M context) * ENG-350: challenge a second scratchpad per task (enforce single-scratchpad) The agent frequently spins up multiple scratchpads for one task (build_pres -> write_html -> pres1 ...). Each name is a separate isolated process, so state from one isn't visible in another — the model re-imports, re-fetches, and shuffles state across pads, burning rounds. The prompt already says to use ONE scratchpad; this enforces it. 
handle_scratchpad now challenges an exec on a NEW scratchpad name when the agent already has one in use this task, returning guidance to reuse the existing pad. An explicit confirm_new_scratchpad=true (new optional schema field) bypasses it for the rare genuine-isolation case. Names are tracked in session._agent_scratchpad_names (only names the agent exec'd), NOT _scratchpads.pads — so system-created pads (e.g. the artifact backend launcher's slug pad) never count against the agent. The challenge carries no error marker so it doesn't trip the circuit breaker. Co-Authored-By: Claude Opus 4.8 (1M context) * ENG-350: adversarial-review hardening Fixes found by red-teaming the PR: 1. Single-scratchpad guard could induce its own loop. The challenge returns a non-error string, so a model that keeps requesting new names without confirming would be re-challenged every round with nothing to stop it (the challenge resets no streak; circuit breaker never fires). Now challenge AT MOST ONCE per session (session._scratchpad_challenged), then respect the model's choice — one firm nudge is the enforcement. 2. Failure-type nudge over-matched. Keying the size nudge on "too large"/ "truncated" would misfire on unrelated errors (e.g. a MySQL "Data truncated for column" warning). Match the empty-code message phrase ("argument was empty") specifically. 3. No total backstop for an actively-printing runaway. The inactivity cap can't catch a cell that keeps producing output (while True: print(...)). Add optional cell_total_max (default 0 = off) so operators can bound total runtime without clipping legit long batch loops; apply the inactivity cap consistently to both estimate branches. Co-Authored-By: Claude Opus 4.8 (1M context) * ENG-350: make the guard + scratchpad ACC events fire on the streaming path too Review finding: anton has two exec paths. 
turn() (CLI) routes through handle_scratchpad; turn_stream() (what cowork/cowork-server uses) handles exec inline and bypasses handle_scratchpad. So the single-scratchpad guard and the scratchpad ACC events (scratchpad_call/killed/empty_code) — all of which lived in handle_scratchpad — never fired in the streaming product, leaving detect_kill_loop / detect_oversized_cell / detect_name_switch blind there and the guard inert. Fix by centralizing on the shared entry points both paths already call: - Move the single-scratchpad guard and the pre-execute ACC events (scratchpad_empty_code, scratchpad_call) into prepare_scratchpad_exec. - Add observe_scratchpad_cell() for the post-execute event (killed vs result) and call it from BOTH handle_scratchpad and the streaming exec block. handle_scratchpad now just delegates; net ACC events emitted on the CLI path are unchanged. Incidental fix: a failed package-install string no longer mis-emits scratchpad_empty_code (the emit is now scoped to the empty-code branch inside prepare). Co-Authored-By: Claude Opus 4.8 (1M context) * ENG-350: default ANTON_ACC_MODE=active (mid-turn self-correction on by default) Team decision: ship the ACC mid-turn nudge on by default rather than gate it behind an eval — an off-by-default flag rots into forgotten noise, and the self-correction is most of the value. Revert path stays one env var (ANTON_ACC_MODE=passive to learn-next-turn, =off to disable). Running the e2e suite under the new default surfaced an interaction from the earlier guardrail work (not the flip itself): - _select_resilience_nudge returned "" for a generic scratchpad error, suppressing the nudge entirely. The ticket only called for avoiding scraping advice on size/timeout failures — a generic syntax/runtime error should still get the generic "failed twice, change approach" nudge. Fixed. 
- The two loop-safety e2e scenarios queued errors across DISTINCT scratchpad names, which now trips the single-scratchpad guard (challenge resets the streak) and conflates with the breaker/nudge being tested. Reused one name so they exercise the consecutive-error path they're actually about; the guard has its own tests. Full suite green: 35 e2e + 181 unit. Co-Authored-By: Claude Opus 4.8 (1M context) --------- Co-authored-by: Claude Opus 4.8 (1M context) --- anton/core/backends/utils.py | 24 +++- anton/core/llm/prompts.py | 23 +++- anton/core/memory/acc.py | 27 +++-- anton/core/session.py | 76 +++++++++--- anton/core/settings.py | 2 + anton/core/tools/tool_defs.py | 4 + anton/core/tools/tool_handlers.py | 54 +++------ anton/core/utils/scratchpad.py | 104 +++++++++++++++- tests/e2e/scenarios/test_loop_safety.py | 12 +- tests/test_acc.py | 8 +- tests/test_resilience_nudge.py | 45 +++++++ tests/test_scratchpad.py | 39 ++++-- tests/test_scratchpad_observer_dispatch.py | 132 +++++++++++++++++++++ 13 files changed, 466 insertions(+), 84 deletions(-) create mode 100644 tests/test_resilience_nudge.py diff --git a/anton/core/backends/utils.py b/anton/core/backends/utils.py index 07cd1796..0d87b8ff 100644 --- a/anton/core/backends/utils.py +++ b/anton/core/backends/utils.py @@ -8,7 +8,23 @@ def compute_timeouts(estimated_seconds: int) -> tuple[float, float]: """ s = CoreSettings() if estimated_seconds <= 0: - return float(s.cell_timeout_default), float(s.cell_inactivity_timeout) - total = max(estimated_seconds * 2, estimated_seconds + 30) - inactivity = max(estimated_seconds * 0.5, 30) - return float(total), float(inactivity) \ No newline at end of file + total = float(s.cell_timeout_default) + inactivity = float(s.cell_inactivity_timeout) + else: + total = float(max(estimated_seconds * 2, estimated_seconds + 30)) + inactivity = float(max(estimated_seconds * 0.5, 30)) + # Clamp the silence window: a large estimate must not buy minutes of + # undetected silence (an est=600 
cell would otherwise allow 300s of no + # output before being killed). A cell quiet for cell_inactivity_max + # seconds is killed regardless of its estimate. stdout/progress() reset + # this window, so legitimate long-but-active cells — e.g. a batch loop + # pinging progress() — are unaffected; only genuinely stuck cells die. + inactivity = min(inactivity, float(s.cell_inactivity_max)) + # The total is deliberately left scaling so long-but-active cells run to + # completion. cell_total_max (default 0 = off) is an optional absolute + # backstop for a runaway that keeps producing output forever (which the + # inactivity cap can't catch); set it only when that risk outweighs + # clipping a genuinely long batch job. + if s.cell_total_max > 0: + total = min(total, float(s.cell_total_max)) + return total, inactivity \ No newline at end of file diff --git a/anton/core/llm/prompts.py b/anton/core/llm/prompts.py index 245a48c1..6a20468e 100644 --- a/anton/core/llm/prompts.py +++ b/anton/core/llm/prompts.py @@ -322,8 +322,8 @@ Do NOT build a single 20KB+ HTML string in memory and write it at the end. 3. CAP STRING SIZE PER CELL at ~5KB. Large-string scratchpad calls are the \ single biggest cause of silent failures (the tool occasionally drops the \ -`code` payload on oversized inputs and returns "No code provided", which still \ -counts against the round cap). If a section is too big, split it. +`code` payload on oversized inputs and the cell comes back with an empty-code \ +error, which still counts against the round cap). If a section is too big, split it. 4. NEVER re-emit the full HTML mid-build. Append deltas, don't re-print \ the world. Assembly is a one-line concat at the end, not a re-render of \ everything you've written so far. @@ -810,3 +810,22 @@ async def hello(): "a public API, archive.org, an alternate library, or a completely different data source. " "Only involve the user if the problem truly requires something only they can provide." 
) + +# Scratchpad failures need different advice than the generic (scrape/fetch) +# RESILIENCE_NUDGE above — telling the model to "try a public API / archive.org" +# when a cell is too big or too slow just sends it renaming-and-retrying. These +# are chosen by failure type in ChatSession._apply_error_tracking. +SCRATCHPAD_SIZE_NUDGE = ( + "\n\nSYSTEM: This scratchpad cell keeps failing on its size, not its logic. " + "Stop retrying the same large cell. Write the output to disk incrementally — " + "open(path, 'w') once, then open(path, 'a') to append each chunk, keeping each " + "cell's string under ~5KB — or generate the content inside the cell instead of " + "passing a large literal. Reuse the SAME scratchpad; do not rename it." +) +SCRATCHPAD_TIMEOUT_NUDGE = ( + "\n\nSYSTEM: This scratchpad cell keeps timing out — the work is too heavy, not " + "the write. Make the next cell smaller: fewer rows/items per cell, split a long " + "loop across cells (process a batch, return, continue), or narrow the scope. Call " + "progress() inside long loops so active work isn't mistaken for a hang. Reuse the " + "SAME scratchpad; do not rename it." +) diff --git a/anton/core/memory/acc.py b/anton/core/memory/acc.py index 17c35e72..61b88cb1 100644 --- a/anton/core/memory/acc.py +++ b/anton/core/memory/acc.py @@ -446,26 +446,33 @@ def detect_reset_churn(events: Sequence[Event]) -> Lesson | None: def detect_kill_loop(events: Sequence[Event]) -> Lesson | None: - """The same scratchpad name had >= N cells killed (timeout/cancel/OOM). + """>= N scratchpad cells were killed (timeout/cancel/OOM) in one turn. + + Fires when a single scratchpad is killed >= N times (a per-pad loop) OR + when >= N cells are killed across the turn regardless of name. The + name-agnostic count is deliberate: renaming the scratchpad between failed + attempts (`build_pres` → `write_html` → …) used to split the kill count + across buckets and hide the loop. 
A kill is a kill, and the right lesson + (make the next cell smaller) is the same either way. Reads `kind == "scratchpad_killed"`; looks at `detail.name`. """ + killed = [e for e in events if e.kind == "scratchpad_killed"] by_name: defaultdict[str, int] = defaultdict(int) - for e in events: - if e.kind != "scratchpad_killed": - continue + for e in killed: n = e.detail.get("name") or "" if n: by_name[n] += 1 - if not by_name or max(by_name.values()) < _KILL_LOOP_THRESHOLD: + per_name_max = max(by_name.values()) if by_name else 0 + if per_name_max < _KILL_LOOP_THRESHOLD and len(killed) < _KILL_LOOP_THRESHOLD: return None return Lesson( rule=( - "When a scratchpad cell is killed (timeout, cancel, OOM), " - "the next cell on the same scratchpad needs to be smaller — " - "fewer rows, smaller batch, explicit timeout, narrower scope. " - "Two kills on the same scratchpad means the approach itself is " - "too heavy, not that the same cell needs another try." + "When a scratchpad cell is killed (timeout, cancel, OOM), the next " + "cell needs to be smaller — fewer rows, smaller batch, explicit " + "timeout, narrower scope — and stay on the SAME scratchpad. Two " + "kills in a turn (even across renamed scratchpads) mean the approach " + "is too heavy, not that the same cell needs another try." 
), kind="when", triggers=("scratchpad_killed",), diff --git a/anton/core/session.py b/anton/core/session.py index 58abf956..ebe0f5f7 100644 --- a/anton/core/session.py +++ b/anton/core/session.py @@ -17,7 +17,11 @@ from anton.core.memory.cerebellum import Cerebellum from anton.core.memory.skills import SkillStore from anton.core.tools.recall_skill import RECALL_SKILL_TOOL -from anton.core.llm.prompts import RESILIENCE_NUDGE +from anton.core.llm.prompts import ( + RESILIENCE_NUDGE, + SCRATCHPAD_SIZE_NUDGE, + SCRATCHPAD_TIMEOUT_NUDGE, +) from anton.core.llm.provider import ( ContextOverflowError, StreamComplete, @@ -48,7 +52,11 @@ UPDATE_ARTIFACT_METADATA_TOOL, ToolDef, ) -from anton.core.utils.scratchpad import prepare_scratchpad_exec, format_cell_result +from anton.core.utils.scratchpad import ( + prepare_scratchpad_exec, + format_cell_result, + observe_scratchpad_cell, +) from anton.explainability import ExplainabilityCollector, ExplainabilityStore @@ -225,16 +233,17 @@ def _acc_has_similar(rule: str) -> bool: # turn. Mirrors ANTON_MEMORY_MODE for shape consistency: # "off" — ACC observes nothing (skipped at every emit site). # "passive" — Layer 1: lessons drain to memory at end-of-turn, - # next turn's system prompt picks them up. SAFE - # DEFAULT — adds no surface-area to the turn loop. - # "active" — Layer 2: ALSO inject lessons inline as text - # blocks in tool_results so the LLM sees them on - # the very next round. Stronger learning signal, - # but more invasive — the LLM has to handle the - # nudge gracefully without confusing it for a - # user instruction. - _mode_raw = os.environ.get("ANTON_ACC_MODE", "passive").strip().lower() - self._acc_mode = _mode_raw if _mode_raw in ("off", "passive", "active") else "passive" + # next turn's system prompt picks them up. No + # surface-area on the turn loop. 
+ # "active" — Layer 2 (DEFAULT): ALSO inject lessons inline as + # text blocks in tool_results so the LLM sees them on + # the very next round and can self-correct mid-task. + # Stronger signal; the nudge is clearly labelled as an + # automatic self-check (not a user instruction). Set + # ANTON_ACC_MODE=passive to revert to learn-next-turn, + # or =off to disable, if it ever causes trouble. + _mode_raw = os.environ.get("ANTON_ACC_MODE", "active").strip().lower() + self._acc_mode = _mode_raw if _mode_raw in ("off", "passive", "active") else "active" # Scratchpad observers — list of objects with on_pre_execute / # on_post_execute. Fired by handle_scratchpad around pad.execute. # The runtime never sees this list; observation lives at the @@ -303,8 +312,10 @@ def _apply_error_tracking( streak = error_streak.get(tool_name, 0) if streak >= self._resilience_nudge_at and tool_name not in resilience_nudged: - result_text += RESILIENCE_NUDGE - resilience_nudged.add(tool_name) + nudge = self._select_resilience_nudge(tool_name, result_text) + if nudge: + result_text += nudge + resilience_nudged.add(tool_name) if streak >= self._max_consecutive_errors: result_text += ( @@ -315,6 +326,34 @@ def _apply_error_tracking( return result_text + @staticmethod + def _select_resilience_nudge(tool_name: str, result_text: str) -> str: + """Pick the right soft-nudge for a repeated failure. + + The generic RESILIENCE_NUDGE is scrape/fetch advice ("try a public + API / archive.org / different headers"). That actively misdirects a + scratchpad failure: a cell that's too big or too slow doesn't need a + different data source, it needs to be chunked or scoped down. Route + scratchpad failures to size/timeout-specific guidance by inspecting + the error text; a generic scratchpad error (e.g. a SyntaxError) and + every non-scratchpad tool keep the generic nudge. 
+ """ + if tool_name != "scratchpad": + return RESILIENCE_NUDGE + low = result_text.lower() + if "timed out" in low or "inactivity" in low: + return SCRATCHPAD_TIMEOUT_NUDGE + # Match the empty-code dispatcher message specifically — generic + # phrases like "too large"/"truncated" appear in unrelated errors + # (e.g. a MySQL "Data truncated for column" warning) and would + # misfire the chunking advice. + if "argument was empty" in low: + return SCRATCHPAD_SIZE_NUDGE + # Other scratchpad failures (syntax/runtime errors): the generic + # "you've failed twice, change approach" nudge still applies — only + # the size/timeout cases get specialised advice. + return RESILIENCE_NUDGE + def repair_history(self) -> None: """Fix dangling tool_use blocks left by mid-stream cancellation. @@ -1791,6 +1830,15 @@ async def _stream_and_handle_tools( description=description, cell=cell, ) + # Same post-execute ACC event as the CLI + # path (handle_scratchpad) — this inline + # streaming exec bypasses that handler, so + # without this scratchpad_killed/result + # would never fire here and detect_kill_loop + # would be blind in the streaming product. 
+ observe_scratchpad_cell( + self, tc.input.get("name", ""), cell + ) yield StreamToolResult( name=tc.name, action="exec", diff --git a/anton/core/settings.py b/anton/core/settings.py index fb631a1b..46e6b07b 100644 --- a/anton/core/settings.py +++ b/anton/core/settings.py @@ -17,6 +17,8 @@ class CoreSettings(BaseSettings): cell_timeout_default: int = 120 # Total timeout when no estimate given (s) cell_inactivity_timeout: int = 30 # Max silence between output lines (s) cell_inactivity_after_progress: int = 60 # Grace window after progress() call (s) + cell_inactivity_max: int = 60 # Ceiling on the silence window even when a large estimate scales it up (s) + cell_total_max: int = 0 # Optional absolute ceiling on total cell runtime (s); 0 = off (let it scale) cell_install_timeout: int = 120 # pip/uv install timeout (s) cell_keep_recent: int = 5 # Recent cells preserved during compaction diff --git a/anton/core/tools/tool_defs.py b/anton/core/tools/tool_defs.py index 2b0b182c..c37a24b5 100644 --- a/anton/core/tools/tool_defs.py +++ b/anton/core/tools/tool_defs.py @@ -93,6 +93,10 @@ class ToolDef: "type": "integer", "description": "Estimated execution time in seconds. Drives the total timeout (roughly 2x estimate). Use progress() for long cells.", }, + "confirm_new_scratchpad": { + "type": "boolean", + "description": "Set true only to deliberately create a SECOND scratchpad while one is already in use this task. Normally reuse one scratchpad name for the whole task — each name is a separate isolated environment, so a new one loses all existing state. 
Leave unset/false unless you truly need isolation.", + }, }, "required": ["action", "name"], }, diff --git a/anton/core/tools/tool_handlers.py b/anton/core/tools/tool_handlers.py index c23ca94e..6c8625a2 100644 --- a/anton/core/tools/tool_handlers.py +++ b/anton/core/tools/tool_handlers.py @@ -4,7 +4,11 @@ from typing import TYPE_CHECKING from anton.core.backends.base import Cell -from anton.core.utils.scratchpad import prepare_scratchpad_exec, format_cell_result +from anton.core.utils.scratchpad import ( + prepare_scratchpad_exec, + format_cell_result, + observe_scratchpad_cell, +) if TYPE_CHECKING: from anton.chat_session import ChatSession @@ -408,25 +412,17 @@ def _acc_observe(kind: str, detail: dict, *, severity: int = 1) -> None: fn(kind, detail, severity=severity) if action == "exec": + # The single-scratchpad guard and the pre-execute ACC events + # (scratchpad_empty_code / scratchpad_call) live in + # prepare_scratchpad_exec — the SHARED entry point that the streaming + # path (ChatSession.turn_stream) also calls — so they fire on both + # paths. A str return is a message the call should not run past + # (empty code, single-scratchpad challenge, or install failure). result = await prepare_scratchpad_exec(session, tc_input) if isinstance(result, str): - # Empty / malformed code parameter — the dispatcher rejected - # it before reaching the runtime. This is exactly the - # "silent code-clip" failure mode the ACC's - # detect_oversized_cell watches for. - _acc_observe("scratchpad_empty_code", {"name": name}, severity=7) return result pad, code, description, estimated_time, estimated_seconds = result - _acc_observe( - "scratchpad_call", - { - "name": name, - "code_len": len(code or ""), - "one_line_description": description or "", - }, - ) - # Notify pre-execute observers (e.g. cerebellum). 
The runtime # never sees these — observation is an orchestration concern, # so it lives at the dispatcher layer where the data is most @@ -452,31 +448,9 @@ def _acc_observe(kind: str, detail: dict, *, severity: int = 1) -> None: pad_name=name, description=description, cell=cell, ) await _fire_post_execute(session, cell) - # ACC: distinguish "killed" (timeout/cancel/OOM) from a - # plain runtime error. The local backend sets cell.error - # to a string starting with "Cancelled" or matching the - # "Cell timed out"/"Cell killed" prefixes from the - # asyncio.TimeoutError path. Everything else (NameError, - # ImportError, …) is a regular result with success=False. - err = (cell.error or "").strip() - if err.startswith(("Cancelled", "Cell timed out", "Cell killed")): - _acc_observe( - "scratchpad_killed", - {"name": name, "reason": err[:120]}, - severity=6, - ) - else: - success = not err and not (cell.stderr or "").strip() - _acc_observe( - "scratchpad_result", - { - "name": name, - "success": success, - "stdout_len": len(cell.stdout or ""), - "error": err[:300] if err else "", - }, - severity=5 if not success else 1, - ) + # Post-execute ACC event (killed vs result) via the shared helper — + # the streaming path emits the same. + observe_scratchpad_cell(session, name, cell) return format_cell_result(cell) elif action == "view": diff --git a/anton/core/utils/scratchpad.py b/anton/core/utils/scratchpad.py index da518ff4..2cf15bc6 100644 --- a/anton/core/utils/scratchpad.py +++ b/anton/core/utils/scratchpad.py @@ -5,16 +5,107 @@ from anton.core.session import ChatSession +def _acc_observe(session, kind: str, detail: dict, *, severity: int = 1) -> None: + """Safe ACC emit — no-op if the session has no observer wired.""" + fn = getattr(session, "_acc_observe", None) + if fn is not None: + fn(kind, detail, severity=severity) + + +def observe_scratchpad_cell(session, name: str, cell) -> None: + """Emit the post-execute ACC event for a finished cell. 
+ + Distinguishes a kill (timeout/cancel/OOM) from a plain runtime error so + detect_kill_loop sees `scratchpad_killed`. Shared by both exec paths — + `handle_scratchpad` (CLI `turn()`) and the inline streaming exec in + `ChatSession.turn_stream` — so the ACC instrumentation is identical + regardless of which path ran the cell. + """ + if cell is None: + return + err = (cell.error or "").strip() + if err.startswith(("Cancelled", "Cell timed out", "Cell killed")): + _acc_observe(session, "scratchpad_killed", {"name": name, "reason": err[:120]}, severity=6) + else: + success = not err and not (cell.stderr or "").strip() + _acc_observe( + session, + "scratchpad_result", + { + "name": name, + "success": success, + "stdout_len": len(cell.stdout or ""), + "error": err[:300] if err else "", + }, + severity=5 if not success else 1, + ) + + async def prepare_scratchpad_exec(session: ChatSession, tc_input: dict): """Validate and prepare a scratchpad exec call. Returns (pad, code, description, estimated_time, estimated_seconds) or - a str error message if validation fails. + a str message if the call should not run (empty code, a single-scratchpad + challenge, or a failed package install). + + This is the SHARED entry point for both exec paths — `handle_scratchpad` + (CLI) and the inline streaming exec in `ChatSession.turn_stream` (cowork) + both call it — so the single-scratchpad guard and the pre-execute ACC + events live here, not in `handle_scratchpad` (which the streaming path + bypasses). """ name = tc_input.get("name", "") code = tc_input.get("code", "") if not code or not code.strip(): - return "No code provided." + # An empty `code` on an exec call is almost never the model meaning + # to run nothing — it's the large-payload drop: an oversized `code` + # argument gets truncated to "" in transit. Returning a bare "no + # code" here used to read as a no-op, so the model would retry the + # same oversized cell. 
Make the failure self-correcting and ensure + # it reads as an error (note the word "failed") so the per-tool + # error streak in _apply_error_tracking counts it toward the + # circuit breaker instead of silently resetting. + _acc_observe(session, "scratchpad_empty_code", {"name": name}, severity=7) + return ( + "Scratchpad exec failed: the `code` argument was empty. This usually " + "means the code payload was too large and got truncated in transit. " + "Do NOT retry the same large cell — instead write the output to disk in " + "small append steps (open(path, 'a'), keep each cell's string under ~5KB), " + "or generate the content inside the cell rather than passing a big literal." + ) + + # Single-scratchpad guard: the agent should reuse ONE scratchpad per task. + # A new name spins up a separate, empty process — state from the existing + # pad isn't visible there — a common source of wasted rounds (re-import, + # re-fetch, shuffling state across pads). Challenge a new name when the + # agent already has a working scratchpad this session, unless it confirms + # it needs isolation. Tracked names are ones the agent has exec'd here — + # NOT session._scratchpads.pads, which also holds system-created pads + # (e.g. the artifact backend launcher's slug pad), which must never count + # against the agent. Challenge AT MOST ONCE per session: the challenge is + # not an error (it advances no error streak), so re-challenging every new name + # could loop to the round cap with nothing to stop it; one firm nudge is + # the enforcement, then respect the model's choice. `is True` (not + # truthiness) so a MagicMock attr in tests doesn't read as "challenged".
+ seen = getattr(session, "_agent_scratchpad_names", None) + if not isinstance(seen, set): + seen = set() + session._agent_scratchpad_names = seen + confirm_new = bool(tc_input.get("confirm_new_scratchpad", False)) + challenged_before = getattr(session, "_scratchpad_challenged", False) is True + if name not in seen and seen and not confirm_new and not challenged_before: + session._scratchpad_challenged = True + existing = "', '".join(sorted(seen)) + return ( + f"You already have an active scratchpad ('{existing}') with live state " + f"(imports, variables, fetched data). Starting a new one named '{name}' " + "creates a SEPARATE, empty environment — nothing from the existing " + "scratchpad is available there, so you'd re-import and re-fetch. Reuse the " + "existing scratchpad for this task; it is stateful across cells. If you " + "genuinely need an isolated environment, call scratchpad exec again with " + "confirm_new_scratchpad=true." + ) + seen.add(name) pad = await session._scratchpads.get_or_create(name) @@ -34,6 +125,15 @@ async def prepare_scratchpad_exec(session: ChatSession, tc_input: dict): estimated_seconds = 0 estimated_time = f"{estimated_seconds}s" if estimated_seconds > 0 else "" + _acc_observe( + session, + "scratchpad_call", + { + "name": name, + "code_len": len(code or ""), + "one_line_description": description or "", + }, + ) return pad, code, description, estimated_time, estimated_seconds diff --git a/tests/e2e/scenarios/test_loop_safety.py b/tests/e2e/scenarios/test_loop_safety.py index 61d40deb..25fdf5a6 100644 --- a/tests/e2e/scenarios/test_loop_safety.py +++ b/tests/e2e/scenarios/test_loop_safety.py @@ -63,9 +63,12 @@ def test_session_exits_within_timeout(cfg, stub, tmp_path): @pytest.mark.stub_only def test_resilience_nudge_injected_after_two_errors(cfg, stub, tmp_path): + # Reuse ONE scratchpad name: a realistic retry loop is the same cell + # failing twice. 
(Distinct names would instead trip the single-scratchpad + # guard, which is exercised separately.) bad_code = "def oops(:\n pass" - stub.queue_tool_call("scratchpad", {"action": "exec", "name": "bad1", "code": bad_code}) - stub.queue_tool_call("scratchpad", {"action": "exec", "name": "bad2", "code": bad_code}) + stub.queue_tool_call("scratchpad", {"action": "exec", "name": "bad", "code": bad_code}) + stub.queue_tool_call("scratchpad", {"action": "exec", "name": "bad", "code": bad_code}) stub.queue_text("NUDGE_RECEIVED") stub.queue_verification_ok() result = run_anton(["--folder", str(tmp_path)], ["do bad stuff", "exit"], @@ -82,9 +85,12 @@ def test_resilience_nudge_injected_after_two_errors(cfg, stub, tmp_path): @pytest.mark.stub_only def test_circuit_breaker_fires_after_five_consecutive_errors(cfg, stub, tmp_path): + # Reuse ONE scratchpad name so this exercises the consecutive-error + # circuit breaker, not the single-scratchpad guard (distinct names would + # trigger a guard challenge that resets the streak). bad_code = "def bad(:\n pass" for i in range(5): - stub.queue_tool_call("scratchpad", {"action": "exec", "name": f"err_{i}", "code": bad_code}) + stub.queue_tool_call("scratchpad", {"action": "exec", "name": "err", "code": bad_code}) stub.queue_text("ERRORS_EXHAUSTED") stub.queue_verification_ok() result = run_anton(["--folder", str(tmp_path)], ["break everything", "exit"], diff --git a/tests/test_acc.py b/tests/test_acc.py index 2ed7f114..448b2133 100644 --- a/tests/test_acc.py +++ b/tests/test_acc.py @@ -244,12 +244,16 @@ def test_fires_on_two_kills_same_name(self): assert lesson is not None assert lesson.detector == "detect_kill_loop" - def test_silent_when_kills_are_for_different_names(self): + def test_fires_on_kills_across_different_names(self): + # Renaming the scratchpad between failed attempts must NOT hide the + # loop — two kills in a turn fire regardless of name. 
events = [ Event("scratchpad_killed", 6, {"name": "a", "reason": "timeout"}, 1), Event("scratchpad_killed", 6, {"name": "b", "reason": "timeout"}, 2), ] - assert detect_kill_loop(events) is None + lesson = detect_kill_loop(events) + assert lesson is not None + assert lesson.detector == "detect_kill_loop" def test_silent_on_single_kill(self): events = [Event("scratchpad_killed", 6, {"name": "compute"}, 3)] diff --git a/tests/test_resilience_nudge.py b/tests/test_resilience_nudge.py new file mode 100644 index 00000000..7a62a959 --- /dev/null +++ b/tests/test_resilience_nudge.py @@ -0,0 +1,45 @@ +"""Tests for ChatSession._select_resilience_nudge — failure-type-aware nudging. + +The generic RESILIENCE_NUDGE is scrape/fetch advice and misdirects scratchpad +failures (a too-big or too-slow cell doesn't need a different data source). The +selector routes scratchpad size/timeout failures to specific guidance and keeps +the generic nudge for everything else. +""" + +from __future__ import annotations + +from anton.core.llm.prompts import ( + RESILIENCE_NUDGE, + SCRATCHPAD_SIZE_NUDGE, + SCRATCHPAD_TIMEOUT_NUDGE, +) +from anton.core.session import ChatSession + +_select = ChatSession._select_resilience_nudge + + +class TestSelectResilienceNudge: + def test_non_scratchpad_tool_gets_generic_nudge(self): + assert _select("web_fetch", "failed to fetch the page") == RESILIENCE_NUDGE + + def test_scratchpad_timeout_gets_timeout_nudge(self): + assert _select("scratchpad", "Cell timed out after 180s total") == SCRATCHPAD_TIMEOUT_NUDGE + + def test_scratchpad_inactivity_gets_timeout_nudge(self): + msg = "Cell killed after 60s of inactivity (no output or progress() calls)" + assert _select("scratchpad", msg) == SCRATCHPAD_TIMEOUT_NUDGE + + def test_scratchpad_empty_code_gets_size_nudge(self): + msg = "Scratchpad exec failed: the `code` argument was empty. ..." 
+ assert _select("scratchpad", msg) == SCRATCHPAD_SIZE_NUDGE + + def test_scratchpad_generic_error_gets_generic_nudge(self): + # A NameError-style failure is neither size nor timeout; it still gets + # the generic "failed twice, change approach" nudge (only size/timeout + # get specialised scratchpad advice). + assert _select("scratchpad", "[error]\nNameError: name 'data' is not defined") == RESILIENCE_NUDGE + + def test_scratchpad_nudges_never_mention_scraping(self): + for nudge in (SCRATCHPAD_SIZE_NUDGE, SCRATCHPAD_TIMEOUT_NUDGE): + assert "archive.org" not in nudge + assert "data source" not in nudge diff --git a/tests/test_scratchpad.py b/tests/test_scratchpad.py index cd08d65a..ea9d580b 100644 --- a/tests/test_scratchpad.py +++ b/tests/test_scratchpad.py @@ -831,28 +831,53 @@ async def test_compute_timeouts_no_estimate(self): assert inactivity == 30.0 async def test_compute_timeouts_with_estimate(self): - """Estimate should scale total timeout and inactivity with no hard cap.""" + """Estimate scales the total with no cap; inactivity is clamped to cell_inactivity_max (default 60).""" from anton.core.backends.utils import compute_timeouts as _compute_timeouts # Small estimate: max(10*2, 10+30) = max(20, 40) = 40 total, inactivity = _compute_timeouts(10) assert total == 40.0 - assert inactivity == 30.0 # max(5, 30) = 30 + assert inactivity == 30.0 # max(5, 30) = 30, under the cap # Medium estimate: max(60*2, 60+30) = max(120, 90) = 120 total, inactivity = _compute_timeouts(60) assert total == 120.0 - assert inactivity == 30.0 # max(30, 30) = 30 + assert inactivity == 30.0 # max(30, 30) = 30, under the cap - # Large estimate: max(300*2, 300+30) = max(600, 330) = 600 + # Large estimate: total still scales, inactivity is capped at 60 total, inactivity = _compute_timeouts(300) assert total == 600.0 - assert inactivity == 150.0 # max(150, 30) = 150 + assert inactivity == 60.0 # min(max(150, 30), 60) = 60 - # Very large estimate: scales with estimate + # Very large 
estimate: total keeps scaling so long-but-active cells + # can run; the silence window stays capped. total, inactivity = _compute_timeouts(1000) assert total == 2000.0 - assert inactivity == 500.0 # max(500, 30) = 500 + assert inactivity == 60.0 # min(max(500, 30), 60) = 60 + + async def test_compute_timeouts_inactivity_cap_is_configurable(self): + """cell_inactivity_max bounds the silence window regardless of estimate.""" + from anton.core.backends import utils as _utils + from anton.core.settings import CoreSettings + + # est=300 would scale inactivity to 150s without the cap; with the + # default cap (60) it is clamped, and the cap is tunable via settings. + total, inactivity = _utils.compute_timeouts(300) + assert inactivity == float(CoreSettings().cell_inactivity_max) + assert total == 600.0 # total is intentionally left uncapped + + async def test_compute_timeouts_total_max_off_by_default(self): + """cell_total_max defaults to 0 — the total is uncapped out of the box.""" + from anton.core.settings import CoreSettings + assert CoreSettings().cell_total_max == 0 + + async def test_compute_timeouts_total_max_backstop(self, monkeypatch): + """When set, cell_total_max bounds the total; inactivity stays capped.""" + from anton.core.backends.utils import compute_timeouts as _compute_timeouts + monkeypatch.setenv("ANTON_CELL_TOTAL_MAX", "300") + total, inactivity = _compute_timeouts(1000) + assert total == 300.0 # min(2000, 300) + assert inactivity == 60.0 class TestSampleFunction: diff --git a/tests/test_scratchpad_observer_dispatch.py b/tests/test_scratchpad_observer_dispatch.py index 1a99c4a5..f80a2a13 100644 --- a/tests/test_scratchpad_observer_dispatch.py +++ b/tests/test_scratchpad_observer_dispatch.py @@ -23,6 +23,59 @@ _fire_pre_execute, handle_scratchpad, ) +from anton.core.utils.scratchpad import observe_scratchpad_cell + + +class _RecordingAccSession: + """Session stub that records ACC observations.""" + + def __init__(self): + self.events: list[tuple] = 
[] + + def _acc_observe(self, kind, detail, *, severity=1): + self.events.append((kind, detail, severity)) + + +class TestObserveScratchpadCell: + """observe_scratchpad_cell is the shared post-exec ACC emitter used by + BOTH the CLI (handle_scratchpad) and streaming (turn_stream) paths.""" + + def test_timeout_kill_emits_scratchpad_killed(self): + s = _RecordingAccSession() + cell = Cell(code="x", stdout="", stderr="", error="Cell timed out after 180s total. Process killed") + observe_scratchpad_cell(s, "dash", cell) + assert s.events[0][0] == "scratchpad_killed" + assert s.events[0][1]["name"] == "dash" + + def test_inactivity_kill_emits_scratchpad_killed(self): + s = _RecordingAccSession() + cell = Cell(code="x", stdout="", stderr="", error="Cell killed after 60s of inactivity") + observe_scratchpad_cell(s, "dash", cell) + assert s.events[0][0] == "scratchpad_killed" + + def test_runtime_error_emits_result_failure(self): + s = _RecordingAccSession() + cell = Cell(code="x", stdout="", stderr="", error="Traceback...\nNameError: x") + observe_scratchpad_cell(s, "dash", cell) + assert s.events[0][0] == "scratchpad_result" + assert s.events[0][1]["success"] is False + + def test_success_emits_result_success(self): + s = _RecordingAccSession() + cell = Cell(code="x", stdout="42", stderr="", error=None) + observe_scratchpad_cell(s, "dash", cell) + assert s.events[0][0] == "scratchpad_result" + assert s.events[0][1]["success"] is True + + def test_none_cell_emits_nothing(self): + s = _RecordingAccSession() + observe_scratchpad_cell(s, "dash", None) + assert s.events == [] + + def test_no_acc_observer_is_noop(self): + # A session without _acc_observe (e.g. ACC off) must not raise. 
+ observe_scratchpad_cell(SimpleNamespace(), "dash", + Cell(code="x", stdout="", stderr="", error=None)) # ───────────────────────────────────────────────────────────────────────────── @@ -288,3 +341,82 @@ async def test_non_exec_actions_do_not_fire_observers(self): assert obs.pre_calls == [] assert obs.post_calls == [] + + +# ───────────────────────────────────────────────────────────────────────────── +# Single-scratchpad guard — challenge a second distinct scratchpad per task +# ───────────────────────────────────────────────────────────────────────────── + +_CHALLENGE_MARK = "confirm_new_scratchpad=true" + + +class TestSingleScratchpadGuard: + def _exec(self, name: str, **extra: object) -> dict: + tc = { + "action": "exec", + "name": name, + "code": "print(1)", + "one_line_description": "do a thing", + "estimated_execution_time_seconds": 5, + } + tc.update(extra) + return tc + + @pytest.mark.asyncio + async def test_first_scratchpad_not_challenged(self): + session, _ = _fake_session() + result = await handle_scratchpad(session, self._exec("dash")) + assert _CHALLENGE_MARK not in result + assert "dash" in session._agent_scratchpad_names + + @pytest.mark.asyncio + async def test_reusing_same_name_not_challenged(self): + session, _ = _fake_session() + session._agent_scratchpad_names = {"dash"} + result = await handle_scratchpad(session, self._exec("dash")) + assert _CHALLENGE_MARK not in result + + @pytest.mark.asyncio + async def test_second_distinct_name_is_challenged(self): + session, _ = _fake_session() + session._agent_scratchpad_names = {"dash"} + result = await handle_scratchpad(session, self._exec("report")) + assert _CHALLENGE_MARK in result + # The challenged name must NOT be recorded, so a later confirm works. + assert "report" not in session._agent_scratchpad_names + # A challenge is not a failure — it must not contain an error marker + # that would trip the per-tool circuit breaker. 
+ assert "failed" not in result and "[error]" not in result + + @pytest.mark.asyncio + async def test_confirm_allows_second_scratchpad(self): + session, _ = _fake_session() + session._agent_scratchpad_names = {"dash"} + result = await handle_scratchpad( + session, self._exec("report", confirm_new_scratchpad=True) + ) + assert _CHALLENGE_MARK not in result + assert "report" in session._agent_scratchpad_names + + @pytest.mark.asyncio + async def test_challenge_fires_at_most_once_per_session(self): + # The challenge must not be able to induce its own loop: a model that + # keeps requesting new names without confirming is nudged once, then + # allowed (the challenge isn't an error, so nothing else would stop it). + session, _ = _fake_session() + session._agent_scratchpad_names = {"dash"} + first = await handle_scratchpad(session, self._exec("report")) + assert _CHALLENGE_MARK in first + second = await handle_scratchpad(session, self._exec("report2")) + assert _CHALLENGE_MARK not in second + assert "report2" in session._agent_scratchpad_names + + @pytest.mark.asyncio + async def test_system_pads_do_not_count_against_agent(self): + # A system-created pad (e.g. the artifact backend launcher's slug pad) + # lives in _scratchpads.pads but never in _agent_scratchpad_names, so + # the agent's first real scratchpad is not challenged by its presence. 
+ session, _ = _fake_session() + session._scratchpads.pads = {"my-artifact-slug": MagicMock()} + result = await handle_scratchpad(session, self._exec("dash")) + assert _CHALLENGE_MARK not in result From 50c24b84985f84f7ed9283cd72c9d7e2958170c5 Mon Sep 17 00:00:00 2001 From: Jorge Torres Date: Thu, 18 Jun 2026 14:23:46 -0700 Subject: [PATCH 5/9] =?UTF-8?q?prompt:=20'act=5Ffirst'=20flag=20=E2=80=94?= =?UTF-8?q?=20do=20first,=20surface=20assumptions,=20ask=20later?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an act_first config flag (AntonSettings.act_first / ChatSessionConfig.act_first, default True) that selects the conversation-discipline posture: • act_first=True → bias toward action; act on reasonable defaults and STATE each assumption inline as it's made so the user can redirect mid-flight; only stop to ask when a wrong guess is costly/irreversible or unknowable. • act_first=False → the previous cautious ask-first discipline. prompts.py exposes both blocks (CONVERSATION_DISCIPLINE_ACT_FIRST/ASK_FIRST) and a {conversation_discipline} slot; the builder picks one from the flag. Wired through session + the chat_session/chat/runtime entry points. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- anton/chat.py | 1 + anton/chat_session.py | 1 + anton/config/settings.py | 4 +++ anton/core/llm/prompt_builder.py | 9 ++++++ anton/core/llm/prompts.py | 48 ++++++++++++++++++++++++++------ anton/core/runtime.py | 1 + anton/core/session.py | 6 ++++ 7 files changed, 61 insertions(+), 9 deletions(-) diff --git a/anton/chat.py b/anton/chat.py index 5ce72fd7..9cd2ef67 100644 --- a/anton/chat.py +++ b/anton/chat.py @@ -1271,6 +1271,7 @@ async def _chat_loop( history_store=history_store, session_id=current_session_id, proactive_dashboards=settings.proactive_dashboards, + act_first=settings.act_first, output_dir=settings.artifacts_dir, tools=[CONNECT_DATASOURCE_TOOL, PUBLISH_TOOL], web_search_enabled=settings.web_search_enabled, diff --git a/anton/chat_session.py b/anton/chat_session.py index 0f7ac11a..200bd641 100644 --- a/anton/chat_session.py +++ b/anton/chat_session.py @@ -116,6 +116,7 @@ def rebuild_session( history_store=history_store, session_id=session_id, proactive_dashboards=settings.proactive_dashboards, + act_first=settings.act_first, output_dir=settings.artifacts_dir, web_search_enabled=settings.web_search_enabled, web_fetch_enabled=settings.web_fetch_enabled, diff --git a/anton/config/settings.py b/anton/config/settings.py index 31257809..fc930403 100644 --- a/anton/config/settings.py +++ b/anton/config/settings.py @@ -85,6 +85,10 @@ class AntonSettings(CoreSettings): proactive_dashboards: bool = False # when True, build HTML dashboards; when False, CLI output only + # "Do first, ask later": act on reasonable defaults and surface assumptions + # inline instead of stopping to ask. False = cautious ask-first discipline. 
+ act_first: bool = True + theme: str = "auto" disable_autoupdates: bool = False diff --git a/anton/core/llm/prompt_builder.py b/anton/core/llm/prompt_builder.py index 9d50a80c..232cf526 100644 --- a/anton/core/llm/prompt_builder.py +++ b/anton/core/llm/prompt_builder.py @@ -8,6 +8,8 @@ BASE_VISUALIZATIONS_PROMPT, BACKEND_GENERATION_PROMPT, CHAT_SYSTEM_PROMPT, + CONVERSATION_DISCIPLINE_ACT_FIRST, + CONVERSATION_DISCIPLINE_ASK_FIRST, VISUALIZATIONS_MARKDOWN_OUTPUT_FORMAT_PROMPT, VISUALIZATIONS_HTML_OUTPUT_FORMAT_PROMPT, ) @@ -128,6 +130,7 @@ def build( system_prompt_context: SystemPromptContext, proactive_dashboards: bool, output_dir: str, + act_first: bool = True, tool_defs: list["ToolDef"] | None = None, memory_context: str = "", project_context: str = "", @@ -146,10 +149,16 @@ def build( if prefix: prompt += f"{prefix}\n\n" + conversation_discipline = ( + CONVERSATION_DISCIPLINE_ACT_FIRST if act_first + else CONVERSATION_DISCIPLINE_ASK_FIRST + ) + prompt += CHAT_SYSTEM_PROMPT.format( runtime_context=system_prompt_context.runtime_context, artifacts_section=ARTIFACTS_PROMPT, visualizations_section=visualizations_section, + conversation_discipline=conversation_discipline, current_datetime=current_datetime, ) diff --git a/anton/core/llm/prompts.py b/anton/core/llm/prompts.py index 245a48c1..e671e28d 100644 --- a/anton/core/llm/prompts.py +++ b/anton/core/llm/prompts.py @@ -160,15 +160,7 @@ {visualizations_section} -CONVERSATION DISCIPLINE (critical): -- If you ask the user a question, STOP and WAIT for their reply. Never ask a question \ -and then act in the same turn — that skips the user's answer. -- Only act when you have ALL the information you need. If you're unsure \ -about anything, ask first, then act in a LATER turn after receiving the answer. -- When the user gives a vague answer (like "yeah", "the current one", "sure"), interpret \ -it in context of what you just asked. Do not ask them to repeat themselves. 
-- Gather requirements incrementally through conversation. Do not front-load every \ -possible question at once — ask 1-3 at a time, then follow up. +{conversation_discipline} RUNTIME IDENTITY: {runtime_context} @@ -210,6 +202,44 @@ Only encode genuinely reusable knowledge — not transient conversation details. """ +# --------------------------------------------------------------------------- +# Conversation discipline — two postures, selected by the `act_first` flag +# (ChatSessionConfig.act_first → AntonSettings.act_first; default True). +# Injected into CHAT_SYSTEM_PROMPT via {conversation_discipline}. +# --------------------------------------------------------------------------- +CONVERSATION_DISCIPLINE_ACT_FIRST = """CONVERSATION DISCIPLINE (critical): +- Bias toward ACTION. When a request has a reasonable default interpretation, act on it \ +now — do not stall the task with a clarifying question. A delivered result the user can \ +correct beats a question that makes them wait. +- STATE YOUR ASSUMPTIONS AS YOU MAKE THEM. Whenever you proceed on an assumption — a \ +default value, an interpretation of a vague request, a chosen approach, or a scope you \ +picked — say so plainly in the SAME response, right as you act, not buried at the end. \ +Phrase it like "Assuming you mean X (the common case), so I'll…" or "Going with monthly \ +granularity since you didn't specify." Surface each assumption as it happens so the user \ +can redirect mid-flight instead of being blocked up front. Acting silently is wrong; \ +acting out loud with your assumptions visible is right. +- Only STOP and ASK when acting on a guess would be costly to undo or is genuinely \ +unknowable: destructive or irreversible actions (deleting data, spending money, sending \ +messages on the user's behalf), credentials or access you can't obtain, or a fork where \ +the options lead to materially different results and you have no basis to choose. 
Then ask \ +ONE tight question — and when you ask, STOP and WAIT for the reply; never ask and act in \ +the same turn, that skips their answer. +- When the user gives a vague answer (like "yeah", "the current one", "sure"), interpret \ +it in context of what you just asked. Do not ask them to repeat themselves. +- Don't front-load a questionnaire. Prefer acting on sensible defaults (stated out loud) \ +over interrogating the user; if something truly gates the work, ask at most 1-2 things.""" + +CONVERSATION_DISCIPLINE_ASK_FIRST = """CONVERSATION DISCIPLINE (critical): +- If you ask the user a question, STOP and WAIT for their reply. Never ask a question \ +and then act in the same turn — that skips the user's answer. +- Only act when you have ALL the information you need. If you're unsure \ +about anything, ask first, then act in a LATER turn after receiving the answer. +- When the user gives a vague answer (like "yeah", "the current one", "sure"), interpret \ +it in context of what you just asked. Do not ask them to repeat themselves. +- Gather requirements incrementally through conversation. 
Do not front-load every \ +possible question at once — ask 1-3 at a time, then follow up.""" + + # --------------------------------------------------------------------------- # Artifact contract — universal entry point for any user-facing output # --------------------------------------------------------------------------- diff --git a/anton/core/runtime.py b/anton/core/runtime.py index 5f10f510..82783506 100644 --- a/anton/core/runtime.py +++ b/anton/core/runtime.py @@ -185,6 +185,7 @@ async def build_chat_session( history_store=history_store, session_id=session_id, proactive_dashboards=settings.proactive_dashboards, + act_first=settings.act_first, tools=list(extra_tools) if extra_tools else [], ) return ChatSession(config) diff --git a/anton/core/session.py b/anton/core/session.py index 58abf956..62f19e61 100644 --- a/anton/core/session.py +++ b/anton/core/session.py @@ -112,6 +112,10 @@ class ChatSessionConfig: # host didn't identify itself. harness: str | None = None proactive_dashboards: bool = False + # When True (default), Anton acts on reasonable defaults and surfaces its + # assumptions inline instead of stopping to ask ("do first, ask later"). + # When False, it falls back to the cautious ask-first discipline. + act_first: bool = True tools: list[ToolDef] = field(default_factory=list) output_dir: str = ".anton/output" # Web tools — on by default. 
Each is independently resolved at session @@ -145,6 +149,7 @@ def __init__(self, config: ChatSessionConfig) -> None: self._system_prompt_context = config.system_prompt_context self._output_dir = config.output_dir self._proactive_dashboards = config.proactive_dashboards + self._act_first = config.act_first self._extra_tools = config.tools self._workspace = config.workspace self._data_vault = config.data_vault @@ -565,6 +570,7 @@ async def _build_system_prompt(self, user_message: str = "") -> str: current_datetime=_current_datetime, system_prompt_context=self._system_prompt_context, proactive_dashboards=self._proactive_dashboards, + act_first=self._act_first, output_dir=self._output_dir, tool_defs=self.tool_registry.get_tool_defs(), memory_context=memory_section, From aed3456386c9de23f7648ad0eb075db0b3b9d647 Mon Sep 17 00:00:00 2001 From: Jorge Torres Date: Thu, 18 Jun 2026 15:45:53 -0700 Subject: [PATCH 6/9] prompt: add two execution-discipline rules - if a scratchpad cell errors the same way twice, change strategy (don't re-run the same code) - validate output before claiming a task is done; report what was verified Co-Authored-By: Claude Opus 4.8 (1M context) --- anton/core/llm/prompts.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/anton/core/llm/prompts.py b/anton/core/llm/prompts.py index e671e28d..3ae96a9c 100644 --- a/anton/core/llm/prompts.py +++ b/anton/core/llm/prompts.py @@ -177,6 +177,8 @@ different data sources for the same information, caching/retrying with backoff, etc. - Exhaust at least 2-3 genuinely different approaches before involving the user. Each \ attempt should be a meaningfully different strategy — not just retrying the same thing. +- If a scratchpad cell errors the same way twice, change strategy — don't re-run the \ +same code expecting a different result. 
- Only ask the user for things that truly require them: credentials they haven't shared, \ ambiguous requirements you can't infer, access to private/internal systems, or a choice \ between equally valid options. @@ -184,6 +186,9 @@ so the user has full context and doesn't suggest things you've already done. GENERAL RULES: +- Validate your output before claiming the task is done — actually check the result \ +(inspect the data, run it, confirm the file/artifact exists and looks right) instead of \ +assuming it worked. Report what you verified, not what you intended. - Be conversational, concise, and direct. No filler. No bullet-point dumps unless asked. - Respond naturally to greetings, small talk, and follow-up questions. - When describing yourself, focus on problem-solving and collaboration — not listing \ From 14e12ef2274521d641929cf840097ba3e2d420ec Mon Sep 17 00:00:00 2001 From: Jorge Torres Date: Thu, 18 Jun 2026 16:03:55 -0700 Subject: [PATCH 7/9] =?UTF-8?q?prompt:=20cache-stable=20assembly=20(2a)=20?= =?UTF-8?q?=E2=80=94=20task-anchored=20date=20+=20memory=20at=20tail?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the system-prompt prefix byte-stable across a task's turns so providers can prefix-cache it (and so behavior is deterministic): - date is task-anchored + date-only (ChatSessionConfig.clock, e.g. the conversation's created_at) instead of a per-turn minute clock; - the relevance-filtered memory snapshot moves to the very end (volatile tail) so it never invalidates the stable content above it. Stacked on the act_first branch. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- anton/core/llm/prompt_builder.py | 11 +++++++++-- anton/core/session.py | 17 +++++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/anton/core/llm/prompt_builder.py b/anton/core/llm/prompt_builder.py index 232cf526..69535849 100644 --- a/anton/core/llm/prompt_builder.py +++ b/anton/core/llm/prompt_builder.py @@ -168,8 +168,8 @@ def build( if tool_prompts: prompt += tool_prompts - if memory_context: - prompt += memory_context + # Stable, per-session content goes before the volatile tail so the + # prefix stays cache-stable across turns. if project_context: prompt += project_context if self_awareness_context: @@ -185,6 +185,13 @@ def build( if suffix: prompt += f"\n\n{suffix}" + # Volatile tail — LAST so everything above can be cached. The memory + # snapshot is relevance-filtered per user message, so it changes every + # turn; keeping it at the very end means it never invalidates the + # cacheable prefix above it. + if memory_context: + prompt += memory_context + return prompt diff --git a/anton/core/session.py b/anton/core/session.py index 62f19e61..7bac408e 100644 --- a/anton/core/session.py +++ b/anton/core/session.py @@ -3,6 +3,7 @@ import asyncio from collections.abc import AsyncIterator, Callable from dataclasses import asdict, dataclass, field +from datetime import datetime import json import re from typing import TYPE_CHECKING, List @@ -124,6 +125,12 @@ class ChatSessionConfig: # (registered on the tool registry). See ChatSession.__init__. web_search_enabled: bool = True web_fetch_enabled: bool = True + # Stable "as of" timestamp for the system prompt. The host passes the + # task's anchor (e.g. the conversation's created_at) so the date is + # byte-identical across every turn of a task — keeping the system-prompt + # prefix cacheable (a per-turn wall clock would bust the cache each turn). + # None → fall back to today's date. 
+ clock: datetime | None = None class ChatSession: @@ -150,6 +157,7 @@ def __init__(self, config: ChatSessionConfig) -> None: self._output_dir = config.output_dir self._proactive_dashboards = config.proactive_dashboards self._act_first = config.act_first + self._clock = config.clock self._extra_tools = config.tools self._workspace = config.workspace self._data_vault = config.data_vault @@ -541,8 +549,13 @@ def _record_cell_explainability( async def _build_system_prompt(self, user_message: str = "") -> str: import datetime as _dt - _now = _dt.datetime.now() - _current_datetime = _now.strftime("%A, %B %d, %Y at %I:%M %p") + # Task-anchored, date-only stamp. Using the task's anchor (self._clock, + # e.g. the conversation's created_at) keeps this byte-identical across + # every turn so the system-prompt prefix stays cache-stable; a per-turn + # wall clock with minute precision would invalidate the cache each turn. + # The agent fetches precise time via a tool when it actually needs it. + _now = self._clock or _dt.datetime.now() + _current_datetime = _now.strftime("%A, %B %d, %Y") # Inject memory context (replaces old self_awareness) memory_section = "" From 3bcaaf229441cbb7d1d712faf0cf640603293f32 Mon Sep 17 00:00:00 2001 From: Jorge Torres Date: Thu, 18 Jun 2026 20:40:44 -0700 Subject: [PATCH 8/9] compaction: structured, reference-only, in-place-updated summary (3b) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework _summarize_history: - 3b-light: frame the summary as REFERENCE ONLY (latest user message wins; don't resume superseded/cancelled work) — protects Anton's auto-continue verifier from resurrecting stale tasks after a compaction. - 3b-full: emit a structured STATE RECORD (Goal/Constraints/Completed/Active state/Blocked/Decisions/Remaining) instead of freeform bullets, and UPDATE a prior summary in place (via a sentinel marker) rather than summarizing a summary, so 'Remaining' work survives across compactions. 
Stacked on the cache-stable-prompt chain. Co-Authored-By: Claude Opus 4.8 (1M context) --- anton/core/session.py | 69 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/anton/core/session.py b/anton/core/session.py index 7bac408e..caa87ae1 100644 --- a/anton/core/session.py +++ b/anton/core/session.py @@ -60,6 +60,11 @@ from anton.core.settings import CoreSettings +# Sentinel prefixing a compacted-history summary so later compactions can +# recognize and update it in place rather than summarize a summary. +_COMPACTED_MARKER = "[COMPACTED CONTEXT — REFERENCE ONLY]" + + if TYPE_CHECKING: from rich.console import Console from anton.context.self_awareness import SelfAwarenessContext @@ -782,12 +787,18 @@ async def _summarize_history(self) -> None: old_turns = self._history[:split] recent_turns = self._history[split:] - # Serialize old turns into text for summarization + # Serialize old turns. Pull out any prior compacted summary so we + # UPDATE it in place rather than summarize a summary (which compounds + # loss every compaction). + prior_summary = "" lines: list[str] = [] for msg in old_turns: role = msg.get("role", "unknown") content = msg.get("content", "") if isinstance(content, str): + if content.lstrip().startswith(_COMPACTED_MARKER): + prior_summary = content + continue lines.append(f"[{role}]: {content[:2000]}") elif isinstance(content, list): for block in content: @@ -808,17 +819,40 @@ async def _summarize_history(self) -> None: if len(old_text) > 8000: old_text = old_text[:8000] + "\n... 
(truncated)" + if prior_summary: + user_content = ( + "PREVIOUS SUMMARY (update this in place — merge the new turns into it, " + "don't restate it verbatim):\n" + f"{prior_summary}\n\n" + "NEW TURNS TO FOLD IN:\n" + f"{old_text}" + ) + else: + user_content = old_text + try: + # 3b-full: a structured, in-place-updated STATE RECORD rather than a + # freeform blob — so "Remaining" work survives compaction instead of + # being flattened into prose. summary_response = await self._llm.code( system=( - "Summarize this conversation history concisely. Preserve:\n" - "- Key decisions and conclusions\n" - "- Important data/results discovered\n" - "- Variable names and values that are still relevant\n" - "- Errors encountered and how they were resolved\n" - "Keep it under 2000 tokens. Use bullet points." + "You compact an agent's earlier conversation into a terse, factual " + "STATE RECORD (not prose). Output only these sections, omitting any " + "that are empty:\n" + "## Goal — what the user ultimately wants\n" + "## Constraints — explicit requirements / preferences / do-nots\n" + "## Completed — work already done, each as `action → outcome`\n" + "## Active state — variables, data, files/artifacts in play and their " + "current values or paths\n" + "## Blocked — anything stuck and why\n" + "## Decisions — choices made and the reason\n" + "## Remaining — what is still left to do\n\n" + "If a PREVIOUS SUMMARY is provided, update it with the new turns " + "instead of starting over. If the user changed direction, narrowed " + "scope, or cancelled something, reflect that — drop superseded items " + "from Remaining, don't keep them. Keep it under ~2000 tokens." 
), - messages=[{"role": "user", "content": old_text}], + messages=[{"role": "user", "content": user_content}], max_tokens=2048, ) summary = summary_response.content or "(summary unavailable)" @@ -826,17 +860,26 @@ async def _summarize_history(self) -> None: # If summarization fails, just do a simple truncation summary = f"(Earlier conversation with {len(old_turns)} turns — summarization failed)" - summary_msg = { - "role": "user", - "content": f"[Context summary of earlier conversation]\n{summary}", - } + # 3b-light: reference-only framing so the model treats this as compacted + # history, not a fresh instruction, and never resumes superseded/cancelled + # work after a compaction (which Anton's auto-continue verifier would + # otherwise be nudged to do). + summary_body = ( + f"{_COMPACTED_MARKER}\n" + "Compacted record of earlier conversation, for REFERENCE ONLY — not a new " + "request. The most recent user message takes priority; if the user changed " + "direction, narrowed scope, or cancelled something, follow that and do NOT " + "resume superseded work described below.\n\n" + f"{summary}" + ) + summary_msg = {"role": "user", "content": summary_body} # If the recent portion starts with a user message, insert a minimal # assistant separator to avoid consecutive user messages (API error). 
if recent_turns and recent_turns[0].get("role") == "user": self._history = [ summary_msg, - {"role": "assistant", "content": "Understood."}, + {"role": "assistant", "content": "Understood — using that as reference."}, *recent_turns, ] else: From 84b2dd1c30a52494330803f687de2e45ec3ba636 Mon Sep 17 00:00:00 2001 From: Jorge Torres Date: Fri, 19 Jun 2026 16:41:34 -0700 Subject: [PATCH 9/9] prompt: split 'started' (cached) from live 'now' (volatile tail) + summary keeps dates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR feedback: anchoring the date to created_at but labeling it 'current' meant a conversation resumed days/weeks later reported the wrong current time. Fix: - prefix carries a FIXED 'Conversation started: {date}' line (cache-stable); - the real wall clock is emitted in the volatile tail ('Current date and time: …'), recomputed each turn, so it's always accurate and never busts the cached prefix; - rename ChatSessionConfig.clock → started_at to match; - the 3b summarizer now preserves key event dates so the timeline survives compaction (per-message timestamps come from the harness embedding each message's created_at). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- anton/core/llm/prompt_builder.py | 16 +++++++++----- anton/core/llm/prompts.py | 2 +- anton/core/session.py | 36 +++++++++++++++++++------------- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/anton/core/llm/prompt_builder.py b/anton/core/llm/prompt_builder.py index 69535849..9a4f0c0e 100644 --- a/anton/core/llm/prompt_builder.py +++ b/anton/core/llm/prompt_builder.py @@ -126,6 +126,7 @@ def _build_visualizations_section( def build( self, *, + conversation_started: str, current_datetime: str, system_prompt_context: SystemPromptContext, proactive_dashboards: bool, @@ -159,7 +160,7 @@ def build( artifacts_section=ARTIFACTS_PROMPT, visualizations_section=visualizations_section, conversation_discipline=conversation_discipline, - current_datetime=current_datetime, + conversation_started=conversation_started, ) prompt += "\n\n" + BACKEND_GENERATION_PROMPT.format(output_dir=output_dir) @@ -185,10 +186,15 @@ def build( if suffix: prompt += f"\n\n{suffix}" - # Volatile tail — LAST so everything above can be cached. The memory - # snapshot is relevance-filtered per user message, so it changes every - # turn; keeping it at the very end means it never invalidates the - # cacheable prefix above it. + # Volatile tail — LAST so everything above can be cached. The live + # clock and the relevance-filtered memory snapshot both change every + # turn, so they sit after the cache-stable prefix and never invalidate + # it. (The prefix carries only the fixed "conversation started" stamp.) 
+ prompt += ( + f"\n\nCurrent date and time: {current_datetime}\n" + "(Earlier messages are prefixed with the time they were sent; that " + "bracketed timestamp is metadata, not part of the message text.)" + ) if memory_context: prompt += memory_context diff --git a/anton/core/llm/prompts.py b/anton/core/llm/prompts.py index 3ae96a9c..558ac2cd 100644 --- a/anton/core/llm/prompts.py +++ b/anton/core/llm/prompts.py @@ -7,7 +7,7 @@ solve problems. You are NOT a code assistant or chatbot. You are a coworker with a \ computer, and you use that computer to get things done. -Current date and time: {current_datetime} +Conversation started: {conversation_started} WHO YOU ARE: - You solve problems — not just write code. If someone needs emails classified, data \ diff --git a/anton/core/session.py b/anton/core/session.py index caa87ae1..2074b53b 100644 --- a/anton/core/session.py +++ b/anton/core/session.py @@ -130,12 +130,13 @@ class ChatSessionConfig: # (registered on the tool registry). See ChatSession.__init__. web_search_enabled: bool = True web_fetch_enabled: bool = True - # Stable "as of" timestamp for the system prompt. The host passes the - # task's anchor (e.g. the conversation's created_at) so the date is - # byte-identical across every turn of a task — keeping the system-prompt - # prefix cacheable (a per-turn wall clock would bust the cache each turn). - # None → fall back to today's date. - clock: datetime | None = None + # When the task (conversation) was created. Rendered as a fixed + # "Conversation started: …" line in the cache-stable prompt prefix — it + # never changes across turns, so it doesn't bust the prefix cache. The + # LIVE current time goes in the volatile tail instead (see _build_system_prompt), + # so resuming a conversation days later still reports the real "now". + # None → fall back to today. 
+ started_at: datetime | None = None class ChatSession: @@ -162,7 +163,7 @@ def __init__(self, config: ChatSessionConfig) -> None: self._output_dir = config.output_dir self._proactive_dashboards = config.proactive_dashboards self._act_first = config.act_first - self._clock = config.clock + self._started_at = config.started_at self._extra_tools = config.tools self._workspace = config.workspace self._data_vault = config.data_vault @@ -554,13 +555,16 @@ def _record_cell_explainability( async def _build_system_prompt(self, user_message: str = "") -> str: import datetime as _dt - # Task-anchored, date-only stamp. Using the task's anchor (self._clock, - # e.g. the conversation's created_at) keeps this byte-identical across - # every turn so the system-prompt prefix stays cache-stable; a per-turn - # wall clock with minute precision would invalidate the cache each turn. - # The agent fetches precise time via a tool when it actually needs it. - _now = self._clock or _dt.datetime.now() - _current_datetime = _now.strftime("%A, %B %d, %Y") + # Two stamps, deliberately split for cache-stability AND correctness: + # • conversation_started — the task's creation time (self._started_at), + # a FIXED fact rendered in the cache-stable prefix; identical every + # turn so it never busts the prefix cache. + # • current_datetime — the real wall clock, rendered in the VOLATILE + # tail (after the cached prefix) so it's always accurate even when a + # conversation is resumed days/weeks later, without touching the cache. 
+ _started = self._started_at or _dt.datetime.now() + _conversation_started = _started.strftime("%A, %B %d, %Y") + _current_datetime = _dt.datetime.now().strftime("%A, %B %d, %Y at %I:%M %p") # Inject memory context (replaces old self_awareness) memory_section = "" @@ -585,6 +589,7 @@ async def _build_system_prompt(self, user_message: str = "") -> str: prompt_builder = ChatSystemPromptBuilder() prompt = prompt_builder.build( + conversation_started=_conversation_started, current_datetime=_current_datetime, system_prompt_context=self._system_prompt_context, proactive_dashboards=self._proactive_dashboards, @@ -847,6 +852,9 @@ async def _summarize_history(self) -> None: "## Blocked — anything stuck and why\n" "## Decisions — choices made and the reason\n" "## Remaining — what is still left to do\n\n" + "Preserve the date/time of key events when it matters (e.g. " + "`Completed (2026-06-05): …`) — the raw per-message timestamps are " + "gone after compaction, so keep the ones that anchor the timeline.\n" "If a PREVIOUS SUMMARY is provided, update it with the new turns " "instead of starting over. If the user changed direction, narrowed " "scope, or cancelled something, reflect that — drop superseded items "