diff --git a/.agents/.backup-pre-008-1777287106694-2334/oma-config.yaml b/.agents/.backup-pre-008-1777287106694-2334/oma-config.yaml new file mode 100644 index 0000000..e23d0cd --- /dev/null +++ b/.agents/.backup-pre-008-1777287106694-2334/oma-config.yaml @@ -0,0 +1,24 @@ +# User Preferences (Optional) +# Project-specific user configuration +# +# This file is optional. Works with defaults if not present. +# CLI priority: --vendor arg > agent_cli_mapping > default_cli > cli-config.yaml's active_vendor > gemini + +# Response language setting (ko, en, ja, zh, ...) +language: ko + +# Date/time format +date_format: ISO +timezone: Asia/Seoul + +# Default CLI (for single tasks) +default_cli: gemini + +# Per-agent CLI mapping (multi-CLI mode) +agent_cli_mapping: + frontend: gemini + backend: gemini + mobile: gemini + qa: gemini + debug: gemini + pm: gemini diff --git a/.agents/agents/architecture-reviewer.md b/.agents/agents/architecture-reviewer.md index 9ffde1a..0c7bed6 100644 --- a/.agents/agents/architecture-reviewer.md +++ b/.agents/agents/architecture-reviewer.md @@ -13,6 +13,8 @@ Follow the vendor-specific execution protocol: - Write results to project root `.agents/results/result-architecture.md` (orchestrated: `result-architecture-{sessionId}.md`) - Include: status, recommendation summary, tradeoffs, risks, validation steps, artifacts created + + ## Charter Preflight (MANDATORY) Before ANY recommendations or structural edits, output this block: @@ -29,6 +31,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT change architecture or code + ## Rules diff --git a/.agents/agents/backend-engineer.md b/.agents/agents/backend-engineer.md index 49def07..4fec262 100644 --- a/.agents/agents/backend-engineer.md +++ b/.agents/agents/backend-engineer.md @@ -13,6 +13,8 @@ Follow the vendor-specific execution protocol: - Write results to project root `.agents/results/result-backend.md` (orchestrated: `result-backend-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist + + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -29,6 +31,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT write code + ## Architecture diff --git a/.agents/agents/db-engineer.md b/.agents/agents/db-engineer.md index eabbaa1..d7b36af 100644 --- a/.agents/agents/db-engineer.md +++ b/.agents/agents/db-engineer.md @@ -13,6 +13,8 @@ Follow the vendor-specific execution protocol: - Write results to project root `.agents/results/result-db.md` (orchestrated: `result-db-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist + + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -25,6 +27,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` + ## Rules diff --git a/.agents/agents/debug-investigator.md b/.agents/agents/debug-investigator.md index 8c14b04..538f260 100644 --- a/.agents/agents/debug-investigator.md +++ b/.agents/agents/debug-investigator.md @@ -13,6 +13,8 @@ Follow the vendor-specific execution protocol: - Write results to project root `.agents/results/result-debug.md` (orchestrated: `result-debug-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist + + ## Charter Preflight (MANDATORY) Before ANY code changes, output this 
block: @@ -29,6 +31,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT write code + ## Diagnosis Process diff --git a/.agents/agents/frontend-engineer.md b/.agents/agents/frontend-engineer.md index d0c8ae2..ba0e3cf 100644 --- a/.agents/agents/frontend-engineer.md +++ b/.agents/agents/frontend-engineer.md @@ -13,6 +13,8 @@ Follow the vendor-specific execution protocol: - Write results to project root `.agents/results/result-frontend.md` (orchestrated: `result-frontend-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist + + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -25,6 +27,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` + ## Architecture diff --git a/.agents/agents/mobile-engineer.md b/.agents/agents/mobile-engineer.md index eabdff8..dce9e50 100644 --- a/.agents/agents/mobile-engineer.md +++ b/.agents/agents/mobile-engineer.md @@ -13,6 +13,8 @@ Follow the vendor-specific execution protocol: - Write results to project root `.agents/results/result-mobile.md` (orchestrated: `result-mobile-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist + + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -25,6 +27,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` + ## Architecture diff --git a/.agents/agents/pm-planner.md b/.agents/agents/pm-planner.md index ebf3147..a37b5a2 100644 --- a/.agents/agents/pm-planner.md +++ b/.agents/agents/pm-planner.md @@ -13,6 +13,8 @@ Follow the vendor-specific execution protocol: - Write results to project root `.agents/results/result-pm.md` (orchestrated: `result-pm-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist + + ## Charter Preflight (MANDATORY) Before ANY planning work, output this block: @@ -29,6 +31,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT proceed + ## Planning Process diff --git a/.agents/agents/qa-reviewer.md b/.agents/agents/qa-reviewer.md index 7041076..70404e8 100644 --- a/.agents/agents/qa-reviewer.md +++ b/.agents/agents/qa-reviewer.md @@ -13,6 +13,8 @@ Follow the vendor-specific execution protocol: - Write results to project root `.agents/results/result-qa.md` (orchestrated: `result-qa-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist + + ## Charter Preflight (MANDATORY) Before starting review, output this block: @@ -25,6 +27,7 @@ CHARTER_CHECK: - Must NOT do: modify source code, skip severity levels, report unverified findings - Success criteria: {all files reviewed, findings with file:line references} ``` + ## Review Priority Order diff --git a/.agents/agents/tf-infra-engineer.md b/.agents/agents/tf-infra-engineer.md index 1c99e1c..88734a2 100644 --- a/.agents/agents/tf-infra-engineer.md +++ b/.agents/agents/tf-infra-engineer.md @@ -13,6 +13,8 @@ Follow the vendor-specific execution protocol: - Write results to project root `.agents/results/result-tf-infra.md` (orchestrated: `result-tf-infra-{sessionId}.md`) - Include: status, summary, files changed, validation results, plan/apply notes, acceptance checklist + + ## Charter Preflight (MANDATORY) Before ANY infrastructure changes, output this block: @@ -29,6 +31,7 @@ 
CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT apply destructive changes + ## Rules diff --git a/.agents/agents/variants/agent-variant.schema.json b/.agents/agents/variants/agent-variant.schema.json index 39b8ecb..224c1f3 100644 --- a/.agents/agents/variants/agent-variant.schema.json +++ b/.agents/agents/variants/agent-variant.schema.json @@ -3,7 +3,14 @@ "title": "Agent Variant Configuration", "description": "Configuration for vendor-specific agent generation from core agent prompts", "type": "object", - "required": ["vendor", "destDir", "modelDefault", "toolsDefault", "protocolPath", "agents"], + "required": [ + "vendor", + "destDir", + "modelDefault", + "toolsDefault", + "protocolPath", + "agents" + ], "properties": { "$schema": { "type": "string" diff --git a/.agents/agents/variants/gemini.json b/.agents/agents/variants/gemini.json index 382cb1a..0ce20f3 100644 --- a/.agents/agents/variants/gemini.json +++ b/.agents/agents/variants/gemini.json @@ -2,7 +2,7 @@ "$schema": "./agent-variant.schema.json", "vendor": "gemini", "destDir": ".gemini/agents", - "modelDefault": "gemini-3-flash-preview", + "modelDefault": "gemini-3-flash", "toolsDefault": ["bash", "glob", "grep", "read", "edit", "write", "ask"], "protocolPath": ".agents/skills/_shared/runtime/execution-protocols/gemini.md", "agents": { diff --git a/.agents/config/defaults.yaml b/.agents/config/defaults.yaml new file mode 100644 index 0000000..0b61051 --- /dev/null +++ b/.agents/config/defaults.yaml @@ -0,0 +1,101 @@ +# Profile B defaults (benchmark-leader assignments) +# Generated: 2026-04-23 | Session: session-20260423-141500 +# Claude roles omit effort (cli-session managed). +# +# ⚠ This file is a single source of truth (SSOT) shipped with oh-my-agent. +# Do NOT edit it directly. 
To customize behavior, use one of: +# - .agents/oma-config.yaml (agent_cli_mapping, session.quota_cap) +# - .agents/config/models.yaml (add or override model slugs) +# To receive newer Profile B defaults in future releases, run: +# oma install --update-defaults + +version: "2.1.0" + +agent_defaults: + orchestrator: { model: "anthropic/claude-sonnet-4-6" } + architecture: { model: "anthropic/claude-opus-4-7" } + qa: { model: "anthropic/claude-sonnet-4-6" } + pm: { model: "anthropic/claude-sonnet-4-6" } + backend: { model: "openai/gpt-5.3-codex", effort: "high" } + frontend: { model: "openai/gpt-5.4", effort: "high" } + mobile: { model: "openai/gpt-5.4", effort: "high" } + db: { model: "openai/gpt-5.3-codex", effort: "high" } + debug: { model: "openai/gpt-5.3-codex", effort: "high" } + tf-infra: { model: "openai/gpt-5.4", effort: "high" } + retrieval: { model: "google/gemini-3.1-flash-lite" } + +runtime_profiles: + claude-only: + description: "Claude-only — Max subscription holders" + agent_defaults: + orchestrator: { model: "anthropic/claude-sonnet-4-6" } + architecture: { model: "anthropic/claude-opus-4-7" } + qa: { model: "anthropic/claude-sonnet-4-6" } + pm: { model: "anthropic/claude-sonnet-4-6" } + backend: { model: "anthropic/claude-sonnet-4-6" } + frontend: { model: "anthropic/claude-sonnet-4-6" } + mobile: { model: "anthropic/claude-sonnet-4-6" } + db: { model: "anthropic/claude-sonnet-4-6" } + debug: { model: "anthropic/claude-sonnet-4-6" } + tf-infra: { model: "anthropic/claude-sonnet-4-6" } + retrieval: { model: "anthropic/claude-haiku-4-5" } + + codex-only: + description: "Codex-only — ChatGPT Plus/Pro" + agent_defaults: + orchestrator: { model: "openai/gpt-5.4", effort: "medium" } + architecture: { model: "openai/gpt-5.4-pro", effort: "high" } + qa: { model: "openai/gpt-5.4", effort: "high" } + pm: { model: "openai/gpt-5.4", effort: "medium" } + backend: { model: "openai/gpt-5.3-codex", effort: "high" } + frontend: { model: "openai/gpt-5.4", effort: "high" } + mobile: { model: "openai/gpt-5.4", effort: "high" } + db: { model: "openai/gpt-5.3-codex", effort: "high" } + debug: { model: "openai/gpt-5.3-codex", effort: "high" } + tf-infra: { model: "openai/gpt-5.4", effort: "high" } + retrieval: { model: "openai/gpt-5.4-mini", effort: "low" } + + gemini-only: + description: "Gemini-only — Google AI Pro" + agent_defaults: + orchestrator: { model: "google/gemini-3-flash" } + architecture: { model: "google/gemini-3.1-pro-preview", thinking: true } + qa: { model: "google/gemini-3-flash", thinking: true } + pm: { model: "google/gemini-3-flash" } + backend: { model: "google/gemini-3-flash", thinking: true } + frontend: { model: "google/gemini-3-flash", thinking: true } + mobile: { model: "google/gemini-3-flash", thinking: true } + db: { model: "google/gemini-3-flash", thinking: true } + debug: { model: "google/gemini-3-flash", thinking: true } + tf-infra: { model: "google/gemini-3-flash", thinking: true } + retrieval: { model: "google/gemini-3.1-flash-lite" } + + antigravity: + description: "Antigravity IDE — all impl roles fall back to external subprocess (handled at dispatch layer)" + agent_defaults: + orchestrator: { model: "anthropic/claude-sonnet-4-6" } + architecture: { model: "anthropic/claude-opus-4-7" } + qa: { model: "anthropic/claude-sonnet-4-6" } + pm: { model: "anthropic/claude-sonnet-4-6" } + backend: { model: "openai/gpt-5.3-codex", effort: "high" } + frontend: { model: "openai/gpt-5.4", effort: "high" } + mobile: { model: "openai/gpt-5.4", effort: "high" } + db: { model: 
"openai/gpt-5.3-codex", effort: "high" } + debug: { model: "openai/gpt-5.3-codex", effort: "high" } + tf-infra: { model: "openai/gpt-5.4", effort: "high" } + retrieval: { model: "google/gemini-3.1-flash-lite" } + + qwen-only: + description: "Qwen Code — all agents routed external (no native parallel); Qwen has no --effort, only binary --thinking" + agent_defaults: + orchestrator: { model: "qwen/qwen3-coder-next", thinking: false } + architecture: { model: "qwen/qwen3-coder-plus", thinking: true } + qa: { model: "qwen/qwen3-coder-plus", thinking: true } + pm: { model: "qwen/qwen3-coder-next", thinking: false } + backend: { model: "qwen/qwen3-coder-plus", thinking: true } + frontend: { model: "qwen/qwen3-coder-plus", thinking: true } + mobile: { model: "qwen/qwen3-coder-plus", thinking: true } + db: { model: "qwen/qwen3-coder-plus", thinking: true } + debug: { model: "qwen/qwen3-coder-plus", thinking: true } + tf-infra: { model: "qwen/qwen3-coder-plus", thinking: true } + retrieval: { model: "qwen/qwen3-coder-next", thinking: false } diff --git a/.agents/hooks/core/hud.ts b/.agents/hooks/core/hud.ts index 597b95c..73f0ac4 100644 --- a/.agents/hooks/core/hud.ts +++ b/.agents/hooks/core/hud.ts @@ -9,163 +9,166 @@ * stdout: ANSI-colored status text */ -import { existsSync, readdirSync, readFileSync } from "node:fs" -import { join } from "node:path" -import type { ModeState } from "./types.ts" +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import type { ModeState } from "./types.ts"; // ── ANSI Colors ─────────────────────────────────────────────── -const dim = (s: string) => `\x1b[2m${s}\x1b[22m` -const bold = (s: string) => `\x1b[1m${s}\x1b[22m` -const green = (s: string) => `\x1b[32m${s}\x1b[39m` -const yellow = (s: string) => `\x1b[33m${s}\x1b[39m` -const red = (s: string) => `\x1b[31m${s}\x1b[39m` -const cyan = (s: string) => `\x1b[36m${s}\x1b[39m` +const dim = (s: string) => `\x1b[2m${s}\x1b[22m`; +const bold = (s: string) => `\x1b[1m${s}\x1b[22m`; +const green = (s: string) => `\x1b[32m${s}\x1b[39m`; +const yellow = (s: string) => `\x1b[33m${s}\x1b[39m`; +const red = (s: string) => `\x1b[31m${s}\x1b[39m`; +const cyan = (s: string) => `\x1b[36m${s}\x1b[39m`; function colorByThreshold(value: number, text: string): string { - if (value >= 85) return red(text) - if (value >= 70) return yellow(text) - return green(text) + if (value >= 85) return red(text); + if (value >= 70) return yellow(text); + return green(text); } // ── Stdin Parsing ───────────────────────────────────────────── interface RateLimit { - used_percentage?: number - resets_at?: string + used_percentage?: number; + resets_at?: string; } interface StatuslineStdin { - cwd?: string - model?: { id?: string; display_name?: string } + cwd?: string; + model?: { id?: string; display_name?: string }; context_window?: { - context_window_size?: number - used_percentage?: number - } + context_window_size?: number; + used_percentage?: number; + }; cost?: { - total_cost_usd?: number - total_lines_added?: number - total_lines_removed?: number - total_duration_ms?: number - } + total_cost_usd?: number; + total_lines_added?: number; + total_lines_removed?: number; + total_duration_ms?: number; + }; rate_limits?: { - five_hour?: RateLimit - seven_day?: RateLimit - } + five_hour?: RateLimit; + seven_day?: RateLimit; + }; } function readStdin(): StatuslineStdin { try { - return JSON.parse(readFileSync("/dev/stdin", "utf-8")) + return JSON.parse(readFileSync("/dev/stdin", "utf-8")); } catch { 
- return {} + return {}; } } // ── Active Workflow Detection ───────────────────────────────── function getActiveWorkflow(projectDir: string): ModeState | null { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return null + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return null; try { for (const file of readdirSync(stateDir)) { - if (!file.endsWith(".json") || !file.includes("-state-")) continue - const content = readFileSync(join(stateDir, file), "utf-8") - const state: ModeState = JSON.parse(content) + if (!file.endsWith(".json") || !file.includes("-state-")) continue; + const content = readFileSync(join(stateDir, file), "utf-8"); + const state: ModeState = JSON.parse(content); // Skip stale (>2h) - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - if (elapsed > 2 * 60 * 60 * 1000) continue + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + if (elapsed > 2 * 60 * 60 * 1000) continue; - return state + return state; } } catch { // ignore } - return null + return null; } // ── Model Name Shortener ────────────────────────────────────── function shortModel(model?: { id?: string; display_name?: string }): string { - const name = model?.display_name || model?.id || "" - if (!name) return "" + const name = model?.display_name || model?.id || ""; + if (!name) return ""; // "Claude Opus 4.6 (1M context)" → "Opus 4.6" - const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i) - if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}` - return name.split("/").pop()?.slice(0, 15) || "" + const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i); + if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}`; + return name.split("/").pop()?.slice(0, 15) || ""; } // ── Rate Limit Helpers ─────────────────────────────────────── function formatCountdown(resetsAt: string): string { - const remaining = new Date(resetsAt).getTime() - Date.now() - if (remaining <= 0) return "" - const h = Math.floor(remaining / 3_600_000) - const m = Math.floor((remaining % 3_600_000) / 60_000) - return h > 0 ? `${h}h${m}m` : `${m}m` + const remaining = new Date(resetsAt).getTime() - Date.now(); + if (remaining <= 0) return ""; + const h = Math.floor(remaining / 3_600_000); + const m = Math.floor((remaining % 3_600_000) / 60_000); + return h > 0 ? `${h}h${m}m` : `${m}m`; } function formatRateLimit(label: string, rl?: RateLimit): string | null { - if (!rl || rl.used_percentage == null) return null - const pct = Math.round(rl.used_percentage) - const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : "" - const text = countdown ? `${label}:${pct}%(${countdown})` : `${label}:${pct}%` - return colorByThreshold(pct, text) + if (!rl || rl.used_percentage == null) return null; + const pct = Math.round(rl.used_percentage); + const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : ""; + const text = countdown + ? `${label}:${pct}%(${countdown})` + : `${label}:${pct}%`; + return colorByThreshold(pct, text); } // ── Main ────────────────────────────────────────────────────── function main() { - const input = readStdin() - const projectDir = process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd() - const parts: string[] = [] + const input = readStdin(); + const projectDir = + process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd(); + const parts: string[] = []; // 1. OMA label - parts.push(bold(cyan("[OMA]"))) + parts.push(bold(cyan("[OMA]"))); // 2. 
Model - const model = shortModel(input.model) - if (model) parts.push(dim(model)) + const model = shortModel(input.model); + if (model) parts.push(dim(model)); // 3. Context % - const ctxPct = input.context_window?.used_percentage + const ctxPct = input.context_window?.used_percentage; if (ctxPct != null) { - parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)) + parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)); } // 4. Session cost - const cost = input.cost?.total_cost_usd + const cost = input.cost?.total_cost_usd; if (cost != null && cost > 0) { - parts.push(dim(`$${cost.toFixed(2)}`)) + parts.push(dim(`$${cost.toFixed(2)}`)); } // 5. Rate limits (5h / 7d) - const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour) - const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day) + const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour); + const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day); if (rl5 || rl7) { - parts.push([rl5, rl7].filter(Boolean).join(dim(" "))) + parts.push([rl5, rl7].filter(Boolean).join(dim(" "))); } // 6. Lines changed - const added = input.cost?.total_lines_added - const removed = input.cost?.total_lines_removed + const added = input.cost?.total_lines_added; + const removed = input.cost?.total_lines_removed; if (added || removed) { - const diffParts: string[] = [] - if (added) diffParts.push(green(`+${added}`)) - if (removed) diffParts.push(red(`-${removed}`)) - parts.push(diffParts.join(dim("/"))) + const diffParts: string[] = []; + if (added) diffParts.push(green(`+${added}`)); + if (removed) diffParts.push(red(`-${removed}`)); + parts.push(diffParts.join(dim("/"))); } // 7. Active workflow - const workflow = getActiveWorkflow(projectDir) + const workflow = getActiveWorkflow(projectDir); if (workflow) { - const label = `${workflow.workflow}:${workflow.reinforcementCount}` - parts.push(yellow(label)) + const label = `${workflow.workflow}:${workflow.reinforcementCount}`; + parts.push(yellow(label)); } - process.stdout.write(parts.join(dim(" │ "))) + process.stdout.write(parts.join(dim(" │ "))); } -main() +main(); diff --git a/.agents/hooks/core/keyword-detector.ts b/.agents/hooks/core/keyword-detector.ts index 0ce0d0e..e838a0a 100644 --- a/.agents/hooks/core/keyword-detector.ts +++ b/.agents/hooks/core/keyword-detector.ts @@ -12,59 +12,205 @@ * exit 0 = always (allow) */ -import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { type ModeState, makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { + type ModeState, + makePromptOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +// ── Guard 1: UserPromptSubmit-only trigger ──────────────────── +// Hook event names that represent genuine user input (not agent responses) +const VALID_USER_EVENTS = new Set([ + "UserPromptSubmit", + "beforeSubmitPrompt", // Cursor + "BeforeAgent", // Gemini (fires before agent processes user prompt) +]); + +/** + * Returns true if the hook input indicates this is a genuine user prompt, + * not an agent-generated response. Prevents re-trigger loops. 
+ */
+export function isGenuineUserPrompt(input: Record<string, unknown>): boolean {
+  const event = input.hook_event_name as string | undefined;
+  // If event is explicitly provided, validate it
+  if (event !== undefined) {
+    return VALID_USER_EVENTS.has(event);
+  }
+  // No event field — assume genuine (backward compat with vendors that omit it)
+  return true;
+}
+
+// ── Guard 3: Reinforcement suppression ───────────────────────
+
+const REINFORCEMENT_WINDOW_MS = 60_000; // 60 seconds
+const REINFORCEMENT_MAX_COUNT = 2; // allow up to 2, suppress 3rd+
+
+export interface KeywordDetectorState {
+  triggers: Record<
+    string,
+    {
+      lastTriggeredAt: string; // ISO timestamp
+      count: number;
+    }
+  >;
+}
+
+function getKwStateFilePath(projectDir: string): string {
+  const dir = join(projectDir, ".agents", "state");
+  if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
+  return join(dir, "keyword-detector-state.json");
+}
+
+/**
+ * Load the keyword-detector reinforcement state from disk.
+ * Resets gracefully if the file is missing or corrupt.
+ */
+export function loadKwState(projectDir: string): KeywordDetectorState {
+  const filePath = getKwStateFilePath(projectDir);
+  if (!existsSync(filePath)) return { triggers: {} };
+  try {
+    const raw = readFileSync(filePath, "utf-8");
+    const parsed = JSON.parse(raw) as unknown;
+    if (
+      typeof parsed === "object" &&
+      parsed !== null &&
+      "triggers" in parsed &&
+      typeof (parsed as Record<string, unknown>).triggers === "object"
+    ) {
+      return parsed as KeywordDetectorState;
+    }
+    return { triggers: {} };
+  } catch {
+    // Corrupt file — reset
+    return { triggers: {} };
+  }
+}
+
+/**
+ * Save reinforcement state to disk.
+ */
+export function saveKwState(
+  projectDir: string,
+  state: KeywordDetectorState,
+): void {
+  try {
+    const filePath = getKwStateFilePath(projectDir);
+    writeFileSync(filePath, JSON.stringify(state, null, 2));
+  } catch {
+    // Non-fatal — reinforcement suppression is best-effort
+  }
+}
+
+/**
+ * Returns true if the keyword should be suppressed due to reinforcement loop.
+ * A keyword is suppressed if it was triggered >= REINFORCEMENT_MAX_COUNT times
+ * within the last REINFORCEMENT_WINDOW_MS milliseconds.
+ */
+export function isReinforcementSuppressed(
+  state: KeywordDetectorState,
+  keyword: string,
+  nowMs?: number,
+): boolean {
+  const now = nowMs ?? Date.now();
+  const entry = state.triggers[keyword];
+  if (!entry) return false;
+  const lastMs = new Date(entry.lastTriggeredAt).getTime();
+  if (Number.isNaN(lastMs)) return false;
+  const withinWindow = now - lastMs < REINFORCEMENT_WINDOW_MS;
+  return withinWindow && entry.count >= REINFORCEMENT_MAX_COUNT;
+}
+
+/**
+ * Record a keyword trigger in the reinforcement state.
+ * Resets count if the previous trigger was outside the window.
+ */
+export function recordKwTrigger(
+  state: KeywordDetectorState,
+  keyword: string,
+  nowMs?: number,
+): KeywordDetectorState {
+  const now = nowMs ?? Date.now();
+  const entry = state.triggers[keyword];
+  let count = 1;
+  if (entry) {
+    const lastMs = new Date(entry.lastTriggeredAt).getTime();
+    const withinWindow =
+      !Number.isNaN(lastMs) && now - lastMs < REINFORCEMENT_WINDOW_MS;
+    count = withinWindow ? entry.count + 1 : 1;
+  }
+  return {
+    ...state,
+    triggers: {
+      ...state.triggers,
+      [keyword]: {
+        lastTriggeredAt: new Date(now).toISOString(),
+        count,
+      },
+    },
+  };
+}
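Reviewer note: Guard 1 and Guard 3 are designed to compose: the event check drops agent-generated turns, and the reinforcement counter caps repeat activations of the same keyword. A minimal sketch of the intended call pattern, assuming a hypothetical Claude-style payload (the session id, prompt text, and loop below are invented for illustration):

```ts
import {
  isGenuineUserPrompt,
  isReinforcementSuppressed,
  loadKwState,
  recordKwTrigger,
  saveKwState,
} from "./keyword-detector.ts";

// Hypothetical payload shaped like a Claude UserPromptSubmit event.
const hookInput: Record<string, unknown> = {
  hook_event_name: "UserPromptSubmit",
  sessionId: "demo-session",
  prompt: "ultrawork: refactor the auth module",
};

if (isGenuineUserPrompt(hookInput)) {
  const projectDir = process.cwd();
  let state = loadKwState(projectDir);

  // With REINFORCEMENT_MAX_COUNT = 2, the first two triggers inside the
  // 60s window pass and the third is suppressed.
  for (let i = 0; i < 3; i++) {
    if (isReinforcementSuppressed(state, "ultrawork")) {
      console.log(`trigger ${i + 1}: suppressed`);
      continue;
    }
    state = recordKwTrigger(state, "ultrawork");
    console.log(`trigger ${i + 1}: allowed (count=${state.triggers.ultrawork.count})`);
  }
  saveKwState(projectDir, state);
}
```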
 
 // ── Vendor Detection ──────────────────────────────────────────
 
 function inferVendorFromScriptPath(): Vendor | null {
-  const path = import.meta.path
-  if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"
-  if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"
-  if (path.includes(`${join(".claude", "hooks")}`)) return "claude"
-  if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"
-  if (path.includes(`${join(".codex", "hooks")}`)) return "codex"
-  return null
+  const path = import.meta.path;
+  if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor";
+  if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen";
+  if (path.includes(`${join(".claude", "hooks")}`)) return "claude";
+  if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini";
+  if (path.includes(`${join(".codex", "hooks")}`)) return "codex";
+  return null;
 }
 
 function detectVendor(input: Record<string, unknown>): Vendor {
-  const event = input.hook_event_name as string | undefined
-  const byScriptPath = inferVendorFromScriptPath()
-  if (byScriptPath) return byScriptPath
-  if (event === "BeforeAgent") return "gemini"
-  if (event === "beforeSubmitPrompt") return "cursor"
+  const event = input.hook_event_name as string | undefined;
+  const byScriptPath = inferVendorFromScriptPath();
+  if (byScriptPath) return byScriptPath;
+  if (event === "BeforeAgent") return "gemini";
+  if (event === "beforeSubmitPrompt") return "cursor";
   if (event === "UserPromptSubmit") {
     // Codex uses snake_case session_id, Claude uses camelCase sessionId
-    if ("session_id" in input && !("sessionId" in input)) return "codex"
+    if ("session_id" in input && !("sessionId" in input)) return "codex";
   }
   // Qwen Code sets QWEN_PROJECT_DIR; Claude sets CLAUDE_PROJECT_DIR
-  if (process.env.QWEN_PROJECT_DIR) return "qwen"
-  return "claude"
+  if (process.env.QWEN_PROJECT_DIR) return "qwen";
+  return "claude";
 }
 
 function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string {
-  let dir: string
+  let dir: string;
   switch (vendor) {
     case "codex":
     case "cursor":
-      dir = (input.cwd as string) || process.cwd()
-      break
+      dir = (input.cwd as string) || process.cwd();
+      break;
     case "gemini":
-      dir = process.env.GEMINI_PROJECT_DIR || process.cwd()
-      break
+      dir = process.env.GEMINI_PROJECT_DIR || process.cwd();
+      break;
     case "qwen":
-      dir = process.env.QWEN_PROJECT_DIR || process.cwd()
-      break
+      dir = process.env.QWEN_PROJECT_DIR || process.cwd();
+      break;
     default:
-      dir = process.env.CLAUDE_PROJECT_DIR || process.cwd()
-      break
+      dir = process.env.CLAUDE_PROJECT_DIR || process.cwd();
+      break;
   }
-  return resolveGitRoot(dir)
+  return resolveGitRoot(dir);
 }
 
 function getSessionId(input: Record<string, unknown>): string {
-  return (input.sessionId as string) || (input.session_id as string) || "unknown"
+  return (
+    (input.sessionId as string) || (input.session_id as string) || "unknown"
+  );
 }
 
 // ── Config Loading ────────────────────────────────────────────
 
@@ -73,74 +219,83 @@ interface TriggerConfig {
   workflows: Record<
     string,
     {
-      persistent: boolean
-      keywords: Record<string, string[]>
+      persistent: boolean;
+      keywords: Record<string, string[]>;
     }
-  >
-  informationalPatterns: Record<string, string[]>
-  excludedWorkflows: string[]
-  cjkScripts: string[]
-  extensionRouting?: Record<string, string[]>
+  >;
+  informationalPatterns: Record<string, string[]>;
+  excludedWorkflows: string[];
+  cjkScripts: string[];
+  extensionRouting?: Record<string, string[]>;
 }
 
 function loadConfig(): TriggerConfig {
-  const configPath = join(dirname(import.meta.path), "triggers.json")
-  return JSON.parse(readFileSync(configPath, "utf-8"))
+  const configPath = join(dirname(import.meta.path), "triggers.json");
+  return JSON.parse(readFileSync(configPath, "utf-8"));
 }
 
 function detectLanguage(projectDir: string): string {
-  const prefsPath = join(projectDir, ".agents", "oma-config.yaml")
-  if (!existsSync(prefsPath)) return "en"
+  const prefsPath = join(projectDir, ".agents", "oma-config.yaml");
+  if (!existsSync(prefsPath)) return "en";
   try {
-    const content = readFileSync(prefsPath, "utf-8")
-    const match = content.match(/^language:\s*(\S+)/m)
-    return match?.[1] ?? "en"
+    const content = readFileSync(prefsPath, "utf-8");
+    const match = content.match(/^language:\s*(\S+)/m);
+    return match?.[1] ?? "en";
   } catch {
-    return "en"
+    return "en";
  }
 }
 
 // ── Pattern Builder ───────────────────────────────────────────
 
 export function escapeRegex(s: string): string {
-  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
+  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
 }
 
-export function buildPatterns(keywords: Record<string, string[]>, lang: string, cjkScripts: string[]): RegExp[] {
+export function buildPatterns(
+  keywords: Record<string, string[]>,
+  lang: string,
+  cjkScripts: string[],
+): RegExp[] {
   const allKeywords = [
     ...(keywords["*"] ?? []),
     ...(keywords.en ?? []),
     ...(lang !== "en" ? (keywords[lang] ?? []) : []),
-  ]
+  ];
   return allKeywords.map((kw) => {
-    const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+")
-    // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords
-    if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) {
-      return new RegExp(escaped, "i")
+    const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+");
+    if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) {
+      return new RegExp(escaped, "i");
    }
-    return new RegExp(`\\b${escaped}\\b`, "i")
-  })
+    return new RegExp(`\\b${escaped}\\b`, "i");
+  });
 }
 
-function buildInformationalPatterns(config: TriggerConfig, lang: string): RegExp[] {
-  const patterns = [...(config.informationalPatterns.en ?? [])]
+function buildInformationalPatterns(
+  config: TriggerConfig,
+  lang: string,
+): RegExp[] {
+  const patterns = [...(config.informationalPatterns.en ?? [])];
   if (lang !== "en") {
-    patterns.push(...(config.informationalPatterns[lang] ?? []))
+    patterns.push(...(config.informationalPatterns[lang] ?? []));
   }
   return patterns.map((p) => {
-    // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords
-    if (/[^\x00-\x7F]/.test(p)) return new RegExp(escapeRegex(p), "i")
-    return new RegExp(`\\b${escapeRegex(p)}\\b`, "i")
-  })
+    if (/[^\p{ASCII}]/u.test(p)) return new RegExp(escapeRegex(p), "i");
+    return new RegExp(`\\b${escapeRegex(p)}\\b`, "i");
+  });
 }
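Reviewer note: the word-boundary versus CJK branch in `buildPatterns` is the subtle part. English keywords get `\b` anchors and collapse internal whitespace, while CJK keywords match as plain substrings. A small illustration with made-up keyword lists:

```ts
import { buildPatterns } from "./keyword-detector.ts";

const cjk = ["ko", "ja", "zh"];

// English keywords get \b anchors, so "work" won't fire inside "workaround".
const en = buildPatterns({ "*": ["deep work"], en: ["work"] }, "en", cjk);
console.log(en.some((p) => p.test("please work on this"))); // true
console.log(en.some((p) => p.test("a workaround exists"))); // false (word boundary)
console.log(en.some((p) => p.test("deep   work session"))); // true (\s+ spans space runs)

// Korean keywords skip \b (CJK has no word boundaries) and match substrings.
const ko = buildPatterns({ ko: ["작업해"] }, "ko", cjk);
console.log(ko.some((p) => p.test("이 모듈 작업해줘"))); // true
```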
 
 // ── Filters ───────────────────────────────────────────────────
 
-export function isInformationalContext(prompt: string, matchIndex: number, infoPatterns: RegExp[]): boolean {
-  const windowStart = Math.max(0, matchIndex - 60)
-  const window = prompt.slice(windowStart, matchIndex + 60)
-  return infoPatterns.some((p) => p.test(window))
+export function isInformationalContext(
+  prompt: string,
+  matchIndex: number,
+  infoPatterns: RegExp[],
+): boolean {
+  const windowStart = Math.max(0, matchIndex - 60);
+  const window = prompt.slice(windowStart, matchIndex + 60);
+  return infoPatterns.some((p) => p.test(window));
 }
 
 /**
@@ -148,12 +303,16 @@ export function isInformationalContext(prompt: string, matchIndex: number, infoP
  * only match keywords in the first N chars of the user's prompt.
  * Keywords deep in the prompt are likely from pasted content, not user intent.
  */
-const PERSISTENT_MATCH_LIMIT = 200
-
-export function isPastedContent(matchIndex: number, isPersistent: boolean, promptLength: number): boolean {
-  if (!isPersistent) return false
-  if (promptLength <= PERSISTENT_MATCH_LIMIT) return false
-  return matchIndex > PERSISTENT_MATCH_LIMIT
+const PERSISTENT_MATCH_LIMIT = 200;
+
+export function isPastedContent(
+  matchIndex: number,
+  isPersistent: boolean,
+  promptLength: number,
+): boolean {
+  if (!isPersistent) return false;
+  if (promptLength <= PERSISTENT_MATCH_LIMIT) return false;
+  return matchIndex > PERSISTENT_MATCH_LIMIT;
 }
 
 /**
@@ -180,11 +339,11 @@ const QUESTION_PATTERNS: RegExp[] = [
   /^.*\banything worth\b/i,
   /^.*\bwhat.*(feature|difference|reference)/i,
   /^.*\bcompare\b/i,
-]
+];
 
 export function isAnalyticalQuestion(prompt: string): boolean {
-  const firstLine = prompt.split("\n")[0].trim()
-  return QUESTION_PATTERNS.some((p) => p.test(firstLine))
+  const firstLine = prompt.split("\n")[0].trim();
+  return QUESTION_PATTERNS.some((p) => p.test(firstLine));
 }
 
 export function stripCodeBlocks(text: string): string {
@@ -193,11 +352,11 @@ export function stripCodeBlocks(text: string): string {
     .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") // unclosed fenced blocks (strip to end)
     .replace(/`{3,}[^`]*`{3,}/g, "") // single-line fenced blocks (```...```)
     .replace(/`[^`\n]+`/g, "") // inline code (no newlines allowed)
-    .replace(/"[^"\n]*"/g, "") // quoted strings
+    .replace(/"[^"\n]*"/g, ""); // quoted strings
 }
 
 export function startsWithSlashCommand(prompt: string): boolean {
-  return /^\/[a-zA-Z][\w-]*/.test(prompt.trim())
+  return /^\/[a-zA-Z][\w-]*/.test(prompt.trim());
 }
 
 // ── Extension Detection ──────────────────────────────────────
 
@@ -228,62 +387,70 @@ const EXCLUDE_EXTS = new Set([
   "eot",
   "map",
   "d",
-])
+]);
 
 export function detectExtensions(prompt: string): string[] {
-  const extPattern = /\.([a-zA-Z]{1,12})\b/g
-  const extensions = new Set<string>()
-  let match: RegExpExecArray | null
-  // biome-ignore lint/suspicious/noAssignInExpressions: standard regex.exec loop pattern
-  while ((match = extPattern.exec(prompt)) !== null) {
-    const ext = match[1].toLowerCase()
+  const extPattern = /\.([a-zA-Z]{1,12})\b/g;
+  const extensions = new Set<string>();
+  for (const match of prompt.matchAll(extPattern)) {
+    const ext = match[1].toLowerCase();
     if (!EXCLUDE_EXTS.has(ext)) {
-      extensions.add(ext)
+      extensions.add(ext);
    }
  }
-  return [...extensions]
+  return [...extensions];
 }
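Reviewer note: these filters stack before any workflow can fire. A short sketch of their observable behavior (all prompts invented):

```ts
import {
  isAnalyticalQuestion,
  isPastedContent,
  stripCodeBlocks,
} from "./keyword-detector.ts";

// Inline code is removed before keyword scanning, so merely quoting a
// trigger keyword does not fire it.
const cleaned = stripCodeBlocks("run `ultrawork` only when I say so");
console.log(cleaned.includes("ultrawork")); // false

// For persistent workflows, a keyword 300 chars into a 2000-char prompt
// is treated as pasted content (only the first 200 chars count as intent).
console.log(isPastedContent(300, true, 2000)); // true
console.log(isPastedContent(50, true, 2000)); // false
console.log(isPastedContent(300, false, 2000)); // false (non-persistent)

// Analytical questions never activate persistent modes.
console.log(isAnalyticalQuestion("compare the two retry strategies")); // true
```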
 
-export function resolveAgentFromExtensions(extensions: string[], routing: Record<string, string[]>): string | null {
-  if (extensions.length === 0) return null
+export function resolveAgentFromExtensions(
+  extensions: string[],
+  routing: Record<string, string[]>,
+): string | null {
+  if (extensions.length === 0) return null;
 
-  const scores = new Map<string, number>()
+  const scores = new Map<string, number>();
   for (const ext of extensions) {
     for (const [agent, agentExts] of Object.entries(routing)) {
       if (agentExts.includes(ext)) {
-        scores.set(agent, (scores.get(agent) ?? 0) + 1)
+        scores.set(agent, (scores.get(agent) ?? 0) + 1);
      }
    }
  }
 
-  if (scores.size === 0) return null
+  if (scores.size === 0) return null;
 
-  let best: string | null = null
-  let bestScore = 0
+  let best: string | null = null;
+  let bestScore = 0;
   for (const [agent, score] of scores) {
     if (score > bestScore) {
-      bestScore = score
-      best = agent
+      bestScore = score;
+      best = agent;
    }
  }
-  return best
+  return best;
 }
 
 // ── State Management ──────────────────────────────────────────
 
 function getStateDir(projectDir: string): string {
-  const dir = join(projectDir, ".agents", "state")
-  if (!existsSync(dir)) mkdirSync(dir, { recursive: true })
-  return dir
+  const dir = join(projectDir, ".agents", "state");
+  if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
+  return dir;
 }
 
-function activateMode(projectDir: string, workflow: string, sessionId: string): void {
+function activateMode(
+  projectDir: string,
+  workflow: string,
+  sessionId: string,
+): void {
   const state: ModeState = {
     workflow,
     sessionId,
     activatedAt: new Date().toISOString(),
     reinforcementCount: 0,
-  }
-  writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2))
+  };
+  writeFileSync(
+    join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`),
+    JSON.stringify(state, null, 2),
+  );
 }
 
 // ── Deactivation Detection ───────────────────────────────────
@@ -300,27 +467,33 @@ export const DEACTIVATION_PHRASES: Record<string, string[]> = {
   ru: ["воркфлоу завершён", "рабочий процесс завершён"],
   nl: ["workflow voltooid", "workflow klaar"],
   pl: ["workflow zakończony", "workflow ukończony"],
-}
+};
 
 export function isDeactivationRequest(prompt: string, lang: string): boolean {
-  const phrases = [...(DEACTIVATION_PHRASES.en ?? []), ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? []) : [])]
-  const lower = prompt.toLowerCase()
-  return phrases.some((phrase) => lower.includes(phrase.toLowerCase()))
+  const phrases = [
+    ...(DEACTIVATION_PHRASES.en ?? []),
+    ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? []) : []),
+  ];
+  const lower = prompt.toLowerCase();
+  return phrases.some((phrase) => lower.includes(phrase.toLowerCase()));
 }
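Reviewer note: extension routing picks the agent whose file extensions dominate the prompt. A sketch with a hypothetical routing table; the real mapping lives in triggers.json under `extensionRouting`:

```ts
import {
  detectExtensions,
  resolveAgentFromExtensions,
} from "./keyword-detector.ts";

// Hypothetical routing table, invented for this example.
const routing: Record<string, string[]> = {
  frontend: ["tsx"],
  backend: ["go", "sql"],
};

const prompt = "migrate Profile.tsx and api.go, then update schema.sql";
const exts = detectExtensions(prompt); // ["tsx", "go", "sql"]

// backend scores 2 (go + sql), frontend scores 1 (tsx), so backend wins.
console.log(resolveAgentFromExtensions(exts, routing)); // "backend"
```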
 
-export function deactivateAllPersistentModes(projectDir: string, sessionId?: string): void {
-  const stateDir = join(projectDir, ".agents", "state")
-  if (!existsSync(stateDir)) return
+export function deactivateAllPersistentModes(
+  projectDir: string,
+  sessionId?: string,
+): void {
+  const stateDir = join(projectDir, ".agents", "state");
+  if (!existsSync(stateDir)) return;
   try {
-    const files = readdirSync(stateDir)
+    const files = readdirSync(stateDir);
     for (const file of files) {
       // Match session-scoped state files: {workflow}-state-{sessionId}.json
       if (sessionId) {
         if (file.endsWith(`-state-${sessionId}.json`)) {
-          unlinkSync(join(stateDir, file))
+          unlinkSync(join(stateDir, file));
        }
       } else if (/-state-/.test(file) && file.endsWith(".json")) {
-        unlinkSync(join(stateDir, file))
+        unlinkSync(join(stateDir, file));
      }
    }
   } catch {
@@ -331,55 +504,69 @@ export function deactivateAllPersistentModes(projectDir: string, sessionId?: str
 
 // ── Main ──────────────────────────────────────────────────────
 
 async function main() {
-  const raw = readFileSync("/dev/stdin", "utf-8")
-  let input: Record<string, unknown>
+  const raw = readFileSync("/dev/stdin", "utf-8");
+  let input: Record<string, unknown>;
   try {
-    input = JSON.parse(raw)
+    input = JSON.parse(raw);
   } catch {
-    process.exit(0)
+    process.exit(0);
  }
 
-  const vendor = detectVendor(input)
-  const projectDir = getProjectDir(vendor, input)
-  const sessionId = getSessionId(input)
-  const prompt = (input.prompt as string) ?? ""
+  // Guard 1: Only process genuine user prompts — skip agent-generated content
+  if (!isGenuineUserPrompt(input)) process.exit(0);
+
+  const vendor = detectVendor(input);
+  const projectDir = getProjectDir(vendor, input);
+  const sessionId = getSessionId(input);
+  const prompt = (input.prompt as string) ??
""; - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); - const config = loadConfig() - const lang = detectLanguage(projectDir) + const config = loadConfig(); + const lang = detectLanguage(projectDir); // Check for deactivation request before workflow detection if (isDeactivationRequest(prompt, lang)) { - deactivateAllPersistentModes(projectDir, sessionId) - process.exit(0) + deactivateAllPersistentModes(projectDir, sessionId); + process.exit(0); } - const infoPatterns = buildInformationalPatterns(config, lang) - const cleaned = stripCodeBlocks(prompt) - const excluded = new Set(config.excludedWorkflows) + const infoPatterns = buildInformationalPatterns(config, lang); + // Guard 2: Strip code blocks and inline code before scanning for keywords + const cleaned = stripCodeBlocks(prompt); + const excluded = new Set(config.excludedWorkflows); + + // Guard 3: Load reinforcement suppression state + const kwState = loadKwState(projectDir); // Skip persistent workflows entirely if the prompt is an analytical question - const analytical = isAnalyticalQuestion(cleaned) + const analytical = isAnalyticalQuestion(cleaned); for (const [workflow, def] of Object.entries(config.workflows)) { - if (excluded.has(workflow)) continue + if (excluded.has(workflow)) continue; // Analytical questions should never trigger persistent workflows - if (analytical && def.persistent) continue + if (analytical && def.persistent) continue; - const patterns = buildPatterns(def.keywords, lang, config.cjkScripts) + const patterns = buildPatterns(def.keywords, lang, config.cjkScripts); for (const pattern of patterns) { - const match = pattern.exec(cleaned) - if (!match) continue - if (isInformationalContext(cleaned, match.index, infoPatterns)) continue + const match = pattern.exec(cleaned); + if (!match) continue; + if (isInformationalContext(cleaned, match.index, infoPatterns)) continue; // Keywords deep in long prompts are likely pasted content, not user intent - if (isPastedContent(match.index, def.persistent, cleaned.length)) continue + if (isPastedContent(match.index, def.persistent, cleaned.length)) + continue; + + // Guard 3: Suppress if same workflow triggered too many times in 60s + if (isReinforcementSuppressed(kwState, workflow)) continue; if (def.persistent) { - activateMode(projectDir, workflow, sessionId) + activateMode(projectDir, workflow, sessionId); } + // Record this trigger for reinforcement tracking + const updatedState = recordKwTrigger(kwState, workflow); + saveKwState(projectDir, updatedState); const contextLines = [ `[OMA WORKFLOW: ${workflow.toUpperCase()}]`, @@ -387,26 +574,29 @@ async function main() { `Read and follow \`.agents/workflows/${workflow}.md\` step by step.`, `User request: ${prompt}`, `IMPORTANT: Start the workflow IMMEDIATELY. 
Do not ask for confirmation.`, - ] + ]; if (config.extensionRouting) { - const extensions = detectExtensions(prompt) - const agent = resolveAgentFromExtensions(extensions, config.extensionRouting) + const extensions = detectExtensions(prompt); + const agent = resolveAgentFromExtensions( + extensions, + config.extensionRouting, + ); if (agent) { - contextLines.push(`[OMA AGENT HINT: ${agent}]`) + contextLines.push(`[OMA AGENT HINT: ${agent}]`); } } - const context = contextLines.join("\n") + const context = contextLines.join("\n"); - process.stdout.write(makePromptOutput(vendor, context)) - process.exit(0) + process.stdout.write(makePromptOutput(vendor, context)); + process.exit(0); } } - process.exit(0) + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.agents/hooks/core/persistent-mode.ts b/.agents/hooks/core/persistent-mode.ts index 4936f4e..dfe6e06 100644 --- a/.agents/hooks/core/persistent-mode.ts +++ b/.agents/hooks/core/persistent-mode.ts @@ -13,125 +13,160 @@ * exit 2 = block stop */ -import { existsSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { isDeactivationRequest } from "./keyword-detector.ts" -import { type ModeState, makeBlockOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_REINFORCEMENTS = 5 -const STALE_HOURS = 2 +import { + existsSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { isDeactivationRequest } from "./keyword-detector.ts"; +import { + type ModeState, + makeBlockOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +const MAX_REINFORCEMENTS = 5; +const STALE_HOURS = 2; function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? 
"en"; } catch { - return "en" + return "en"; } } // ── Config Loading ──────────────────────────────────────────── interface TriggerConfig { - workflows: Record + workflows: Record; } function loadPersistentWorkflows(): string[] { - const configPath = join(dirname(import.meta.path), "triggers.json") + const configPath = join(dirname(import.meta.path), "triggers.json"); try { - const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")) + const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")); return Object.entries(config.workflows) .filter(([, def]) => def.persistent) - .map(([name]) => name) + .map(([name]) => name); } catch { - return ["ultrawork", "orchestrate", "work"] + return ["ultrawork", "orchestrate", "work"]; } } // ── Vendor Detection ────────────────────────────────────────── function detectVendor(input: Record): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "AfterAgent") return "gemini" + const event = input.hook_event_name as string | undefined; + if (event === "AfterAgent") return "gemini"; if (event === "Stop") { - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── State ───────────────────────────────────────────────────── function getStateDir(projectDir: string): string { - return join(projectDir, ".agents", "state") + return join(projectDir, ".agents", "state"); } -function readModeState(projectDir: string, workflow: string, sessionId: string): ModeState | null { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (!existsSync(path)) return null +function readModeState( + projectDir: string, + workflow: string, + sessionId: string, +): ModeState | null { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (!existsSync(path)) return null; try { - return JSON.parse(readFileSync(path, "utf-8")) as ModeState + return JSON.parse(readFileSync(path, "utf-8")) as ModeState; } catch { - return null + return null; } } export function isStale(state: ModeState): boolean { - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - return elapsed > STALE_HOURS * 60 * 60 * 1000 + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + return elapsed > STALE_HOURS * 60 * 60 * 1000; } -export function deactivate(projectDir: string, 
workflow: string, sessionId: string): void { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (existsSync(path)) unlinkSync(path) +export function deactivate( + projectDir: string, + workflow: string, + sessionId: string, +): void { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (existsSync(path)) unlinkSync(path); } -function incrementReinforcement(projectDir: string, workflow: string, sessionId: string, state: ModeState): void { - state.reinforcementCount += 1 - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) +function incrementReinforcement( + projectDir: string, + workflow: string, + sessionId: string, + state: ModeState, +): void { + state.reinforcementCount += 1; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const lang = detectLanguage(projectDir) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const lang = detectLanguage(projectDir); // Check all text fields in stdin for deactivation phrases. // The assistant may have included "workflow done" in its response, @@ -144,60 +179,60 @@ async function main() { input.transcript, ] .filter((v): v is string => typeof v === "string") - .join(" ") + .join(" "); if (textToCheck && isDeactivationRequest(textToCheck, lang)) { // Deactivate all persistent workflows for this session - const stateDir = join(projectDir, ".agents", "state") + const stateDir = join(projectDir, ".agents", "state"); if (existsSync(stateDir)) { try { - const suffix = `-state-${sessionId}.json` + const suffix = `-state-${sessionId}.json`; for (const file of readdirSync(stateDir)) { if (file.endsWith(suffix)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { /* ignore */ } } - process.exit(0) + process.exit(0); } - const persistentWorkflows = loadPersistentWorkflows() + const persistentWorkflows = loadPersistentWorkflows(); for (const workflow of persistentWorkflows) { - const state = readModeState(projectDir, workflow, sessionId) - if (!state) continue + const state = readModeState(projectDir, workflow, sessionId); + if (!state) continue; if (isStale(state) || state.reinforcementCount >= MAX_REINFORCEMENTS) { - deactivate(projectDir, workflow, sessionId) - continue + deactivate(projectDir, workflow, sessionId); + continue; } - incrementReinforcement(projectDir, workflow, sessionId, state) + incrementReinforcement(projectDir, workflow, sessionId, state); - const stateFile = `.agents/state/${workflow}-state-${sessionId}.json` + const stateFile = `.agents/state/${workflow}-state-${sessionId}.json`; const reason = [ `[OMA PERSISTENT MODE: ${workflow.toUpperCase()}]`, `The /${workflow} workflow is still active (reinforcement ${state.reinforcementCount}/${MAX_REINFORCEMENTS}).`, `Continue executing the workflow. If all tasks are genuinely complete:`, ` 1. 
Delete the state file: Bash \`rm ${stateFile}\``, ` 2. Or ask the user to say "워크플로우 완료" / "workflow done"`, - ].join("\n") + ].join("\n"); - writeBlockAndExit(vendor, reason) + writeBlockAndExit(vendor, reason); } - process.exit(0) + process.exit(0); } export function writeBlockAndExit(vendor: Vendor, reason: string): never { - process.stderr.write(reason) - process.stdout.write(makeBlockOutput(vendor, reason)) - process.exit(2) + process.stderr.write(reason); + process.stdout.write(makeBlockOutput(vendor, reason)); + process.exit(2); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.agents/hooks/core/skill-injector.ts b/.agents/hooks/core/skill-injector.ts index beda327..9ccce70 100644 --- a/.agents/hooks/core/skill-injector.ts +++ b/.agents/hooks/core/skill-injector.ts @@ -12,152 +12,163 @@ * persistent workflow is active (those modes own the session context). */ -import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs" -import { basename, dirname, join } from "node:path" -import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_SKILLS = 3 -const SESSION_TTL_MS = 60 * 60 * 1000 -const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"] +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + writeFileSync, +} from "node:fs"; +import { basename, dirname, join } from "node:path"; +import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts"; + +const MAX_SKILLS = 3; +const SESSION_TTL_MS = 60 * 60 * 1000; +const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"]; // ── Vendor Detection ────────────────────────────────────────── function inferVendorFromScriptPath(): Vendor | null { - const path = import.meta.path - if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor" - if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen" - if (path.includes(`${join(".claude", "hooks")}`)) return "claude" - if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini" - if (path.includes(`${join(".codex", "hooks")}`)) return "codex" - return null + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; } function detectVendor(input: Record): Vendor { - const event = input.hook_event_name as string | undefined - const byScriptPath = inferVendorFromScriptPath() - if (byScriptPath) return byScriptPath - if (event === "BeforeAgent") return "gemini" - if (event === "beforeSubmitPrompt") return "cursor" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeAgent") return "gemini"; + if (event === "beforeSubmitPrompt") return "cursor"; if (event === "UserPromptSubmit") { - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = 
(input.cwd as string) || process.cwd()
-      break
+      dir = (input.cwd as string) || process.cwd();
+      break;
     case "gemini":
-      dir = process.env.GEMINI_PROJECT_DIR || process.cwd()
-      break
+      dir = process.env.GEMINI_PROJECT_DIR || process.cwd();
+      break;
     case "qwen":
-      dir = process.env.QWEN_PROJECT_DIR || process.cwd()
-      break
+      dir = process.env.QWEN_PROJECT_DIR || process.cwd();
+      break;
     default:
-      dir = process.env.CLAUDE_PROJECT_DIR || process.cwd()
-      break
+      dir = process.env.CLAUDE_PROJECT_DIR || process.cwd();
+      break;
  }
-  return resolveGitRoot(dir)
+  return resolveGitRoot(dir);
 }
 
 function getSessionId(input: Record<string, unknown>): string {
-  return (input.sessionId as string) || (input.session_id as string) || "unknown"
+  return (
+    (input.sessionId as string) || (input.session_id as string) || "unknown"
+  );
 }
 
 // ── Config Loading ────────────────────────────────────────────
 
 interface SkillsTriggerConfig {
-  skills?: Record<string, { keywords: Record<string, string[]> }>
-  cjkScripts?: string[]
+  skills?: Record<string, { keywords: Record<string, string[]> }>;
+  cjkScripts?: string[];
 }
 
 function loadTriggersConfig(): SkillsTriggerConfig {
-  const configPath = join(dirname(import.meta.path), "triggers.json")
-  if (!existsSync(configPath)) return {}
+  const configPath = join(dirname(import.meta.path), "triggers.json");
+  if (!existsSync(configPath)) return {};
   try {
-    return JSON.parse(readFileSync(configPath, "utf-8"))
+    return JSON.parse(readFileSync(configPath, "utf-8"));
   } catch {
-    return {}
+    return {};
  }
 }
 
 function detectLanguage(projectDir: string): string {
-  const prefsPath = join(projectDir, ".agents", "oma-config.yaml")
-  if (!existsSync(prefsPath)) return "en"
+  const prefsPath = join(projectDir, ".agents", "oma-config.yaml");
+  if (!existsSync(prefsPath)) return "en";
   try {
-    const content = readFileSync(prefsPath, "utf-8")
-    const match = content.match(/^language:\s*(\S+)/m)
-    return match?.[1] ?? "en"
+    const content = readFileSync(prefsPath, "utf-8");
+    const match = content.match(/^language:\s*(\S+)/m);
+    return match?.[1] ?? "en";
   } catch {
-    return "en"
+    return "en";
  }
 }
 
 // ── Pattern Building ──────────────────────────────────────────
 
 export function escapeRegex(s: string): string {
-  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
+  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
 }
 
-export function buildTriggerPatterns(triggers: string[], lang: string, cjkScripts: string[]): RegExp[] {
+export function buildTriggerPatterns(
+  triggers: string[],
+  lang: string,
+  cjkScripts: string[],
+): RegExp[] {
   return triggers.map((kw) => {
-    const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+")
-    // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords
-    if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) {
-      return new RegExp(escaped, "i")
+    const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+");
+    if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) {
+      return new RegExp(escaped, "i");
    }
-    return new RegExp(`\\b${escaped}\\b`, "i")
-  })
+    return new RegExp(`\\b${escaped}\\b`, "i");
+  });
 }
 
 // ── Skill Discovery ───────────────────────────────────────────
 
 export interface SkillEntry {
-  name: string
-  absolutePath: string
-  relPath: string
+  name: string;
+  absolutePath: string;
+  relPath: string;
 }
 
 export function discoverSkills(projectDir: string): SkillEntry[] {
-  const skillsDir = join(projectDir, ".agents", "skills")
-  if (!existsSync(skillsDir)) return []
+  const skillsDir = join(projectDir, ".agents", "skills");
+  if (!existsSync(skillsDir)) return [];
 
-  const out: SkillEntry[] = []
-  let entries: ReturnType<typeof readdirSync>
+  const out: SkillEntry[] = [];
+  let entries: ReturnType<typeof readdirSync>;
   try {
-    entries = readdirSync(skillsDir, { withFileTypes: true })
+    entries = readdirSync(skillsDir, { withFileTypes: true });
   } catch {
-    return out
+    return out;
  }
 
   for (const entry of entries) {
-    if (!entry.isDirectory()) continue
-    if (entry.name.startsWith("_")) continue
+    if (!entry.isDirectory()) continue;
+    if (entry.name.startsWith("_")) continue;
 
-    const skillPath = join(skillsDir, entry.name, "SKILL.md")
-    if (!existsSync(skillPath)) continue
+    const skillPath = join(skillsDir, entry.name, "SKILL.md");
+    if (!existsSync(skillPath)) continue;
 
     out.push({
       name: entry.name,
       absolutePath: skillPath,
       relPath: join(".agents", "skills", entry.name, "SKILL.md"),
-    })
+    });
  }
 
-  return out
+  return out;
 }
 
 // ── Matching ──────────────────────────────────────────────────
 
 export interface SkillMatch {
-  name: string
-  relPath: string
-  score: number
-  matchedTriggers: string[]
+  name: string;
+  relPath: string;
+  score: number;
+  matchedTriggers: string[];
 }
 
 export function matchSkills(
@@ -166,37 +177,37 @@
   skills: SkillEntry[],
   config: SkillsTriggerConfig,
 ): SkillMatch[] {
-  const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS
-  const matches: SkillMatch[] = []
+  const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS;
+  const matches: SkillMatch[] = [];
 
   for (const skill of skills) {
-    const jsonEntry = config.skills?.[skill.name]
-    if (!jsonEntry) continue
+    const jsonEntry = config.skills?.[skill.name];
+    if (!jsonEntry) continue;
 
     const jsonTriggers = [
       ...(jsonEntry.keywords["*"] ?? []),
       ...(jsonEntry.keywords.en ?? []),
       ...(lang !== "en" ? (jsonEntry.keywords[lang] ?? []) : []),
-    ]
+    ];
 
-    const seen = new Set<string>()
-    const allTriggers: string[] = []
+    const seen = new Set<string>();
+    const allTriggers: string[] = [];
     for (const t of jsonTriggers) {
-      const key = t.toLowerCase()
-      if (seen.has(key)) continue
-      seen.add(key)
-      allTriggers.push(t)
+      const key = t.toLowerCase();
+      if (seen.has(key)) continue;
+      seen.add(key);
+      allTriggers.push(t);
    }
 
-    if (allTriggers.length === 0) continue
+    if (allTriggers.length === 0) continue;
 
-    const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts)
-    const matched: string[] = []
-    let score = 0
+    const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts);
+    const matched: string[] = [];
+    let score = 0;
     for (let i = 0; i < patterns.length; i++) {
       if (patterns[i].test(prompt)) {
-        matched.push(allTriggers[i])
-        score += 10
+        matched.push(allTriggers[i]);
+        score += 10;
      }
    }
 
@@ -206,43 +217,45 @@ export function matchSkills(
       relPath: skill.relPath,
       score,
       matchedTriggers: matched,
-    })
+    });
  }
 
-  matches.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.name.localeCompare(b.name)))
-  return matches.slice(0, MAX_SKILLS)
+  matches.sort((a, b) =>
+    b.score !== a.score ? b.score - a.score : a.name.localeCompare(b.name),
+  );
+  return matches.slice(0, MAX_SKILLS);
 }
[]) : []), - ] + ]; - const seen = new Set() - const allTriggers: string[] = [] + const seen = new Set(); + const allTriggers: string[] = []; for (const t of jsonTriggers) { - const key = t.toLowerCase() - if (seen.has(key)) continue - seen.add(key) - allTriggers.push(t) + const key = t.toLowerCase(); + if (seen.has(key)) continue; + seen.add(key); + allTriggers.push(t); } - if (allTriggers.length === 0) continue + if (allTriggers.length === 0) continue; - const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts) - const matched: string[] = [] - let score = 0 + const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts); + const matched: string[] = []; + let score = 0; for (let i = 0; i < patterns.length; i++) { if (patterns[i].test(prompt)) { - matched.push(allTriggers[i]) - score += 10 + matched.push(allTriggers[i]); + score += 10; } } @@ -206,43 +217,45 @@ export function matchSkills( relPath: skill.relPath, score, matchedTriggers: matched, - }) + }); } } - matches.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.name.localeCompare(b.name))) - return matches.slice(0, MAX_SKILLS) + matches.sort((a, b) => + b.score !== a.score ? b.score - a.score : a.name.localeCompare(b.name), + ); + return matches.slice(0, MAX_SKILLS); } // ── Session Dedup State ─────────────────────────────────────── interface SessionState { - sessions: Record + sessions: Record; } function getStatePath(projectDir: string): string { - return join(projectDir, ".agents", "state", "skill-sessions.json") + return join(projectDir, ".agents", "state", "skill-sessions.json"); } function readState(projectDir: string): SessionState { - const p = getStatePath(projectDir) - if (!existsSync(p)) return { sessions: {} } + const p = getStatePath(projectDir); + if (!existsSync(p)) return { sessions: {} }; try { - const parsed = JSON.parse(readFileSync(p, "utf-8")) + const parsed = JSON.parse(readFileSync(p, "utf-8")); if (parsed && typeof parsed === "object" && parsed.sessions) { - return parsed as SessionState + return parsed as SessionState; } } catch { // corrupted — reset } - return { sessions: {} } + return { sessions: {} }; } function writeState(projectDir: string, state: SessionState): void { - const p = getStatePath(projectDir) + const p = getStatePath(projectDir); try { - mkdirSync(dirname(p), { recursive: true }) - writeFileSync(p, JSON.stringify(state, null, 2)) + mkdirSync(dirname(p), { recursive: true }); + writeFileSync(p, JSON.stringify(state, null, 2)); } catch { // dedup failing open is acceptable } @@ -254,47 +267,57 @@ export function filterFreshMatches( sessionId: string, now: number = Date.now(), ): { fresh: SkillMatch[]; nextState: SessionState } { - const state = readState(projectDir) + const state = readState(projectDir); for (const [id, sess] of Object.entries(state.sessions)) { if (now - sess.timestamp > SESSION_TTL_MS) { - delete state.sessions[id] + delete state.sessions[id]; } } - const current = state.sessions[sessionId] - const alreadyInjected = new Set(current && now - current.timestamp <= SESSION_TTL_MS ? current.injected : []) + const current = state.sessions[sessionId]; + const alreadyInjected = new Set( + current && now - current.timestamp <= SESSION_TTL_MS + ? current.injected + : [], + ); - const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)) + const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)); if (fresh.length > 0) { - const existing = state.sessions[sessionId]?.injected ?? 
[] + const existing = state.sessions[sessionId]?.injected ?? []; state.sessions[sessionId] = { injected: [...new Set([...existing, ...fresh.map((m) => m.relPath)])], timestamp: now, - } + }; } - return { fresh, nextState: state } + return { fresh, nextState: state }; } // ── Workflow Guard ──────────────────────────────────────────── -export function isPersistentWorkflowActive(projectDir: string, sessionId: string): boolean { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return false +export function isPersistentWorkflowActive( + projectDir: string, + sessionId: string, +): boolean { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return false; try { - const files = readdirSync(stateDir) - return files.some((f) => f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json") + const files = readdirSync(stateDir); + return files.some( + (f) => + f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json", + ); } catch { - return false + return false; } } // ── Prompt Sanitation ───────────────────────────────────────── export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } export function stripCodeBlocks(text: string): string { @@ -303,7 +326,7 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") .replace(/`{3,}[^`]*`{3,}/g, "") .replace(/`[^`\n]+`/g, "") - .replace(/"[^"\n]*"/g, "") + .replace(/"[^"\n]*"/g, ""); } // ── Context Formatting ──────────────────────────────────────── @@ -313,55 +336,61 @@ export function formatContext(matches: SkillMatch[]): string { `[OMA SKILLS DETECTED: ${matches.map((m) => m.name).join(", ")}]`, "User intent matches the following skills:", "", - ] + ]; for (const m of matches) { - lines.push(`- **${m.name}** — \`${m.relPath}\``) - lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`) + lines.push(`- **${m.name}** — \`${m.relPath}\``); + lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`); } - lines.push("") - lines.push("Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.") - return lines.join("\n") + lines.push(""); + lines.push( + "Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.", + ); + return lines.join("\n"); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? 
"" - - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) - if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0) - - const lang = detectLanguage(projectDir) - const config = loadTriggersConfig() - const cleaned = stripCodeBlocks(prompt) - const skills = discoverSkills(projectDir) - - const matches = matchSkills(cleaned, lang, skills, config) - if (matches.length === 0) process.exit(0) - - const { fresh, nextState } = filterFreshMatches(matches, projectDir, sessionId) - if (fresh.length === 0) process.exit(0) - - writeState(projectDir, nextState) - process.stdout.write(makePromptOutput(vendor, formatContext(fresh))) - process.exit(0) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? ""; + + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); + if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0); + + const lang = detectLanguage(projectDir); + const config = loadTriggersConfig(); + const cleaned = stripCodeBlocks(prompt); + const skills = discoverSkills(projectDir); + + const matches = matchSkills(cleaned, lang, skills, config); + if (matches.length === 0) process.exit(0); + + const { fresh, nextState } = filterFreshMatches( + matches, + projectDir, + sessionId, + ); + if (fresh.length === 0) process.exit(0); + + writeState(projectDir, nextState); + process.stdout.write(makePromptOutput(vendor, formatContext(fresh))); + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } // Avoid unused-import lint for basename when testing subsets of this module. 
-void basename +void basename; diff --git a/.agents/hooks/core/test-filter.ts b/.agents/hooks/core/test-filter.ts index a0ce2fc..dfed50f 100644 --- a/.agents/hooks/core/test-filter.ts +++ b/.agents/hooks/core/test-filter.ts @@ -1,51 +1,51 @@ // PreToolUse hook — Filter test output to show only failures // Works with: Claude Code, Codex CLI, Gemini CLI, Qwen Code -import { existsSync } from "node:fs" -import { join } from "node:path" -import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts"; // --- Vendor detection (same logic as keyword-detector.ts) --- function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "BeforeTool") return "gemini" + const event = input.hook_event_name as string | undefined; + if (event === "BeforeTool") return "gemini"; if (event === "PreToolUse") { - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getHookDir(vendor: Vendor): string { switch (vendor) { case "codex": - return ".codex/hooks" + return ".codex/hooks"; case "gemini": - return ".gemini/hooks" + return ".gemini/hooks"; case "qwen": - return ".qwen/hooks" + return ".qwen/hooks"; default: - return ".claude/hooks" + return ".claude/hooks"; } } @@ -78,66 +78,70 @@ const TEST_PATTERNS = [ /\brspec\b/, /\bmix\s+test\b/, /\bphpunit\b/, -] +]; // Commands that mention test runners but aren't running tests const EXCLUDE_PATTERNS = [ /\b(install|add|remove|uninstall|init)\b/, /\b(cat|head|tail|less|more|wc)\b.*\.(test|spec)\./, -] +]; // --- Hook input --- interface PreToolUseInput { - tool_name: string + tool_name: string; tool_input: { - command?: string - [key: string]: unknown - } - hook_event_name?: string - session_id?: string - sessionId?: string - cwd?: string + command?: string; + [key: string]: unknown; + }; + hook_event_name?: string; + session_id?: string; + sessionId?: string; + cwd?: string; } // --- Main --- -const raw = await Bun.stdin.text() -if (!raw.trim()) process.exit(0) +const raw = await Bun.stdin.text(); +if (!raw.trim()) process.exit(0); -const input: PreToolUseInput = JSON.parse(raw) +const input: PreToolUseInput = JSON.parse(raw); // Gemini uses run_shell_command; Claude-family uses Bash. if (input.tool_name !== "Bash" && input.tool_name !== "run_shell_command") { - process.exit(0) + process.exit(0); } -const command = input.tool_input?.command -if (!command) process.exit(0) +const command = input.tool_input?.command; +if (!command) process.exit(0); // Check if this is a test command -const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)) -if (!isTestCommand) process.exit(0) +const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)); +if (!isTestCommand) process.exit(0); // Skip if it's a non-test use of test tool names (install, cat, etc.) -const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)) +const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)); +if (isExcluded) process.exit(0); -if (isExcluded) process.exit(0) // Detect vendor and resolve project dir -const vendor = detectVendor(input) -const projectDir = getProjectDir(vendor, input) -const filterScript = join(projectDir, getHookDir(vendor), "filter-test-output.sh") +const vendor = detectVendor(input); +const projectDir = getProjectDir(vendor, input); +const filterScript = join( + projectDir, + getHookDir(vendor), + "filter-test-output.sh", ); // Skip filtering if the script doesn't exist (hooks not fully installed) -if (!existsSync(filterScript)) process.exit(0) +if (!existsSync(filterScript)) process.exit(0); // Rewrite command to pipe through filter -const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"` +const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"`; // Return updated input with all original fields preserved const updatedInput: Record<string, unknown> = { ...input.tool_input, command: filteredCmd, -} +}; -console.log(makePreToolOutput(vendor, updatedInput)) +console.log(makePreToolOutput(vendor, updatedInput)); diff --git a/.agents/hooks/core/triggers.json b/.agents/hooks/core/triggers.json index f404583..0a1513f 100644 --- a/.agents/hooks/core/triggers.json +++ b/.agents/hooks/core/triggers.json @@ -43,9 +43,35 @@ "全部お願い", "まとめてやって" ], - "zh": ["编排", "并行执行", "自动执行", "全部执行", "全部做", "自动处理", "一起做", "全做了", "帮我全做"], - "es": ["orquestar", "paralelo", "ejecutar todo", "hazlo todo", "ejecuta todo", "automatiza", "haz todo"], - "fr": ["orchestrer", "parallèle", "tout exécuter", "fais tout", "exécute tout", "automatise", "gère tout"], + "zh": [ + "编排", + "并行执行", + "自动执行", + "全部执行", + "全部做", + "自动处理", + "一起做", + "全做了", + "帮我全做" + ], + "es": [ + "orquestar", + "paralelo", + "ejecutar todo", + "hazlo todo", + "ejecuta todo", + "automatiza", + "haz todo" + ], + "fr": [ + "orchestrer", + "parallèle", + "tout exécuter", + "fais tout", + "exécute tout", + "automatise", + "gère tout" + ], "de": [ "orchestrieren", "parallel", @@ -55,7 +81,15 @@ "automatisieren", "alles auf einmal" ], - "pt": ["orquestrar", "paralelo", "executar tudo", "faça tudo", "execute tudo", "automatize", "resolva tudo"], + "pt": [ + "orquestrar", + "paralelo", + "executar tudo", + "faça tudo", + "execute tudo", + "automatize", + "resolva tudo" + ], "ru": [ "оркестровать", "параллельно", @@ -128,7 +162,16 @@ "トレードオフ", "品質特性" ], - "zh": ["架构", "系统设计", "软件设计", "架构评审", "模块边界", "服务边界", "权衡分析", "质量属性"], + "zh": [ + "架构", + "系统设计", + "软件设计", + "架构评审", + "模块边界", + "服务边界", + "权衡分析", + "质量属性" + ], "es": [ "arquitectura", "diseño de sistemas", @@ -205,7 +248,15 @@ "persistent": false, "keywords": { "*": ["task breakdown"], - "en": ["plan", "make a plan", "create a plan", "break down", "analyze requirements", "plan this", "decompose"], + "en": [ + "plan", + "make a plan", + "create a
plan", + "break down", + "analyze requirements", + "plan this", + "decompose" + ], "ko": [ "계획", "요구사항 분석", @@ -235,7 +286,16 @@ "設計して", "プランを作って" ], - "zh": ["计划", "需求分析", "任务分解", "制定计划", "做个计划", "分析一下", "拆分任务", "规划一下"], + "zh": [ + "计划", + "需求分析", + "任务分解", + "制定计划", + "做个计划", + "分析一下", + "拆分任务", + "规划一下" + ], "es": [ "plan", "planificar", @@ -286,7 +346,15 @@ "разбей на задачи", "спланируй" ], - "nl": ["plan", "plannen", "vereistenanalyse", "maak een plan", "analyseer", "splits op", "plan dit"], + "nl": [ + "plan", + "plannen", + "vereistenanalyse", + "maak een plan", + "analyseer", + "splits op", + "plan dit" + ], "pl": [ "plan", "planować", @@ -303,7 +371,15 @@ "persistent": false, "keywords": { "*": ["code review", "security audit", "security review"], - "en": ["review", "review this", "review my code", "check my code", "audit", "inspect", "code check"], + "en": [ + "review", + "review this", + "review my code", + "check my code", + "audit", + "inspect", + "code check" + ], "ko": [ "리뷰", "코드 검토", @@ -330,7 +406,17 @@ "点検して", "コード確認" ], - "zh": ["审查", "代码审查", "安全审计", "审查一下", "检查一下", "看看代码", "检查代码", "代码检查", "安全检查"], + "zh": [ + "审查", + "代码审查", + "安全审计", + "审查一下", + "检查一下", + "看看代码", + "检查代码", + "代码检查", + "安全检查" + ], "es": [ "revisión", "revisar código", @@ -613,7 +699,17 @@ "アイデアちょうだい", "一緒に考えよう" ], - "zh": ["头脑风暴", "创意", "设计探索", "想想", "出主意", "有什么想法", "想个办法", "出点子", "集思广益"], + "zh": [ + "头脑风暴", + "创意", + "设计探索", + "想想", + "出主意", + "有什么想法", + "想个办法", + "出点子", + "集思广益" + ], "es": [ "lluvia de ideas", "idear", @@ -644,7 +740,16 @@ "vorschläge", "lass uns überlegen" ], - "pt": ["brainstorming", "idear", "explorar design", "pense em", "e se", "ideias para", "sugira", "imagine"], + "pt": [ + "brainstorming", + "idear", + "explorar design", + "pense em", + "e se", + "ideias para", + "sugira", + "imagine" + ], "ru": [ "мозговой штурм", "идеи", @@ -681,7 +786,13 @@ "persistent": true, "keywords": { "*": ["work", "step by step"], - "en": ["one by one", "guide me", "walk me through", "manual mode", "one step at a time"], + "en": [ + "one by one", + "guide me", + "walk me through", + "manual mode", + "one step at a time" + ], "ko": [ "단계별", "단계별로", @@ -693,9 +804,32 @@ "차근차근 해줘", "수동으로 해줘" ], - "ja": ["ステップバイステップ", "一歩ずつ", "ガイドして", "手動で", "一つずつ", "順番にやって", "手順を教えて"], - "zh": ["逐步", "一步一步", "指导我", "手动", "一个一个", "按顺序", "带我做"], - "es": ["paso a paso", "guíame", "uno por uno", "modo manual", "de a uno", "llévame paso a paso"], + "ja": [ + "ステップバイステップ", + "一歩ずつ", + "ガイドして", + "手動で", + "一つずつ", + "順番にやって", + "手順を教えて" + ], + "zh": [ + "逐步", + "一步一步", + "指导我", + "手动", + "一个一个", + "按顺序", + "带我做" + ], + "es": [ + "paso a paso", + "guíame", + "uno por uno", + "modo manual", + "de a uno", + "llévame paso a paso" + ], "fr": [ "étape par étape", "guide-moi", @@ -712,8 +846,22 @@ "zeig mir wie", "der reihe nach" ], - "pt": ["passo a passo", "me guie", "um por um", "modo manual", "me acompanhe", "me mostre passo a passo"], - "ru": ["шаг за шагом", "направь меня", "по одному", "ручной режим", "покажи по шагам", "веди меня"], + "pt": [ + "passo a passo", + "me guie", + "um por um", + "modo manual", + "me acompanhe", + "me mostre passo a passo" + ], + "ru": [ + "шаг за шагом", + "направь меня", + "по одному", + "ручной режим", + "покажи по шагам", + "веди меня" + ], "nl": [ "stap voor stap", "begeleid me", @@ -736,7 +884,14 @@ "persistent": false, "keywords": { "*": ["deepinit"], - "en": ["init project", "initialize", "setup project", "new project", "scaffold", "bootstrap"], + "en": [ + "init project", + 
"initialize", + "setup project", + "new project", + "scaffold", + "bootstrap" + ], "ko": [ "프로젝트 초기화", "코드베이스 초기화", @@ -757,7 +912,15 @@ "プロジェクトを作って", "プロジェクト設定" ], - "zh": ["项目初始化", "新项目", "设置项目", "搭建项目", "初始化", "创建项目", "项目配置"], + "zh": [ + "项目初始化", + "新项目", + "设置项目", + "搭建项目", + "初始化", + "创建项目", + "项目配置" + ], "es": [ "inicializar proyecto", "nuevo proyecto", @@ -1381,8 +1544,20 @@ "define boundaries", "architecture tradeoffs" ], - "ko": ["아키텍처 짜줘", "시스템 구조 설계", "경계 정의해줘", "구조 검토해줘", "아키텍처 문서"], - "ja": ["アーキテクチャを設計", "システム構成を考えて", "境界を定義", "構成レビュー", "アーキ文書"], + "ko": [ + "아키텍처 짜줘", + "시스템 구조 설계", + "경계 정의해줘", + "구조 검토해줘", + "아키텍처 문서" + ], + "ja": [ + "アーキテクチャを設計", + "システム構成を考えて", + "境界を定義", + "構成レビュー", + "アーキ文書" + ], "zh": ["设计架构", "系统架构方案", "定义边界", "架构文档", "架构权衡"] } }, @@ -1403,17 +1578,45 @@ "server implementation", "clean architecture" ], - "ko": ["api 만들어줘", "엔드포인트 추가", "백엔드 구현", "마이그레이션 작성", "인증 붙여줘"], - "ja": ["apiを作って", "エンドポイント追加", "バックエンド実装", "マイグレーション書いて", "認証を実装"], + "ko": [ + "api 만들어줘", + "엔드포인트 추가", + "백엔드 구현", + "마이그레이션 작성", + "인증 붙여줘" + ], + "ja": [ + "apiを作って", + "エンドポイント追加", + "バックエンド実装", + "マイグレーション書いて", + "認証を実装" + ], "zh": ["写个接口", "加接口", "后端实现", "写迁移", "加认证"] } }, "oma-brainstorm": { "keywords": { "*": [], - "en": ["toss around ideas", "kick around options", "spitball", "some ideas please", "ideation session"], - "ko": ["아이디어 좀 뽑아줘", "같이 고민해줘", "아이디어 내보자", "방향성 고민"], - "ja": ["アイデア出して", "一緒に考えて", "方向性を探りたい", "案を出して"], + "en": [ + "toss around ideas", + "kick around options", + "spitball", + "some ideas please", + "ideation session" + ], + "ko": [ + "아이디어 좀 뽑아줘", + "같이 고민해줘", + "아이디어 내보자", + "방향성 고민" + ], + "ja": [ + "アイデア出して", + "一緒に考えて", + "方向性を探りたい", + "案を出して" + ], "zh": ["帮我想想", "一起想想办法", "给点灵感"] } }, @@ -1430,8 +1633,18 @@ "cli handoff", "manual orchestration" ], - "ko": ["에이전트 조율", "에이전트끼리 협업", "수동으로 에이전트 돌려", "에이전트 순서 잡아줘"], - "ja": ["エージェントを調整", "エージェント連携", "手動でエージェント", "エージェントの順序"], + "ko": [ + "에이전트 조율", + "에이전트끼리 협업", + "수동으로 에이전트 돌려", + "에이전트 순서 잡아줘" + ], + "ja": [ + "エージェントを調整", + "エージェント連携", + "手動でエージェント", + "エージェントの順序" + ], "zh": ["协调代理", "代理之间协作", "手动跑代理", "代理之间衔接"] } }, @@ -1454,8 +1667,20 @@ "data migration", "capacity planning" ], - "ko": ["스키마 설계", "테이블 설계", "인덱스 튜닝", "쿼리 느려", "용량 산정"], - "ja": ["スキーマ設計", "テーブル設計", "インデックス調整", "クエリが遅い", "容量見積"], + "ko": [ + "스키마 설계", + "테이블 설계", + "인덱스 튜닝", + "쿼리 느려", + "용량 산정" + ], + "ja": [ + "スキーマ設計", + "テーブル設計", + "インデックス調整", + "クエリが遅い", + "容量見積" + ], "zh": ["设计表结构", "表设计", "索引优化", "查询很慢", "容量评估"] } }, @@ -1474,8 +1699,20 @@ "crash fix", "error investigation" ], - "ko": ["버그 찾아줘", "에러 원인", "크래시 분석", "스택트레이스 봐줘", "원인 파악해줘"], - "ja": ["バグを探して", "エラー原因", "クラッシュを分析", "スタックトレースを見て", "原因を特定"], + "ko": [ + "버그 찾아줘", + "에러 원인", + "크래시 분석", + "스택트레이스 봐줘", + "원인 파악해줘" + ], + "ja": [ + "バグを探して", + "エラー原因", + "クラッシュを分析", + "スタックトレースを見て", + "原因を特定" + ], "zh": ["找出 bug", "错误原因", "分析崩溃", "看堆栈", "定位原因"] } }, @@ -1493,8 +1730,19 @@ "responsive layout", "motion design" ], - "ko": ["디자인 토큰", "랜딩 만들어줘", "컬러 팔레트 잡아줘", "타이포 스케일", "모션 가이드"], - "ja": ["デザイントークン", "ランディング作成", "カラーパレット決めて", "モーション設計"], + "ko": [ + "디자인 토큰", + "랜딩 만들어줘", + "컬러 팔레트 잡아줘", + "타이포 스케일", + "모션 가이드" + ], + "ja": [ + "デザイントークン", + "ランディング作成", + "カラーパレット決めて", + "モーション設計" + ], "zh": ["设计令牌", "做个落地页", "定配色", "字体层级", "动效规范"] } }, @@ -1514,9 +1762,27 @@ "release automation", "build automation" ], - "ko": ["mise 태스크", "ci 파이프라인", "릴리즈 자동화", "깃 훅 설정", "모노레포 워크플로우"], - "ja": ["miseタスク", "ciパイプライン", "リリース自動化", "gitフック", "モノレポ作業"], - "zh": ["mise 任务", "ci 流水线", "发布自动化", 
"git 钩子", "monorepo 工作流"] + "ko": [ + "mise 태스크", + "ci 파이프라인", + "릴리즈 자동화", + "깃 훅 설정", + "모노레포 워크플로우" + ], + "ja": [ + "miseタスク", + "ciパイプライン", + "リリース自動化", + "gitフック", + "モノレポ作業" + ], + "zh": [ + "mise 任务", + "ci 流水线", + "发布自动化", + "git 钩子", + "monorepo 工作流" + ] } }, "oma-frontend": { @@ -1534,9 +1800,27 @@ "frontend ui", "FSD architecture" ], - "ko": ["리액트 컴포넌트", "넥스트 페이지", "tailwind로 스타일", "shadcn 붙여줘", "프론트 구현"], - "ja": ["reactコンポーネント", "nextページ", "tailwindで装飾", "shadcn導入", "フロント実装"], - "zh": ["写个 react 组件", "next 页面", "用 tailwind", "接入 shadcn", "前端实现"] + "ko": [ + "리액트 컴포넌트", + "넥스트 페이지", + "tailwind로 스타일", + "shadcn 붙여줘", + "프론트 구현" + ], + "ja": [ + "reactコンポーネント", + "nextページ", + "tailwindで装飾", + "shadcn導入", + "フロント実装" + ], + "zh": [ + "写个 react 组件", + "next 页面", + "用 tailwind", + "接入 shadcn", + "前端实现" + ] } }, "oma-hwp": { @@ -1551,7 +1835,16 @@ "hangul word processor", "hwp ingestion" ], - "ko": ["한글 파일", "한글 변환", "한글 파싱", "hwp 변환", "hwp 파싱", "hwp 마크다운", "hwpx 변환", "hwpx 파싱"], + "ko": [ + "한글 파일", + "한글 변환", + "한글 파싱", + "hwp 변환", + "hwp 파싱", + "hwp 마크다운", + "hwpx 변환", + "hwpx 파싱" + ], "ja": ["hwp変換", "hwpをマークダウン", "hwpを解析", "韓国語ワープロ"], "zh": ["hwp 转换", "hwp 解析", "hwp 转 markdown", "韩文文档"] } @@ -1571,9 +1864,233 @@ "mobile app", "android ios" ], - "ko": ["플러터 화면", "리액트 네이티브 화면", "다트 위젯", "안드로이드 아이폰 앱", "모바일 앱"], - "ja": ["flutter画面", "react native画面", "dartウィジェット", "iosアンドロイド", "モバイルアプリ"], - "zh": ["flutter 页面", "react native 页面", "dart 组件", "安卓 ios", "移动端应用"] + "ko": [ + "플러터 화면", + "리액트 네이티브 화면", + "다트 위젯", + "안드로이드 아이폰 앱", + "모바일 앱" + ], + "ja": [ + "flutter画面", + "react native画面", + "dartウィジェット", + "iosアンドロイド", + "モバイルアプリ" + ], + "zh": [ + "flutter 页面", + "react native 页面", + "dart 组件", + "安卓 ios", + "移动端应用" + ] + } + }, + "oma-observability": { + "keywords": { + "*": [ + "OpenTelemetry", + "OTel", + "OTLP", + "W3C Trace Context", + "traceparent", + "MELT", + "APM", + "RUM", + "SLO", + "SLI", + "burn-rate", + "PromQL", + "Prometheus", + "Grafana", + "Jaeger", + "Tempo", + "Loki", + "Mimir", + "Fluent Bit", + "OpenCost", + "OpenFeature", + "Flagger", + "Falco", + "Parca", + "Pyroscope", + "Honeycomb", + "Datadog", + "Sentry", + "Crashlytics", + "Core Web Vitals" + ], + "en": [ + "observability", + "traceability", + "telemetry", + "distributed tracing", + "instrument my service", + "set up OTel", + "OTel pipeline", + "collector topology", + "tail sampling", + "cardinality budget", + "clock skew", + "error budget", + "burn rate alert", + "canary analysis", + "progressive delivery", + "feature flag observability", + "incident forensics", + "6-dimension localization", + "root cause across services", + "multi-tenant telemetry", + "per-tenant sampling", + "data residency telemetry", + "redact PII in logs", + "observability as code", + "dashboard as code", + "PrometheusRule CRD", + "Grafana Jsonnet", + "Perses dashboard", + "UDP MTU telemetry", + "StatsD fragmentation", + "OTLP gRPC vs HTTP", + "propagator matrix", + "BGP observability", + "QUIC observability", + "eBPF observability", + "service mesh tracing", + "zero code instrumentation", + "mobile crash analytics", + "crash-free rate", + "symbolication pipeline", + "offline telemetry queue" + ], + "ko": [ + "관측성", + "관측 가능성", + "추적성", + "추적 가능성", + "텔레메트리", + "텔레메트리 수집", + "분산 트레이싱", + "OTel 도입", + "OTel 셋업", + "OTel 계측", + "OTel 파이프라인", + "컬렉터 토폴로지", + "테일 샘플링", + "카디널리티", + "카디널리티 관리", + "클록 스큐", + "시계 드리프트", + "에러 버짓", + "에러 예산", + "번레이트 알람", + "번레이트", + "카나리 분석", + "프로그레시브 딜리버리", + "점진 배포", + "피처 플래그 관측", + "사건 부검", + "장애 부검", 
+ "장애 원인 분석", + "6차원 좁히기", + "멀티테넌트 관측", + "테넌트별 샘플링", + "데이터 거주 관측", + "로그 PII 제거", + "로그 익명화", + "로그 가명화", + "관측성 as code", + "대시보드 as code", + "대시보드 코드화", + "PrometheusRule", + "Grafana Jsonnet", + "Perses 대시보드", + "UDP MTU 튜닝", + "StatsD 단편화", + "OTLP gRPC 선택", + "전파자 매핑", + "BGP 관측", + "QUIC 관측", + "eBPF 관측", + "서비스 메시 트레이싱", + "zero-code 계측", + "모바일 크래시 분석", + "크래시 프리 레이트", + "심볼리케이션", + "오프라인 텔레메트리 큐" + ], + "ja": [ + "オブザーバビリティ", + "トレーサビリティ", + "テレメトリ", + "分散トレーシング", + "OTel導入", + "OTelパイプライン", + "コレクタ構成", + "テイルサンプリング", + "カーディナリティ予算", + "クロックスキュー", + "エラーバジェット", + "バーンレートアラート", + "カナリア分析", + "プログレッシブデリバリ", + "機能フラグ観測", + "インシデントフォレンジック", + "マルチテナント観測", + "データ居住性観測", + "ログPII除去", + "Observability as Code", + "Dashboard as Code", + "UDP MTUチューニング", + "StatsDフラグメンテーション", + "OTLP選択", + "プロパゲータマッピング", + "BGP観測", + "QUIC観測", + "eBPF観測", + "サービスメッシュトレース", + "モバイルクラッシュ分析", + "クラッシュフリーレート", + "シンボリケーション", + "オフラインテレメトリ" + ], + "zh": [ + "可观测性", + "可追溯性", + "遥测", + "分布式追踪", + "OTel 接入", + "OTel 流水线", + "采集器拓扑", + "尾采样", + "基数预算", + "时钟漂移", + "错误预算", + "燃烧率告警", + "金丝雀分析", + "渐进式发布", + "特性开关观测", + "事件取证", + "多租户观测", + "数据驻留观测", + "日志脱敏", + "可观测性即代码", + "仪表盘即代码", + "UDP MTU 调优", + "StatsD 分片", + "OTLP 选择", + "传播器映射", + "BGP 观测", + "QUIC 观测", + "eBPF 观测", + "服务网格追踪", + "零代码探针", + "移动崩溃分析", + "崩溃无事率", + "符号化", + "离线遥测队列" + ] } }, "oma-orchestrator": { @@ -1590,8 +2107,18 @@ "review loop", "mcp memory coordination" ], - "ko": ["에이전트 병렬 실행", "동시에 에이전트 돌려", "fan-out", "리뷰 루프 돌려"], - "ja": ["エージェント並列実行", "同時にエージェント", "fan-out", "レビューループ"], + "ko": [ + "에이전트 병렬 실행", + "동시에 에이전트 돌려", + "fan-out", + "리뷰 루프 돌려" + ], + "ja": [ + "エージェント並列実行", + "同時にエージェント", + "fan-out", + "レビューループ" + ], "zh": ["并行跑代理", "同时派发代理", "fan-out 任务", "评审循环"] } }, @@ -1628,8 +2155,20 @@ "scope definition", "prioritization matrix" ], - "ko": ["요구사항 정리", "스펙 문서", "우선순위 매겨줘", "스코프 정의", "제품 로드맵"], - "ja": ["要件を整理", "スペック作成", "優先度付け", "スコープ定義", "プロダクトロードマップ"], + "ko": [ + "요구사항 정리", + "스펙 문서", + "우선순위 매겨줘", + "스코프 정의", + "제품 로드맵" + ], + "ja": [ + "要件を整理", + "スペック作成", + "優先度付け", + "スコープ定義", + "プロダクトロードマップ" + ], "zh": ["梳理需求", "写规格书", "排优先级", "界定范围", "产品路线图"] } }, @@ -1647,7 +2186,12 @@ "test coverage" ], "ko": ["접근성 점검", "성능 점검", "커버리지 확인", "품질 게이트"], - "ja": ["アクセシビリティ確認", "パフォーマンス点検", "カバレッジ確認", "品質ゲート"], + "ja": [ + "アクセシビリティ確認", + "パフォーマンス点検", + "カバレッジ確認", + "品質ゲート" + ], "zh": ["无障碍检查", "性能检查", "覆盖率报告", "质量门禁"] } }, @@ -1666,8 +2210,20 @@ "transcript analysis", "multi tool recap" ], - "ko": ["오늘 한 일 정리", "하루 요약", "주간 요약", "작업 내용 정리", "대화 요약"], - "ja": ["今日の作業まとめ", "日次サマリ", "週次サマリ", "作業振り返り", "会話まとめ"], + "ko": [ + "오늘 한 일 정리", + "하루 요약", + "주간 요약", + "작업 내용 정리", + "대화 요약" + ], + "ja": [ + "今日の作業まとめ", + "日次サマリ", + "週次サマリ", + "作業振り返り", + "会話まとめ" + ], "zh": ["今天做了什么", "日报总结", "周报总结", "工作回顾", "对话总结"] } }, @@ -1685,7 +2241,12 @@ "git worktree" ], "ko": ["머지 충돌 해결", "리베이스해줘", "워크트리 써줘"], - "ja": ["マージ衝突解決", "リベースして", "リリースタグ", "worktree使って"], + "ja": [ + "マージ衝突解決", + "リベースして", + "リリースタグ", + "worktree使って" + ], "zh": ["解决合并冲突", "帮我 rebase", "打发布标签", "用 worktree"] } }, @@ -1705,8 +2266,20 @@ "library reference", "context7 docs" ], - "ko": ["검색해줘", "찾아줘", "레퍼런스 찾아", "문서 찾아줘", "라이브러리 찾아줘"], - "ja": ["検索して", "調べて", "ドキュメント探して", "ライブラリ調べて", "リファレンス探して"], + "ko": [ + "검색해줘", + "찾아줘", + "레퍼런스 찾아", + "문서 찾아줘", + "라이브러리 찾아줘" + ], + "ja": [ + "検索して", + "調べて", + "ドキュメント探して", + "ライブラリ調べて", + "リファレンス探して" + ], "zh": ["帮我查", "搜一下", "找找文档", "找个库", "查参考资料"] } }, @@ -1725,9 +2298,27 @@ "oidc setup", "cost optimization" ], - "ko": ["테라폼 플랜", "인프라 프로비저닝", 
"iac 모듈", "클라우드 리소스", "비용 최적화"], - "ja": ["terraformプラン", "インフラ構築", "iacモジュール", "クラウドリソース", "コスト最適化"], - "zh": ["terraform plan", "搭建基础设施", "iac 模块", "云资源", "成本优化"] + "ko": [ + "테라폼 플랜", + "인프라 프로비저닝", + "iac 모듈", + "클라우드 리소스", + "비용 최적화" + ], + "ja": [ + "terraformプラン", + "インフラ構築", + "iacモジュール", + "クラウドリソース", + "コスト最適化" + ], + "zh": [ + "terraform plan", + "搭建基础设施", + "iac 模块", + "云资源", + "成本优化" + ] } }, "oma-translator": { @@ -1744,10 +2335,104 @@ "multilingual content", "arb translation" ], - "ko": ["번역해줘", "번역 부탁", "다국어로", "영어로 바꿔줘", "현지화해줘"], + "ko": [ + "번역해줘", + "번역 부탁", + "다국어로", + "영어로 바꿔줘", + "현지화해줘" + ], "ja": ["翻訳して", "英訳", "多言語化", "ローカライズして", "訳して"], "zh": ["翻译一下", "帮我翻译", "多语言", "本地化", "翻成英文"] } + }, + "oma-image": { + "keywords": { + "*": [ + "nano-banana", + "nanobanana", + "gpt-image", + "pollinations", + "oma-image" + ], + "en": [ + "generate image", + "generate an image", + "create image", + "create an image", + "make a picture", + "make an image", + "render image", + "render a picture", + "draw me", + "draw a", + "ai image", + "image generation", + "generate a photo", + "create picture", + "picture of", + "image of" + ], + "ko": [ + "이미지 만들어", + "이미지 만들어줘", + "이미지 생성", + "이미지 생성해", + "이미지 생성해줘", + "사진 만들어", + "사진 만들어줘", + "그림 그려", + "그림 그려줘", + "이미지 뽑아", + "이미지 뽑아줘", + "이미지 그려줘", + "이미지 출력", + "나노바나나", + "나노 바나나", + "바나나로 뽑", + "이미지 생성기", + "ai 이미지" + ], + "ja": [ + "画像を生成", + "画像生成", + "画像を作", + "画像を作成", + "絵を描いて", + "画像出力", + "イラストを生成", + "写真を生成" + ], + "zh": [ + "生成图像", + "生成图片", + "生成一张", + "画一张", + "画一幅", + "帮我画", + "出图", + "图像生成", + "图片生成" + ], + "es": [ + "generar imagen", + "crear imagen", + "hazme una imagen", + "genera una foto" + ], + "fr": [ + "générer une image", + "créer une image", + "fais-moi une image", + "dessine-moi" + ], + "de": [ + "bild generieren", + "bild erstellen", + "erstelle ein bild", + "zeichne mir" + ] + } } }, "informationalPatterns": { @@ -1768,22 +2453,108 @@ "是什么", "とは" ], - "ko": ["뭐야", "뭐임", "무엇", "어떻게", "설명해", "알려줘", "키워드", "감지", "오탐"], - "ja": ["とは", "って何", "どうやって", "説明して", "キーワード", "検出", "誤検出"], + "ko": [ + "뭐야", + "뭐임", + "무엇", + "어떻게", + "설명해", + "알려줘", + "키워드", + "감지", + "오탐" + ], + "ja": [ + "とは", + "って何", + "どうやって", + "説明して", + "キーワード", + "検出", + "誤検出" + ], "zh": ["是什么", "什么是", "怎么", "解释", "关键词", "检测", "误报"], - "es": ["qué es", "cómo", "explica", "palabra clave", "falso positivo", "detectado"], - "fr": ["c'est quoi", "comment", "explique", "mot-clé", "faux positif", "détecté"], - "de": ["was ist", "wie", "erkläre", "schlüsselwort", "falsch positiv", "erkannt"], - "pt": ["o que é", "como", "explique", "palavra-chave", "falso positivo", "detectado"], - "ru": ["что такое", "как", "объясни", "ключевое слово", "ложное срабатывание", "обнаружено"], - "nl": ["wat is", "hoe", "leg uit", "sleutelwoord", "vals positief", "gedetecteerd"], - "pl": ["co to", "jak", "wyjaśnij", "słowo kluczowe", "fałszywy alarm", "wykryto"] + "es": [ + "qué es", + "cómo", + "explica", + "palabra clave", + "falso positivo", + "detectado" + ], + "fr": [ + "c'est quoi", + "comment", + "explique", + "mot-clé", + "faux positif", + "détecté" + ], + "de": [ + "was ist", + "wie", + "erkläre", + "schlüsselwort", + "falsch positiv", + "erkannt" + ], + "pt": [ + "o que é", + "como", + "explique", + "palavra-chave", + "falso positivo", + "detectado" + ], + "ru": [ + "что такое", + "как", + "объясни", + "ключевое слово", + "ложное срабатывание", + "обнаружено" + ], + "nl": [ + "wat is", + "hoe", + "leg uit", + "sleutelwoord", + "vals positief", + "gedetecteerd" + ], + 
"pl": [ + "co to", + "jak", + "wyjaśnij", + "słowo kluczowe", + "fałszywy alarm", + "wykryto" + ] }, "excludedWorkflows": ["tools", "stack-set", "exec-plan"], "cjkScripts": ["ko", "ja", "zh"], "extensionRouting": { - "frontend-engineer": ["tsx", "jsx", "css", "scss", "less", "vue", "svelte", "html"], - "backend-engineer": ["go", "py", "java", "rs", "rb", "php", "controller", "service", "resolver"], + "frontend-engineer": [ + "tsx", + "jsx", + "css", + "scss", + "less", + "vue", + "svelte", + "html" + ], + "backend-engineer": [ + "go", + "py", + "java", + "rs", + "rb", + "php", + "controller", + "service", + "resolver" + ], "db-engineer": ["sql", "prisma", "graphql", "migration"], "mobile-engineer": ["dart", "swift", "kt", "xib", "storyboard"], "designer": ["figma", "sketch", "svg"] diff --git a/.agents/hooks/core/types.ts b/.agents/hooks/core/types.ts index f9bf420..2b79035 100644 --- a/.agents/hooks/core/types.ts +++ b/.agents/hooks/core/types.ts @@ -1,8 +1,8 @@ // Claude Code Hook Types for oh-my-agent // Shared across Claude Code, Codex CLI, Cursor, Gemini CLI, and Qwen Code -import { existsSync } from "node:fs" -import { dirname, join } from "node:path" +import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; // --- Project Root Resolution --- @@ -12,52 +12,55 @@ import { dirname, join } from "node:path" * (e.g. packages/i18n during a build) from creating state files * in the wrong location. */ -const MAX_DEPTH = 20 +const MAX_DEPTH = 20; export function resolveGitRoot(startDir: string): string { - let dir = startDir + let dir = startDir; for (let i = 0; i < MAX_DEPTH; i++) { - if (existsSync(join(dir, ".git"))) return dir - const parent = dirname(dir) - if (parent === dir) return startDir - dir = parent + if (existsSync(join(dir, ".git"))) return dir; + const parent = dirname(dir); + if (parent === dir) return startDir; + dir = parent; } - return startDir + return startDir; } // --- Vendor Detection --- -export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen" +export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen"; // --- Hook Input (unified) --- export interface HookInput { - prompt?: string - sessionId?: string - session_id?: string - hook_event_name?: string - cwd?: string - workspace_roots?: string[] + prompt?: string; + sessionId?: string; + session_id?: string; + hook_event_name?: string; + cwd?: string; + workspace_roots?: string[]; // Gemini: AfterAgent fields - prompt_response?: string - stop_hook_active?: boolean + prompt_response?: string; + stop_hook_active?: boolean; // Claude/Qwen: Stop fields - stopReason?: string + stopReason?: string; } // --- Hook Output Builders --- -export function makePromptOutput(vendor: Vendor, additionalContext: string): string { +export function makePromptOutput( + vendor: Vendor, + additionalContext: string, +): string { switch (vendor) { case "claude": - return JSON.stringify({ additionalContext }) + return JSON.stringify({ additionalContext }); case "codex": return JSON.stringify({ hookSpecificOutput: { hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "cursor": return JSON.stringify({ additionalContext, @@ -66,14 +69,14 @@ export function makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "gemini": return JSON.stringify({ hookSpecificOutput: { hookEventName: "BeforeAgent", additionalContext, }, - }) + }); case "qwen": // Qwen Code fork uses hookSpecificOutput (same as Codex) 
return JSON.stringify({ @@ -81,7 +84,7 @@ export function makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); } } @@ -91,22 +94,25 @@ export function makeBlockOutput(vendor: Vendor, reason: string): string { case "codex": case "cursor": case "qwen": - return JSON.stringify({ decision: "block", reason }) + return JSON.stringify({ decision: "block", reason }); case "gemini": // Gemini AfterAgent uses "deny" to reject response and force retry - return JSON.stringify({ decision: "deny", reason }) + return JSON.stringify({ decision: "deny", reason }); } } // --- PreToolUse Output Builder --- -export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, unknown>): string { +export function makePreToolOutput( + vendor: Vendor, + updatedInput: Record<string, unknown>, +): string { switch (vendor) { case "gemini": return JSON.stringify({ decision: "rewrite", tool_input: updatedInput, - }) + }); case "cursor": return JSON.stringify({ updated_input: updatedInput, @@ -114,7 +120,7 @@ export function makePreToolOutput(vendor: Vendor, updatedInput: Record agent_cli_mapping > default_cli > cli-config.yaml's active_vendor > gemini - -# Response language setting (ko, en, ja, zh, ...) +# Migrated by oma migration 008 — model_preset single-file config language: ko - -# Date/time format date_format: ISO timezone: Asia/Seoul - -# Default CLI (for single tasks) -default_cli: gemini - -# Per-agent CLI mapping (multi-CLI mode) -agent_cli_mapping: - frontend: gemini - backend: gemini - mobile: gemini - qa: gemini - debug: gemini - pm: gemini +model_preset: gemini-only diff --git a/.agents/skills/_version.json b/.agents/skills/_version.json index 873bb87..22a4630 100644 --- a/.agents/skills/_version.json +++ b/.agents/skills/_version.json @@ -1,3 +1,3 @@ { - "version": "5.13.0" + "version": "6.3.1" } \ No newline at end of file diff --git a/.agents/skills/oma-backend/SKILL.md b/.agents/skills/oma-backend/SKILL.md index e980300..c102b77 100644 --- a/.agents/skills/oma-backend/SKILL.md +++ b/.agents/skills/oma-backend/SKILL.md @@ -66,10 +66,10 @@ Router (HTTP) → Service (Business Logic) → Repository (Data Access) → Mode ## Stack-Specific Reference -- Tech stack & libraries: `stack/tech-stack.md` +- **Stack manifest (SSOT)**: `stack/stack.yaml` — structured declaration (`language`, `framework`, `orm`) and `verify:` contract consumed by `oma verify backend`. Schema: `variants/stack.schema.json`. +- Tech stack narrative: `stack/tech-stack.md` — human-readable reference only; `stack.yaml` wins on conflict. - Code snippets (copy-paste ready): `stack/snippets.md` - API template: `stack/api-template.*` -- Stack config: `stack/stack.yaml` ## How to Execute diff --git a/.agents/skills/oma-image/SKILL.md b/.agents/skills/oma-image/SKILL.md new file mode 100644 index 0000000..79a806f --- /dev/null +++ b/.agents/skills/oma-image/SKILL.md @@ -0,0 +1,170 @@ +--- +name: oma-image +description: Multi-vendor AI image generation with authentication-aware parallel dispatch. Routes to Codex (gpt-image-2 via ChatGPT OAuth) and Pollinations (flux/zimage, free with signup). Gemini provider is present but disabled by default (requires billing). Use for image generation, image creation, visual asset generation, and AI art.
+--- + +# Image Agent - Multi-Vendor Image Router + +## When to use + +- Generating images, visual assets, illustrations, product photos, concept art +- Comparing output between multiple image models for the same prompt +- Producing images from prompts within editor workflows (Claude Code, Codex, Gemini CLI) +- Other skills needing image generation infrastructure (shared invocation) + +## When NOT to use + +- Editing an existing image or photo manipulation -> out of scope +- Generating videos or audio -> out of scope +- Inline vector art / SVG composition from structured data -> use a templating skill +- Simple asset resizing or format conversion -> use a dedicated image library + +## Core Rules + +1. **Clarify before invoking** — if the user's request is ambiguous about subject, style, composition, or usage context, **ask the user first** or **amplify the prompt explicitly** (showing the user the expanded version for approval). Do NOT silently generate from a vague prompt. See `Clarification Protocol` below. +2. **Authentication-aware dispatch** — detect which vendor CLIs are authenticated and run only those; with `--vendor all`, every requested vendor must be available (strict). +3. **Cost guardrail** — confirm before executing runs whose estimated cost is ≥ `$0.20` (configurable). `--yes` / `OMA_IMAGE_YES=1` bypass. Default vendor `pollinations` (flux/zimage) is free, so auto-triggering on keywords is safe. +4. **Path safety** — output paths outside `$PWD` require `--allow-external-out`. +5. **Cancellable** — SIGINT/SIGTERM aborts in-flight provider calls and the orchestrator. +6. **Deterministic outputs** — every run writes `manifest.json` next to the images for reproducibility. +7. **Max `n` = 5** — wall-time bound. +8. **Exit codes align with `oma search fetch`** (0, 1, 2=safety, 3=not-found, 4=invalid-input, 5=auth-required, 6=timeout). + +## Clarification Protocol + +Before invoking `oma image generate`, the calling agent runs this checklist against the user's request. **If any answer is "no / unknown", clarify with the user first.** + +**Required signal (must be present or inferable):** +- [ ] **Subject** — what is the primary thing in the image? (object, person, scene) +- [ ] **Setting / backdrop** — where is it? (context, environment) + +**Strongly recommended (ask if absent AND not inferable from context):** +- [ ] **Style** — photorealistic, illustration, 3D render, oil painting, concept art, flat vector, …? +- [ ] **Mood / lighting** — bright vs moody, warm vs cool, dramatic vs minimal +- [ ] **Usage context** — hero image, icon, thumbnail, product shot, poster? (dictates aspect ratio + composition) +- [ ] **Aspect ratio** — square (`1024x1024`), portrait (`1024x1536`), landscape (`1536x1024`)? + +**Amplification shortcut.** For brief prompts (e.g. "a red apple"), do not pop clarifying questions if the request is genuinely that simple — instead **amplify inline and show the user** the expanded version before invoking: + +> User: "a red apple" +> Agent: "I'll generate this as: *a single glossy red apple centered on a clean white background, soft studio lighting, photorealistic, shallow depth of field, 1024×1024*. Shall I proceed, or would you like a different style/composition?" + +Skip both clarification and amplification when the user has clearly authored a full creative brief (≥ 2 of: subject + style + lighting + composition). Respect their prompt verbatim. 
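The decision order above can be expressed as a small routine. A minimal sketch in TypeScript, assuming the agent has already extracted boolean signals from the request — `PromptSignals` and `decideNextStep` are illustrative names, not part of the oma CLI:

```
// Illustrative only — mirrors the Clarification Protocol checklist above.
interface PromptSignals {
  subject: boolean;     // required: the primary thing in the image
  setting: boolean;     // required: backdrop / environment
  style: boolean;       // recommended
  lighting: boolean;    // recommended
  composition: boolean; // recommended
}

type NextStep = "invoke-verbatim" | "clarify" | "amplify";

function decideNextStep(s: PromptSignals): NextStep {
  // Full creative brief (>= 2 of subject + style + lighting + composition):
  // respect the prompt verbatim.
  const brief = [s.subject, s.style, s.lighting, s.composition].filter(Boolean).length;
  if (brief >= 2) return "invoke-verbatim";
  // Required signals missing and not inferable: ask the user first.
  if (!s.subject || !s.setting) return "clarify";
  // Otherwise expand inline and show the expanded version for approval.
  return "amplify";
}
```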
+ +**Category-specific briefs** (app mockup, poster, thumbnail, infographic, comic panel, avatar): consult `resources/prompt-tips.md` → *External Prompt Libraries*. + +**Output language.** Generation prompts are sent to the provider in English (image models are trained predominantly on English captions). Translate the user's request if they wrote in another language, and show them the translated version during amplification so they can correct misreadings. + +## Vendors + +This skill follows oh-my-agent's CLI-first concept: whenever a vendor's native CLI can drive generation (and return raw bytes), the subprocess path is preferred over direct API keys. Direct API is only used as a fallback for vendors whose CLI can't yet emit raw image bytes. + +| Vendor | Strategy | Models | Trigger | +|--------|----------|--------|---------| +| `codex` | CLI-first — `codex exec` via ChatGPT OAuth (`codex login`), built-in `image_gen` | `gpt-image-2` | Logged in via Codex CLI (no API key) | +| `pollinations` | Direct HTTP — `gen.pollinations.ai/v1/images/generations` (free signup for key) | Free: `flux`, `zimage`. Credit-gated: `qwen-image`, `wan-image`, `gpt-image-2`, `klein`, `kontext`, `gptimage`, `gptimage-large` | `POLLINATIONS_API_KEY` set (free at https://enter.pollinations.ai). No native CLI exists. | +| `gemini` | CLI-first fallback → direct API. `gemini -p` (stream) is the preferred path but currently disabled at precheck (CLI's agentic loop does not return raw `inlineData` bytes on stdout as of Gemini CLI 0.38). Until the CLI exposes a non-agentic image surface, the provider falls back to the direct `generativelanguage.googleapis.com` API. | `gemini-2.5-flash-image`, `gemini-3.1-flash-image-preview` | Preferred: `gemini auth login`. Fallback: `GEMINI_API_KEY` + billing. | + +## Invocation + +### Standalone + +``` +/oma-image a red apple on white background +/oma-image --vendor all --size 1536x1024 jeju coastline at sunset +/oma-image -n 3 --quality high --out ./hero "minimalist dashboard hero illustration" +``` + +### Shell CLI + +``` +oma image generate "<prompt>" [--vendor auto|codex|pollinations|gemini|all] [-n 1..5] \ + [--size 1024x1024|1024x1536|1536x1024|auto] \ + [--quality low|medium|high|auto] \ + [--out <dir>] [--allow-external-out] \ + [-r <path>]... \ + [--timeout 180] [-y] [--no-prompt-in-manifest] \ + [--dry-run] [--format text|json] +oma image doctor +oma image list-vendors +``` + +Gemini-only escalation flag: `--strategy mcp,stream,api` (overrides `vendors.gemini.strategies`). + +### Reference Images (`-r`, `--reference`) + +Attach up to 10 reference images (PNG/JPEG/GIF/WebP, ≤ 5MB each) to guide style, subject identity, or composition. Repeatable or comma-separated. + +``` +oma image generate -r ~/Downloads/otter.jpeg "same otter in dramatic lighting" +oma image generate -r a.png -r b.png "blend these two styles" +``` + +Supported vendors: + +| Vendor | Support | How | +|--------|---------|-----| +| `codex` (gpt-image-2) | ✅ | Passes `-i <path>` to `codex exec` | +| `gemini` (2.5-flash-image) | ✅ | Inlines base64 `inlineData` parts in request | +| `pollinations` | ❌ | Rejected with exit code 4 (requires URL hosting; see PR #2 roadmap) | + +**Paths**: absolute or relative to `$CWD`.
Host CLIs usually expose attached images via: +- **Claude Code**: `~/.claude/image-cache/<session>/N.png` (surfaced in system messages as `[Image: source: <path>]`) +- **Antigravity**: workspace upload directory (exact path shown in IDE) +- **Codex CLI as host**: user must pass the filesystem path explicitly; in-conversation attachments are not forwarded + +### Agent Behavior: Auto-forward Attached References (MANDATORY) + +When ALL of the following are true, the calling agent MUST pass the attached image via `--reference <path>` automatically. Never describe the image in prose as a workaround. + +1. The user asks to generate or edit an image (referencing the attached one by phrases like "이거" ("this"), "this image", "same style as this", "이 수달" ("this otter"), etc.). +2. A host-surfaced attached image is visible to the agent — e.g. a Claude Code system message with `[Image: source: <path>]`, or an Antigravity workspace upload path, or an explicit filesystem path in the user's message. +3. The selected vendor supports references (`codex` or `gemini`). + +**Required action**: invoke `oma image generate --reference <path> --vendor <vendor> "<prompt>"`. If the user didn't specify a vendor, default to `codex` (CLI-first, widest availability). Do NOT: + +- Fall back to prose description ("I'll describe the otter's appearance..."). +- Ask the user to re-type or re-attach the path. +- Claim the CLI doesn't support references without first running `oma image generate --help` to verify. + +**If the local CLI is outdated** (`--reference` is missing from `--help`): tell the user to run `oma update` once, then retry. Do not silently degrade to prose. + +**If the reference path is from Claude Code's `image-cache`**: note to the user that the path is session-scoped and suggest copying the file to a durable location if they want to reuse it later. Still proceed with the generation. + +### Shared Infrastructure (from other skills) + +Other skills call `oma image generate --format json` and parse the JSON manifest from stdout (see the consumer sketch at the end of the Configuration section below). + +## Output Layout + +``` +.agents/results/images/ +├── 20260424-143052-ab12cd/ # single-vendor run +│ └── pollinations-flux.jpg +│ (or codex-gpt-image-2.png) +│ manifest.json +└── 20260424-143122-7z9kqw-compare/ # --vendor all run + ├── codex-gpt-image-2.png + ├── pollinations-flux.jpg + └── manifest.json +``` + +## How to Execute + +Follow `resources/execution-protocol.md` step by step. +See `resources/vendor-matrix.md` for strategy precheck rules. +Use `resources/prompt-tips.md` for writing effective prompts. +Before submitting, run `resources/checklist.md`. + +## Configuration + +Project-specific settings: `config/image-config.yaml`. +Env vars: `OMA_IMAGE_DEFAULT_VENDOR`, `OMA_IMAGE_DEFAULT_OUT`, `OMA_IMAGE_YES`, `POLLINATIONS_API_KEY`, `GEMINI_API_KEY`, `OMA_IMAGE_GEMINI_STRATEGIES`.
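For the shared-infrastructure path, a downstream caller might look like the sketch below. It assumes Bun (which the oma hooks already use); only the `exitCode`, `manifestPath`, and `runs` fields are documented, and `generateHeroImage` is a hypothetical consumer, not an existing helper:

```
// Hypothetical downstream consumer of `oma image generate --format json`.
interface ImageRunOutput {
  exitCode: number;
  manifestPath: string;
  runs: unknown[]; // per-run records; see manifest schema in vendor-matrix.md
}

export function generateHeroImage(prompt: string): ImageRunOutput | null {
  const proc = Bun.spawnSync([
    "oma", "image", "generate", prompt,
    "--vendor", "auto",
    "--format", "json",
    "-y", // skip the cost-guardrail confirmation in unattended runs
  ]);
  // Per the exit-code table, 0 means at least one run succeeded.
  if (proc.exitCode !== 0) return null;
  // Status lines go to stderr; stdout carries exactly one JSON object.
  return JSON.parse(proc.stdout.toString()) as ImageRunOutput;
}
```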
+ +## References + +- Execution steps: `resources/execution-protocol.md` +- Vendor matrix: `resources/vendor-matrix.md` +- Prompt tips: `resources/prompt-tips.md` +- Checklist: `resources/checklist.md` +- Context loading: `../_shared/core/context-loading.md` diff --git a/.agents/skills/oma-image/config/image-config.yaml b/.agents/skills/oma-image/config/image-config.yaml new file mode 100644 index 0000000..eb68164 --- /dev/null +++ b/.agents/skills/oma-image/config/image-config.yaml @@ -0,0 +1,47 @@ +default_output_dir: .agents/results/images +default_vendor: auto +default_size: 1024x1024 +default_quality: auto +default_count: 1 +default_timeout_sec: 180 + +vendors: + codex: + enabled: true + model: gpt-image-2 + extra_args: [] + gemini: + # Disabled by default — Gemini image models require billing on AI Studio + # or Vertex AI. Flip to true after enabling billing + setting GEMINI_API_KEY. + enabled: false + model: gemini-2.5-flash-image + strategies: + - mcp + - stream + - api + pollinations: + enabled: true + model: flux + +cost_guardrail: + estimate_threshold_usd: 0.20 + per_image_usd: + codex: + gpt-image-2: + low: 0.02 + medium: 0.03 + high: 0.04 + auto: 0.03 + gemini: + gemini-2.5-flash-image: + low: 0.04 + medium: 0.04 + high: 0.04 + auto: 0.04 + +compare: + folder_pattern: "{timestamp}-{shortid}-compare" + manifest: true + +naming: + single_folder_pattern: "{timestamp}-{shortid}" diff --git a/.agents/skills/oma-image/resources/checklist.md b/.agents/skills/oma-image/resources/checklist.md new file mode 100644 index 0000000..f30fd91 --- /dev/null +++ b/.agents/skills/oma-image/resources/checklist.md @@ -0,0 +1,17 @@ +# Checklist — before you run `oma image generate` + +- [ ] Prompt is specific about scene, subject, and style (see `prompt-tips.md`). +- [ ] `--vendor` matches available authenticated CLIs. Run `oma image doctor` if unsure. +- [ ] `-n` is ≤ 5; wall time scales with count. +- [ ] `--out` is inside the project, or you've set `--allow-external-out`. +- [ ] Estimated cost is acceptable. Run `--dry-run` first for unfamiliar combinations. +- [ ] Secrets are not in the prompt, or `--no-prompt-in-manifest` is set. +- [ ] For `--vendor all`, every enabled vendor is healthy (strict mode exits 5 otherwise). + +# Checklist — after the run + +- [ ] `manifest.json` was written inside the run folder. +- [ ] Each recorded run has an `ok` status or a classified error. +- [ ] Strategy attempts are objects (not compact strings). +- [ ] Images open without corruption. +- [ ] If results are consumed downstream, the consumer parses `--format json` stdout rather than re-reading the manifest file. diff --git a/.agents/skills/oma-image/resources/execution-protocol.md b/.agents/skills/oma-image/resources/execution-protocol.md new file mode 100644 index 0000000..2a454e3 --- /dev/null +++ b/.agents/skills/oma-image/resources/execution-protocol.md @@ -0,0 +1,111 @@ +# Image Agent - Execution Protocol + +## Step -1: Clarify / Amplify Prompt (agent-side, before `oma image generate`) + +Run the **Clarification Protocol** in `SKILL.md` before shelling out. + +## Step 0: Parse Request + +1. Extract prompt and flags from the invocation. +2. Resolve defaults from `config/image-config.yaml` → env vars → CLI flags (lowest to highest precedence). +3. Validate: + - `count` ∈ [1, 5] + - `size` ∈ {`1024x1024`, `1024x1536`, `1536x1024`, `auto`} + - `quality` ∈ {`low`, `medium`, `high`, `auto`} + - `vendor` ∈ {`auto`, `codex`, `pollinations`, `gemini`, `all`} or a concrete registered name. 
+ - `reference` (if any): each path exists, is a regular file ≤ 5MB, magic-byte-matches PNG/JPEG/GIF/WebP, ≤ 10 total, and duplicate paths are rejected with exit 4. +4. If invalid: exit code 4 and a message identifying the offending field. + +## Step 0.5: Reference Image Handling + +When `--reference <path>` is supplied: + +1. Validate every path via `reference-guard.ts`. On failure → exit 4. +2. Reject the request if the selected vendor(s) do not support references (currently only `codex` and `gemini`). Pollinations returns exit 4 with a hint to switch vendor. +3. Pass validated absolute paths through `GenerateInput.referenceImages`: + - `codex` provider appends `-i <path>` per reference to `codex exec` and adds a guidance sentence to the instruction text. + - `gemini` api strategy reads each file, base64-encodes it, and prepends `{ inlineData: { mimeType, data } }` parts before the text prompt. +4. Record reference paths in `manifest.json` under `reference_images` (top-level array of absolute paths). + +### Auto-forward attached images (MANDATORY) + +If the user asks to generate/edit an image AND a host-attached image is visible to the agent (e.g. `[Image: source: <path>]` in a Claude Code system message, Antigravity workspace upload, or explicit user-provided path), the agent MUST pass it via `--reference <path>`. Do not fall back to describing the image in prose. Do not ask the user to re-type the path. If `oma image generate --help` shows no `--reference` flag, instruct the user to run `oma update` and retry — do not silently degrade. + +### Host-Specific Reference Paths + +Agents invoking `oma image generate --reference` should surface the following host-specific locations to the user: + +| Host CLI | Attachment Surface | Path pattern | +|----------|--------------------|--------------| +| **Claude Code** | `[Image: source: ...]` in system messages | `~/.claude/image-cache/<session>/<n>.png` (undocumented, verified empirically; cache is cleared on session end) | +| **Antigravity IDE** | Workspace upload via "Upload to Agent" | Project workspace upload dir; exact path shown in IDE file tree | +| **Codex CLI as host** | `-i` flag attaches to LLM context only | No filesystem path exposed. User must provide an explicit path (e.g., `~/Downloads/foo.png`). In-conversation pastes cannot be forwarded. | +| **Gemini CLI as host** | Varies by version | Prefer explicit paths over paste | + +Agents should prefer user-supplied explicit paths (e.g., `~/Downloads/otter.jpeg`) over host-cache paths when durability across sessions matters. + +## Step 1: Vendor Selection + +1. Call `health()` on every registered provider in parallel. +2. Classify: + - `healthy` — `ok: true` + - `unhealthy` — `ok: false` with a hint +3. Decide based on `--vendor`: + - `auto`: continue with every `healthy` provider. If zero → exit 5. + - `all`: every provider must be healthy. Any missing → exit 5 naming the specific vendor. + - `<name>`: resolve the named provider. If unhealthy → exit 5 with its hint. +4. Log `using: <vendor list>` to stderr before generation. + +## Step 2: Cost Guardrail + +1. Estimate cost as `sum(per_image_usd[vendor][model][quality] × count)` over all selected vendors (e.g. `codex`/`gpt-image-2` at `high` quality with `-n 3` → 3 × $0.04 = $0.12, under the default `$0.20` threshold). +2. If `--dry-run`: print the plan (vendors, counts, outDir, cost) and exit 0. +3. If estimate ≥ `cost_guardrail.estimate_threshold_usd` and not `--yes`/`OMA_IMAGE_YES=1`: + - Prompt user on stderr: `Estimated cost $X.XX. Proceed? (y/N)` + - Decline → exit 1. + +## Step 3: Cancellation Setup + +1. Install `SIGINT`/`SIGTERM` handlers that call `AbortController.abort()`. +2.
+## Step 4: Dispatch
+
+- **Single vendor** — run `provider.generate(input)` sequentially.
+- **Multi-vendor (`all` or `auto` with 2+ healthy)** — `Promise.allSettled` across providers.
+- Providers with sub-strategies escalate internally (e.g. Gemini: `mcp → stream → api`). Record every strategy attempt (ok/skipped/failed with reason).
+- Non-retryable errors (safety-refused, invalid-input) short-circuit the escalation chain.
+
+## Step 5: Write Artifacts
+
+1. Save each image to `outDir/<vendor>-<n>[-<suffix>].png`.
+2. Build `manifest.json` with schema version 1 (see `vendor-matrix.md` for fields).
+3. If `--no-prompt-in-manifest` is set, replace `prompt` with `prompt_sha256`.
+
+## Step 6: Report
+
+1. For each run, print a one-line status to stderr:
+ - `[oma image] ok (Xs) -> <path>`
+ - `[oma image] failed (<error-kind>): <message>`
+2. Print the manifest path.
+3. For `--format json`: write `{exitCode, manifestPath, runs}` to stdout as one JSON object.
+
+## Step 7: Exit Code Aggregation
+
+- Any successful run in parallel mode → exit 0 (failures still in manifest).
+- All failures → pick the most specific exit code:
+ - `safety-refused` → 2
+ - `invalid-input` → 4
+ - `auth-required` / `not-installed` → 5
+ - `timeout` → 6
+ - otherwise → 1
+
+A sketch of this aggregation follows the error table below.
+
+## On Error
+
+| Situation | Action |
+|-----------|--------|
+| No vendors authenticated | Exit 5, print `Run: oma image doctor` |
+| Specific vendor unhealthy | Exit 5 with the vendor's setup guide (URL + env var + steps, rendered by `oma image doctor`) |
+| All sub-strategies failed for a provider | Exit 1 with last classified error; include `strategy_attempts` in manifest |
+| Timeout | Exit 6, manifest records `after_ms` |
+| Cancelled (Ctrl+C) | Exit 130 (signal); no manifest if abort was pre-write |
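+
+A sketch of the Step 7 aggregation, using the error kinds from the tables above and in `vendor-matrix.md` (the `RunOutcome` shape is illustrative, not the shipped type):
+
+```typescript
+type ErrorKind =
+  | "safety-refused" | "invalid-input" | "auth-required"
+  | "not-installed" | "timeout" | "rate-limit" | "network" | "other";
+
+// Illustrative shape; the real runner carries richer per-run records.
+interface RunOutcome { ok: boolean; errorKind?: ErrorKind }
+
+function aggregateExitCode(runs: RunOutcome[]): number {
+  if (runs.some((r) => r.ok)) return 0; // any success wins in parallel mode
+  const kinds = new Set(runs.map((r) => r.errorKind));
+  // Most specific code first, matching the Step 7 precedence.
+  if (kinds.has("safety-refused")) return 2;
+  if (kinds.has("invalid-input")) return 4;
+  if (kinds.has("auth-required") || kinds.has("not-installed")) return 5;
+  if (kinds.has("timeout")) return 6;
+  return 1;
+}
+```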
diff --git a/.agents/skills/oma-image/resources/prompt-tips.md b/.agents/skills/oma-image/resources/prompt-tips.md
new file mode 100644
index 0000000..0c3548f
--- /dev/null
+++ b/.agents/skills/oma-image/resources/prompt-tips.md
@@ -0,0 +1,71 @@
+# Prompt Tips
+
+Good prompts are specific about scene, subject, lighting, and style. Both `gpt-image-2` (codex) and `gemini-2.5-flash-image` (gemini) respond well to the same structural cues.
+
+## Structure
+
+```
+Scene/backdrop → Subject → Details → Constraints
+```
+
+Example: `minimalist product photography: single white ceramic coffee cup on dark marble surface, steam rising softly, dramatic side lighting`
+
+## Do's
+
+- **Lighting**: "warm golden hour side light", "overcast diffused", "backlit with rim light"
+- **Camera**: "shallow depth of field", "aerial view", "close-up macro", "35mm film grain"
+- **Style**: "photorealistic", "oil painting", "3D render", "concept art", "isometric vector"
+- **Mood**: "serene", "dramatic", "moody", "vibrant", "washed-out"
+- **Resolution cue**: "ultra detailed", "8K", "high fidelity"
+
+## Don'ts
+
+- Avoid vague prompts like "a nice picture" — both models produce generic output.
+- Don't stack contradictory styles ("photorealistic cel-shaded 3D").
+- Skip negative prompts — neither `gpt-image-2` nor `gemini-2.5-flash-image` treats them as first-class.
+
+## Examples
+
+| Category | Prompt |
+|----------|--------|
+| Product | `Elegant perfume bottle on reflective black surface, studio lighting, luxury brand catalog style` |
+| Landscape | `Aerial drone shot of Jeju coastline, turquoise water meeting volcanic rock, golden hour` |
+| Food | `Overhead flat-lay of Korean bibimbap in stone pot, steam rising, vibrant vegetables, dark wood table` |
+| Architecture | `Modern minimalist house with floor-to-ceiling windows overlooking misty mountain valley` |
+| Portrait | `Professional headshot, soft natural window light, shallow depth of field, neutral background` |
+| UI Mockup | `iPhone 15 Pro mockup showing a fitness app dashboard, clean UI, dark mode, floating on gradient background` |
+| Concept | `Lone astronaut on crater edge of Mars, looking at Earth rising on the horizon, cinematic, volumetric dust` |
+
+## Vendor Nuances
+
+- **Codex (`gpt-image-2`)** — prefers slightly longer, descriptive prompts. Quality flag `high` noticeably sharpens fine detail but doubles generation time.
+- **Gemini (`gemini-2.5-flash-image`)** — robust on CJK-composed scenes ("hanok", "cherry blossom"); slightly stronger on illustrated/painterly styles.
+
+## Comparing Output
+
+`--vendor all` runs the same prompt on both providers and writes both PNGs into a `…-compare/` folder with a single `manifest.json`. Use it for A/B picking when starting a new visual style.
+
+## External Prompt Libraries (agent reference only)
+
+When the **Clarification Protocol** (see `SKILL.md`) reaches the *amplify* step, these galleries provide structural references.
+
+| Source | Vendor match | Prompts | Categories |
+|--------|--------------|---------|-----------|
+| [awesome-gpt-image-2](https://github.com/YouMind-OpenLab/awesome-gpt-image-2) | `codex` (gpt-image-2) | ~100 | Profile/Avatar · Social Post · Infographic · YouTube Thumbnail · Comic/Storyboard · Poster/Flyer · App/Web Design |
+| [awesome-nano-banana-pro-prompts](https://github.com/YouMind-OpenLab/awesome-nano-banana-pro-prompts) | `gemini` (nano-banana-pro) | 10,000+ | Same taxonomy; 16 localized READMEs (en, ko-KR, ja-JP, zh-TW, de-DE, …) |
+
+For `pollinations` (flux / zimage), patterns from either library transfer — use the subject framing, lighting cues, and style-keyword structure, not the literal wording.
+
+**How to use (agent-side):**
+
+1. Classify the user's intent into one of the 7 categories. If no match, fall back to the *Scene/backdrop → Subject → Details → Constraints* template at the top of this file.
+2. Fetch the relevant README section via `gh api` — e.g. Korean user + nano-banana:
+   ```
+   gh api repos/YouMind-OpenLab/awesome-nano-banana-pro-prompts/contents/README_ko-KR.md \
+     --jq .content | base64 -d | less
+   ```
+   Scan headings like `### No. N: <title> - <category>` to locate 1–2 analogous entries.
+3. **Internalize the pattern, do not reproduce the text.** Extract lighting / framing / camera / style-keyword choices. The structural approach is not copyrightable; the expression is.
+4. Compose your amplified prompt by applying that pattern to the user's subject in your own wording. Show the result to the user for approval before invoking `oma image generate`.
+
+**Fallback:** If no category matches, use the structural template at the top of this file.
diff --git a/.agents/skills/oma-image/resources/vendor-matrix.md b/.agents/skills/oma-image/resources/vendor-matrix.md new file mode 100644 index 0000000..0065cae --- /dev/null +++ b/.agents/skills/oma-image/resources/vendor-matrix.md @@ -0,0 +1,83 @@ +# Vendor Matrix + +## Reference Image Support (`--reference` / `-r`) + +| Vendor | Reference input | Transport | Notes | +|--------|-----------------|-----------|-------| +| `codex` | ✅ | `codex exec -i <path>` (repeatable) | Local file path; 5MB-per-file cap enforced by Codex CLI | +| `gemini` | ✅ | `inlineData` parts (base64) prepended to text prompt | Up to 14 refs supported by `gemini-2.5-flash-image`; OMA caps at 10 | +| `pollinations` | ❌ | — | Requires URL hosting; rejected with exit 4. Planned for PR #2. | + +All paths are validated in `reference-guard.ts` (magic-byte MIME check + size + count + duplicate rejection) before dispatch. The magic-byte-detected MIME is threaded through `GenerateInput.referenceImages` and used verbatim at the vendor API boundary — file extension is never trusted for MIME type. + +## Codex + +| Field | Value | +|-------|-------| +| Binary | `codex` (npm: `@openai/codex`) | +| Auth | OAuth via `codex login` | +| Health check | `codex login status` output contains "Logged in" | +| Model | `gpt-image-2` | +| Transport | `codex exec "<instruction>"` — internal bridge invokes `image_gen` tool | +| Image location | `~/.codex/generated_images/<session>/ig_*.png` → copied to `outDir` | +| Sizes | `1024x1024`, `1024x1536`, `1536x1024` | +| Qualities | `low`, `medium`, `high`, `auto` | + +Codex requires `--skip-git-repo-check` for invocation inside a git worktree; this is inherited from the upstream `codex-image` skill and is a known dependency of the Codex CLI image path. + +## Gemini + +Strategies are tried in order from `vendors.gemini.strategies` (default `mcp → stream → api`). Each strategy has a `precheck()` that returns `{ ok, reason? }` — a failed precheck is recorded as `skipped` and the runner continues. + +### α — mcp + +| Field | Value | +|-------|-------| +| Requirement | `mcp-genmedia` MCP server wired into the Gemini CLI | +| Precheck | `OMA_IMAGE_GEMINI_MCP=1` (explicit opt-in) | +| Model | vendor model (default `gemini-2.5-flash-image`) | +| Status | v1: precheck scaffold only — full implementation deferred to P1+ | + +### β — stream (disabled) + +| Field | Value | +|-------|-------| +| Status | **Disabled at precheck** as of Gemini CLI 0.38 | +| Reason | `gemini -p` always runs the full agent loop. Its `stream-json` output contains `init`/`message`/`tool_use`/`tool_result` events — not raw `inlineData` image bytes. Asked to generate an image, `gemini` tries to invoke image-generation tools itself (including ironically this very `oma-image` skill), rather than returning bytes on stdout. | +| Re-enable | When Gemini CLI exposes a non-agentic image surface, update `geminiStreamStrategy.precheck()` to return `{ ok: true }` and reuse the existing parser (`extractImageFromStream`) which is still unit-tested. | +| Parser kept | `extractImageFromStream` remains in place + tested so the code path can be unbricked quickly when the CLI surface changes. 
| + +### γ — api + +| Field | Value | +|-------|-------| +| Requirement | `GEMINI_API_KEY` env var | +| Transport | `POST https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key=…` | +| Parsing | First candidate part with `inlineData.data` | +| Retry | 429 → throw `rate-limit` with `Retry-After` header | + +## Strategy Attempt Record + +Manifest field `strategy_attempts` is always an array of objects: + +``` +{ "strategy": "mcp" | "stream" | "api", + "status": "ok" | "skipped" | "failed", + "reason"?: string, + "duration_ms"?: number } +``` + +The last successful strategy also appears on the run as `strategy`. + +## Error Classification + +| Error kind | Retry policy | Exit code when solo | +|------------|-------------|---------------------| +| `not-installed` | fail (the vendor skips the strategy via precheck before this kind appears) | 5 | +| `auth-required` | fail; printed hint tells user how to authenticate | 5 | +| `invalid-input` | fail; surfaces validation problems from the provider | 4 | +| `safety-refused` | short-circuit — no fallback to other strategies | 2 | +| `rate-limit` | record attempt as failed; continue to next strategy | 1 (if no vendor succeeded) | +| `timeout` | record attempt as failed; continue | 6 | +| `network` | `retryable=true` → continue; `false` → record and continue | 1 | +| `other` | continue through remaining strategies | 1 | diff --git a/.agents/skills/oma-observability/SKILL.md b/.agents/skills/oma-observability/SKILL.md new file mode 100644 index 0000000..26f23bb --- /dev/null +++ b/.agents/skills/oma-observability/SKILL.md @@ -0,0 +1,225 @@ +--- +name: oma-observability +description: Intent-based observability + traceability router across layers, boundaries, and signals. Routes to vendor-specific skills via category taxonomy; owns transport tuning, meta-observability, incident forensics. Use for observability, traceability, telemetry, APM, RUM, metrics, logs, traces, profiles, SLO, incident forensics, tracing architecture work. 
+--- + +# Observability Agent - Intent-based Router + +## When to use +- Setting up an observability pipeline (OTel SDK + Collector + vendor backend) +- Designing traceability across service and domain boundaries (W3C propagators, baggage, multi-tenant, multi-cloud) +- Tuning transport layer (UDP/MTU, OTLP gRPC vs HTTP, Collector DaemonSet vs sidecar topology) +- Running incident forensics (6-dimension localization: code / service / layer / host / region / infra) +- Selecting a vendor category (OSS full-stack vs commercial SaaS vs high-cardinality specialist vs profiling specialist) +- Implementing observability-as-code (Grafana Jsonnet dashboards, PrometheusRule CRD, OpenSLO YAML, SLO burn-rate alerts) +- Meta-observability (pipeline self-health, clock skew detection, cardinality guardrails, retention matrix) +- Covering the MELT+P signal set: metrics, logs, traces, profiles (OTEP 0239), cost (OpenCost), audit (SOC2/ISO), privacy (GDPR/PIPA) +- Migrating off deprecated tools (Fluentd → Fluent Bit or OTel Collector, per CNCF 2025-10 guide) + +## When NOT to use +- LLM ops (prompt versioning, evals, gen_ai span deep dive) — use Langfuse, Arize Phoenix, LangSmith, or Braintrust directly +- Data pipeline lineage — use OpenLineage + Marquez, dbt test, or Airflow lineage backends +- IoT / hardware / datacenter physical-layer telemetry (IPMI, BMC, SNMP) — use vendor DCIM tooling (Nlyte, Sunbird, Device42) +- Chaos engineering orchestration — use Chaos Mesh, Litmus, Gremlin, or ChaosToolkit (this skill consumes their telemetry; it does not orchestrate chaos) +- GPU / TPU infrastructure observability — use NVIDIA DCGM Exporter + Prometheus +- Software supply chain (SBOM, attestation) — use sigstore (cosign / rekor), in-toto framework, SLSA level attestations +- Incident response workflow (on-call rotation, paging, escalation) — use PagerDuty, OpsGenie, or Grafana OnCall +- Single-vendor setup already fully covered by that vendor's own published skill — invoke the vendor skill directly + +## Core Rules +1. **Classify intent before routing**: every query goes through intent classification — setup | migrate | investigate | alert | trace | tune | route +2. **Category-first, not vendor-registry**: delegate to vendor-owned skills via `resources/vendor-categories.md`; do not duplicate their documentation +3. **Transport tuning is the moat**: UDP/MTU thresholds, OTLP protocol selection, Collector topology, and sampling recipes are in-skill depth that other skills do not cover +4. **Meta-observability is non-negotiable**: always validate pipeline self-health, clock sync (< 100 ms drift), cardinality, and retention before declaring setup complete +5. **CNCF-first preference**: Prometheus, Jaeger, Thanos, Fluent Bit, OpenFeature (Graduated 2024-11), Flagger, Falco (Graduated); OpenTelemetry, Cortex, OpenCost (Incubating) +6. **Fluentd is deprecated**: per CNCF 2025-10 migration guide, recommend Fluent Bit or OTel Collector for all new and migration work +7. **W3C Trace Context as default propagator**: translate per cloud (AWS X-Ray `X-Amzn-Trace-Id`, GCP Cloud Trace, Datadog, Cloudflare, Linkerd) via `boundaries/cross-application.md` +8. **Privacy before features**: PII redaction, sampling-aware baggage rules, and compliance (SOC2/ISO immutable audit + GDPR/PIPA erasure) are applied at collection, not only at storage +9. **Domain-level trust**: all vendor and tool references are timestamped `as of 2026-Q2`; verify live status at https://landscape.cncf.io +10. 
**No stub in final deliverable**: scaffolds are editing anchors only during build phase; remove before output + +## Out of Scope (use external tools) + +The combinations below are outside this skill's boundary. The external tools listed are authoritative for each domain. + +| Domain | External tools | +|--------|---------------| +| LLM ops / gen_ai observability | Langfuse, Arize Phoenix, LangSmith, Braintrust | +| Data pipeline lineage | OpenLineage + Marquez, dbt test, Apache Airflow lineage | +| L1/L2 physical / datacenter hardware | Nlyte, Sunbird, Device42; SNMP exporters where Prometheus bridge is needed | +| L5 Session / L6 Presentation full TLS inspection | Wireshark (packet-level), Cloudflare Radar (TLS ecosystem data), vendor TLS inspection tooling | +| Chaos engineering orchestration | Chaos Mesh, Litmus, Gremlin, ChaosToolkit | +| GPU / AI infra (DCGM, NVIDIA) | NVIDIA DCGM Exporter + Prometheus; OTel GPU semconv (Development, not production-ready) | +| Software supply chain (SBOM, attestation) | sigstore (cosign / rekor), in-toto framework, SLSA level attestations | +| Incident response workflow (paging, rotation) | PagerDuty, OpsGenie, Grafana OnCall | +| Fluentd (primary tool) | Deprecated CNCF 2025-10 — use Fluent Bit or OTel Collector | + +## Architecture (4 x 4 x 7 matrix) + +``` + User / Other Skill Query + | + v + +-----------------------------+ + | Intent Classifier | + | setup | migrate | investigate + | alert | trace | tune | route| + +-----------------------------+ + | + v + +-----------------------------+ + | Vendor Router | + | category-first delegation | + +-----------------------------+ + | + v + +-----------------------------+ + | vendor-categories.md | + | (a) OSS Full-Stack | + | (b) Commercial SaaS APM | + | (c) High-Cardinality | + | (d) Profiling Specialist | + | (e) SIEM / Enterprise Logs| + | (f) FinOps / Cost | + | (g) Feature Flags/Rollout | + | (h) Log Pipeline | + | (i) Time Series Storage | + | (j) Crash Analytics | + +-----------------------------+ + | + v + +-----------------------------+ + | Matrix Coverage Selector | + | 4 Layers x 4 Boundaries | + | x 7 Signals = 112 cells | + +-----------------------------+ + | + v + +-----------------------------+ + | Transport Depth / | + | Meta-observability | + | UDP, OTLP, Collector, | + | cardinality, clock skew | + +-----------------------------+ + | + v + +-----------------------------+ + | Incident Forensics | + | 6-dim localization: | + | code/service/layer/host/ | + | region/infra | + +-----------------------------+ +``` + +**Layers (4):** L3-network, L4-transport, mesh, L7-application +**Boundaries (4):** multi-tenant, cross-application, slo, release +**Signals (7):** metrics, logs, traces, profiles, cost, audit, privacy + +See `resources/matrix.md` for the full 112-cell coverage map with N/A markers for invalid combinations. 
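+
+To make the routing hop concrete, here is a deliberately naive TypeScript sketch of classifier-to-target resolution. The keyword patterns are placeholders, not the real rules (those live in `resources/intent-rules.md`), and the targets mirror the Routes table in the next section:
+
+```typescript
+type Intent = "setup" | "migrate" | "investigate" | "alert" | "trace" | "tune" | "route";
+
+// Primary targets, mirroring the Routes table below.
+const primaryTarget: Record<Intent, string> = {
+  setup: "resources/vendor-categories.md",
+  migrate: "resources/vendor-categories.md", // §(h) Log Pipeline
+  investigate: "resources/incident-forensics.md",
+  alert: "boundaries/slo.md",
+  trace: "boundaries/cross-application.md",
+  tune: "transport/",
+  route: "boundaries/multi-tenant.md",
+};
+
+// Placeholder keyword classifier (illustration only).
+function classify(query: string): Intent {
+  const q = query.toLowerCase();
+  if (/migrat|move from/.test(q)) return "migrate";
+  if (/investigat|spike|outage|5xx|forensic/.test(q)) return "investigate";
+  if (/alert|burn[- ]?rate|slo/.test(q)) return "alert";
+  if (/propagat|traceparent|baggage/.test(q)) return "trace";
+  if (/mtu|otlp|sampling|collector|cardinalit/.test(q)) return "tune";
+  if (/tenant|residenc|isolat/.test(q)) return "route";
+  return "setup";
+}
+
+const target = primaryTarget[classify("configure SLO burn-rate alert")]; // → boundaries/slo.md
+```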
+ +## Routes (Intent) + +| Intent | Primary target | Fallback | +|--------|---------------|----------| +| `setup` | `resources/vendor-categories.md` → vendor-owned skill | Generic OTel semconv in `resources/standards.md` | +| `migrate` | CNCF 2025-10 guide + `resources/vendor-categories.md §(h)` | OTel Collector bridge config | +| `investigate` | `resources/incident-forensics.md` (MRA + 6-dim localization) | `signals/traces.md` + `signals/logs.md` | +| `alert` | `boundaries/slo.md` (burn-rate alert rules) | `resources/observability-as-code.md` | +| `trace` | `boundaries/cross-application.md` (propagator matrix) | `layers/mesh.md` (zero-code auto-instrumentation) | +| `tune` | `transport/` (4 files: UDP/MTU, OTLP, topology, sampling) | `resources/meta-observability.md` (cardinality guardrails) | +| `route` | `boundaries/multi-tenant.md` + `transport/collector-topology.md` | `boundaries/cross-application.md` (data residency) | + +## Invocation + +Standalone: +``` +/oma-observability "set up OTel stack on Kubernetes" +/oma-observability --migrate "move from Fluentd to Fluent Bit" +/oma-observability --investigate "5xx spike in ap-northeast-2" +/oma-observability --alert "configure SLO burn-rate alert for checkout API" +/oma-observability --trace "W3C propagator across AWS + GCP boundary" +/oma-observability --tune "UDP statsd MTU throughput limit" +/oma-observability --route "multi-tenant log isolation with data residency" +``` + +Shared invocation (from other skills): +1. State intent: `setup` | `migrate` | `investigate` | `alert` | `trace` | `tune` | `route` +2. Pass the user query string +3. Receive routed guidance or a vendor-skill delegation target + +## How to Execute +Follow `resources/execution-protocol.md` step by step. +See `resources/examples.md` for end-to-end walkthroughs. +Use `resources/intent-rules.md` for intent classification reference. +Use `resources/matrix.md` for coverage navigation across layers, boundaries, and signals. +Use `resources/vendor-categories.md` for vendor delegation and category selection. +Before submitting, run `resources/checklist.md`. + +## Integrations with OMA Ecosystem + +> **Integration status (2026-Q2)**: rows below describe **recommended handoff patterns** from the oma-observability side. As of this version, reciprocal cross-references from the other skills' SKILL.md files are not yet in place — this is a v1.1 follow-up item. Users invoking the other skills directly will need to surface this integration manually until the reciprocal links land. 
+ +| Skill | Integration point | Reciprocal link status | +|-------|------------------|-------| +| `oma-debug` | On failure: pull traces + logs by `request_id` → trigger `resources/incident-forensics.md` 6-dim localization playbook | ⏳ pending (v1.1) | +| `oma-qa` | Canary post-deploy loop via chrome-devtools MCP: console errors + Core Web Vitals trend; INP/LCP/CLS from `layers/L7-application/web-rum.md` | ⏳ pending (v1.1) | +| `oma-tf-infra` | Terraform modules for OTel Collector, Grafana, and Loki stack provisioning | ⏳ pending (v1.1) | +| `oma-scm` | Deployment SHA → `service.version` OTel attribute + release marker events; see `boundaries/release.md` | ⏳ pending (v1.1) | +| `oma-backend` | Propagator and baggage rules cross-referenced in `backend.md` ruleset; DB N+1 + Kafka patterns in `signals/traces.md` | ⏳ pending (v1.1) | +| `oma-frontend` | `layers/L7-application/web-rum.md` INP/LCP/CLS checklist cross-referenced in `frontend.md` ruleset | ⏳ pending (v1.1) | +| `oma-mobile` | `layers/L7-application/mobile-rum.md` offline-queuing pattern cross-referenced in `mobile.md` ruleset | ⏳ pending (v1.1) | +| `oma-db` | `signals/traces.md` DB patterns (N+1, connection pool) cross-referenced in `database.md` ruleset | ⏳ pending (v1.1) | + +## Versioning & Deprecation + +- **Spec version pinning**: `otel_spec` / `otel_semconv` keys in each file's frontmatter document the assumed version. If content depends on a specific attribute stability tier, the tier is stated inline. +- **Update triggers** (not scheduled): + - OTel semconv promotion (Development → RC → Stable) affecting attributes cited in this skill → update `resources/standards.md` and the affected file, bump minor version. + - Attribute deprecation → replace across all citing files; migration note in `resources/standards.md`. + - CNCF status change for a vendor/project named in `vendor-categories.md` (Graduated / Archived / acquired) → update the vendor table. +- **Authoritative live state**: `https://landscape.cncf.io` for CNCF project status. This skill does not promise to track it on any schedule — verify at use time if the information is load-bearing. +- **No per-file review stamps**: earlier drafts carried `last_reviewed` / `next_review` frontmatter. Those were removed because no automated enforcement exists; relying on voluntary manual review produces stale stamps that misrepresent currency. Git history (`git log path/to/file`) is the source of truth for when a file was last changed. + +## Contribution Protocol + +- Do NOT pre-declare future OMA skill names in user-facing documentation. If OMA-native coverage becomes warranted for an out-of-scope domain, evaluate and name it at that point. +- File edits follow the ownership matrix in `docs/plans/oma-observability-design.md §Ownership`. CTO co-signs changes to `standards.md`, `matrix.md`, `anti-patterns.md`. +- Run `resources/checklist.md §1 Setup validation` before merging. 
+ +## References +- Execution steps: `resources/execution-protocol.md` +- Intent classification: `resources/intent-rules.md` +- Coverage matrix: `resources/matrix.md` +- Standards (OTel spec, W3C, ISO): `resources/standards.md` +- Vendor categories: `resources/vendor-categories.md` +- Incident forensics: `resources/incident-forensics.md` +- Meta-observability: `resources/meta-observability.md` +- Observability-as-code: `resources/observability-as-code.md` +- Anti-patterns (18 items): `resources/anti-patterns.md` +- Checklist: `resources/checklist.md` +- Examples: `resources/examples.md` +- Transport: + - `resources/transport/udp-statsd-mtu.md` + - `resources/transport/otlp-grpc-vs-http.md` + - `resources/transport/collector-topology.md` + - `resources/transport/sampling-recipes.md` +- Layers: + - `resources/layers/L3-network.md` + - `resources/layers/L4-transport.md` + - `resources/layers/mesh.md` + - `resources/layers/L7-application/web-rum.md` + - `resources/layers/L7-application/mobile-rum.md` + - `resources/layers/L7-application/crash-analytics.md` +- Boundaries: + - `resources/boundaries/multi-tenant.md` + - `resources/boundaries/cross-application.md` + - `resources/boundaries/slo.md` + - `resources/boundaries/release.md` +- Signals: + - `resources/signals/metrics.md` + - `resources/signals/logs.md` + - `resources/signals/traces.md` + - `resources/signals/profiles.md` + - `resources/signals/cost.md` + - `resources/signals/audit.md` + - `resources/signals/privacy.md` diff --git a/.agents/skills/oma-observability/resources/anti-patterns.md b/.agents/skills/oma-observability/resources/anti-patterns.md new file mode 100644 index 0000000..25b2fed --- /dev/null +++ b/.agents/skills/oma-observability/resources/anti-patterns.md @@ -0,0 +1,534 @@ +# oma-observability Anti-Patterns + +> Consolidated catalog of patterns that break observability/traceability. +> Each entry: pattern name, why-it-fails, remediation, severity. +> Sources: all resource files in this skill tree. See individual "See also" references. + +## Severity legend + +- **CRITICAL**: security breach / compliance violation / production data loss +- **HIGH**: blocks on-call ability to resolve incidents +- **MEDIUM**: degrades observability quality / cost / maintainability +- **LOW**: style or minor optimization + +--- + +## A — Privacy & Sensitive Data + +### A.1 Claiming "anonymization" for pseudonymized data + +**Severity**: CRITICAL +**Why it fails**: Pseudonymized data (reversible with a key) is still personal data under GDPR Article 4(1). Misclassifying it as anonymous data leads to non-compliant retention periods, missing erasure obligations, and exposes the organization to 4% global-turnover fines. +**Remediation**: Apply the reversibility test from `signals/privacy.md §3`: "Could we recover the original value if compelled?" If yes, it is pseudonymization. Update your ROPA and re-classify accordingly. +**See also**: `signals/privacy.md §3 Anonymization vs Pseudonymization` + +### A.2 OTel Collector `hash` action on low-entropy user IDs + +**Severity**: CRITICAL +**Why it fails**: The Collector's built-in `hash` action applies SHA-256 without a salt. Numeric user IDs (e.g., 6-digit codes) are reversible via rainbow tables, making the pseudonymization ineffective and constituting a GDPR breach. +**Remediation**: Perform HMAC-SHA256 with a vault-managed key at the SDK layer before emitting spans. Store the salt in a separate region with independent IAM (GDPR Art. 32). Rotate quarterly. 
+**See also**: `signals/privacy.md §8 Salted Hashing Caveats` + +### A.3 Baggage carrying PII across trust boundaries + +**Severity**: CRITICAL +**Why it fails**: W3C Baggage propagates to every downstream service, including external partners. Placing `user.email`, session tokens, or credentials in baggage violates W3C Baggage §Security and GDPR Article 5(1)(c) minimization, leaking PII to untrusted collectors. +**Remediation**: Apply a baggage allowlist at every egress trust boundary (API gateway, external webhook). Allowed values: `tenant.id`, `feature.variant`, `deployment.sha`, `region.hint`. Strip all other keys. +**See also**: `boundaries/cross-application.md §4 Baggage Rules`, `signals/privacy.md §4 Common PII` + +### A.4 PII in crash stack traces without redaction filter + +**Severity**: CRITICAL +**Why it fails**: Exception messages frequently capture SQL queries, HTTP headers, and URL query strings verbatim, exposing `user.email`, `Authorization` tokens, card numbers, and passwords in vendor SaaS storage — a GDPR and PIPA breach. +**Remediation**: Implement a `beforeSend` / `before_send` allowlist hook in your crash SDK. Strip `Authorization`, `Cookie`, and any regex-matching email/card patterns before the crash report is serialized. Do not rely on server-side redaction alone. +**See also**: `layers/L7-application/crash-analytics.md §8 Privacy and PII`, `signals/privacy.md §7 SDK-Layer Redaction` + +### A.5 Raw IP addresses retained without redaction + +**Severity**: CRITICAL +**Why it fails**: IP addresses linked to natural persons are personal data under GDPR Article 4(1) and PIPA. Storing raw `srcaddr` / `client.address` in long-retention backends violates the data minimization principle (GDPR Art. 5(1)(c)) and is grounds for regulatory action. +**Remediation**: Truncate the last IPv4 octet (e.g., `203.0.113.0`) or apply rotating-salt SHA-256 pseudonymization at pipeline ingestion. Never store raw IPs beyond the 7-day raw retention tier. +**See also**: `signals/privacy.md §5 PII Handling Rules`, `layers/L3-network.md §3.3 Privacy Note` + +### A.6 `db.query.text` with untrimmed PII + +**Severity**: CRITICAL +**Why it fails**: SQL query text in `db.query.text` span attributes frequently contains `WHERE email = 'user@example.com'` or `WHERE ssn = '...'`. These surface verbatim in trace backends, violating GDPR minimization and leaking credentials. +**Remediation**: Use the OTel Collector `redaction` processor with an allowlist; block free-text query content unless it matches safe patterns. Apply SQL parameterization at the SDK layer so user data never enters query text. +**See also**: `signals/privacy.md §6 OTel Collector Processors`, `signals/traces.md` + +### A.7 Observability backend open to all engineers (no RBAC) + +**Severity**: HIGH +**Why it fails**: Production traces and logs containing even pseudonymized user data are sensitive. Unrestricted access violates the principle of least privilege, increases PII exposure risk, and fails SOC 2 CC7.2 and ISO/IEC 27001 A.8.15 access controls. +**Remediation**: Implement role-scoped access: on-call engineers see own-service traces (24h); security analysts see all services (30d); auditors get read-only audit index. Use Grafana folder permissions or Datadog Teams scopes. +**See also**: `signals/privacy.md §12 Backend RBAC` + +### A.8 Routing telemetry to 3rd-party vendor without DPA + +**Severity**: HIGH +**Why it fails**: Observability vendors receiving personal data are data processors under GDPR Article 28. 
Routing data before signing a Data Processing Agreement is a direct GDPR violation. +**Remediation**: Block data flows to Datadog, Sentry, Grafana Cloud, Honeycomb, New Relic, or Elastic Cloud until a DPA is signed. Confirm storage region matches your compliance obligations. Reference the legal team's approved vendor list. +**See also**: `signals/privacy.md §13 Third-Party Processor Obligations` + +### A.9 Session replay without client-side PII masking + +**Severity**: HIGH +**Why it fails**: Session replay captures DOM mutations including `<input>` fields. Without SDK-level masking, email addresses, credit card numbers, and passwords are captured in replay payloads before reaching the vendor — a GDPR Article 6 consent violation. +**Remediation**: Enable input masking in the SDK config (Sentry, Datadog both support this). Wire replay consent to the cookie consent flow. Test via automated replay review that sensitive fields are masked. +**See also**: `layers/L7-application/web-rum.md §8 Session Replay`, `signals/privacy.md §5` + +### A.10 Unencrypted telemetry queue on mobile device + +**Severity**: HIGH +**Why it fails**: Mobile telemetry queued to disk is PII at rest on user devices. If the device is lost, compromised, or forensically examined, unencrypted queue files expose personal data — violating GDPR Art. 32 and PIPA § 29 safety measures. +**Remediation**: Encrypt the queue using platform-native key storage: iOS Keychain, Android Keystore. Apply field-level redaction before write, not before send. +**See also**: `layers/L7-application/mobile-rum.md §3 Offline-First Queuing` + +### A.11 Cross-region telemetry routing without GDPR mechanism + +**Severity**: HIGH +**Why it fails**: GDPR Chapter V prohibits transfer of personal data to non-adequate countries without Standard Contractual Clauses or an adequacy decision. Routing EU telemetry through a US-hosted collector without SCC in place is a direct violation. +**Remediation**: Implement routing connector per `signals/privacy.md §10`: route EU traffic to EU-region backend, KR traffic to KR-region backend. Confirm SCC or adequacy decision with legal before routing. +**See also**: `signals/privacy.md §10 Cross-Border Transfer`, `transport/collector-topology.md §7` + +--- + +## B — Cardinality & Cost + +### B.1 `user.id` as metric label + +**Severity**: CRITICAL +**Why it fails**: Creates one TSDB time series per user. For a service with 1M users, this is 1M series — instant storage explosion, query latency degradation, and SaaS vendor bill spike. Additionally, user IDs are PII under GDPR Art. 4(1). +**Remediation**: Replace with `user.tier`, `user.cohort`, or aggregated bucket labels. Never use any unbounded identifier as a metric label. Enforce via OTel SDK View attribute allow-list. +**See also**: `signals/metrics.md §9 Cardinality Budget`, `meta-observability.md §Section C` + +### B.2 New metric name per tenant + +**Severity**: HIGH +**Why it fails**: Creating `http_requests_total_tenant_acme` for each tenant bypasses TSDB cardinality controls entirely and cannot be aggregated across tenants. It also disables cardinality budget alerting. +**Remediation**: Use `http_requests_total{tenant_id="acme"}` with a top-N cap (≤ 1000 explicit tenants). Map overflow to label value `"other"`. Apply Collector `transform` processor for normalization. 
+**See also**: `signals/metrics.md §9.3 Tenant Cap`, `signals/cost.md §10` + +### B.3 Raw `http.url` as metric label + +**Severity**: HIGH +**Why it fails**: URL query strings are unbounded and may contain tokens or email addresses (`?token=...`, `?email=...`). Using raw `url.full` as a label causes cardinality explosion and PII leakage in the TSDB. +**Remediation**: Use `http.route` (normalized route) instead. Apply `replace_pattern` in OTel Collector `transform` processor to strip numeric IDs and UUIDs from route segments. +**See also**: `meta-observability.md §Section C`, `signals/metrics.md §3.3 Label Rules` + +### B.4 Cost label at per-request metric granularity + +**Severity**: HIGH +**Why it fails**: Writing `gen_ai.cost.total_usd` as a metric label at request granularity creates one series per request — causing OOM on the TSDB ingestor and making cost attribution unusable. +**Remediation**: Use `gen_ai.cost.total_usd` as a span attribute only. Aggregate cost metrics by `tenant_id`, `namespace`, and `workload` at the metric surface. Configure tail-sampler to always retain spans where cost exceeds $0.50. +**See also**: `signals/cost.md §10`, `transport/sampling-recipes.md §3` + +### B.5 Summary instrument for cross-service aggregation + +**Severity**: MEDIUM +**Why it fails**: Prometheus Summary computes quantiles client-side per process. p99 from three replicas cannot be merged into a fleet-level p99 — the values are mathematically incompatible. +**Remediation**: Replace Summary with Histogram + `histogram_quantile()` at query time. Set explicit bucket boundaries matching the expected value range (e.g., seconds-scale for request duration, not the default millisecond buckets). +**See also**: `signals/metrics.md §2.5 Summary` + +### B.6 Histogram with default bucket boundaries + +**Severity**: MEDIUM +**Why it fails**: The OTel SDK default buckets are millisecond-scale `[0, 5, 10, 25, 50, ...]`. For second-scale operations (database queries, LLM inference, file uploads), all measurements land in the last bucket, making `histogram_quantile()` meaningless. +**Remediation**: Set explicit bucket boundaries per instrument view using `View(instrument_name=..., aggregation=ExplicitBucketHistogramAggregation([0.01, 0.05, 0.1, 0.5, 1, 2, 5]))`. +**See also**: `signals/metrics.md §2.4 Histogram` + +### B.7 LLM spans not tail-sampled on cost threshold + +**Severity**: MEDIUM +**Why it fails**: High-cost LLM spans (e.g., $2+ per trace) may be dropped by probabilistic sampling before they reach the backend. Silent budget blowup is invisible until the cloud billing invoice arrives. +**Remediation**: Use a `transform` processor to set `sampling.keep_reason = "high_cost"` on spans where `gen_ai.cost.total_usd > 0.50`, then add a `string_attribute` policy in `tail_sampling` that always retains these spans. +**See also**: `signals/cost.md §6`, `transport/sampling-recipes.md §3` + +--- + +## C — Pipeline & Collector + +### C.1 Missing `memory_limiter` processor + +**Severity**: CRITICAL +**Why it fails**: Without `memory_limiter`, a traffic spike or backend backpressure causes the Collector heap to grow unbounded until the process OOM-kills. All in-flight signals are lost and the pipeline is silent until the pod restarts. +**Remediation**: Add `memory_limiter` as the **first** processor in every pipeline (traces, metrics, logs). Set `limit_percentage: 75` and `spike_limit_percentage: 20`. Apply to both DaemonSet agents and gateway tiers. 
+**See also**: `meta-observability.md §Section A3`, `transport/collector-topology.md §2`, `transport/otlp-grpc-vs-http.md §4.3` + +### C.2 Sidecar collectors on standard Kubernetes nodes + +**Severity**: HIGH +**Why it fails**: Sidecar mode injects one Collector per application pod. A 100-pod deployment runs 100 Collectors. CPU and memory cost scales linearly with pod count, overwhelming cluster resources. Additionally, sidecar collectors see only spans from their own pod, breaking tail sampling. +**Remediation**: Use DaemonSet mode (one Collector per node) for standard Kubernetes. Reserve sidecar mode for AWS Fargate or GCP Cloud Run where DaemonSets are unavailable. +**See also**: `transport/collector-topology.md §3 When to Use Sidecar` + +### C.3 Tail sampling in sidecar + +**Severity**: HIGH +**Why it fails**: A sidecar Collector only sees spans from its own pod. A trace spanning multiple services has spans on different pods, each with a different sidecar. No single sidecar has the complete trace — sampling decisions are based on incomplete data, producing systematically wrong retention. +**Remediation**: Run `tail_sampling` processor in the gateway tier (Deployment mode) only, combined with a `loadbalancing` exporter using consistent hash by `trace_id` to ensure trace completeness. +**See also**: `transport/collector-topology.md §3`, `transport/sampling-recipes.md §2` + +### C.4 Single gateway replica + +**Severity**: HIGH +**Why it fails**: A single gateway Collector is a single point of failure. Under backend backpressure or a rolling restart, all telemetry is lost for the duration. This eliminates observability exactly when it is most needed. +**Remediation**: Deploy a minimum of 2–3 gateway replicas. Use a PodDisruptionBudget to prevent simultaneous eviction. Add a `loadbalancing` exporter upstream for trace-complete routing. +**See also**: `transport/collector-topology.md §6 High-Throughput Gateway Scaling` + +### C.5 Fluentd as new log pipeline deployment in 2026+ + +**Severity**: HIGH +**Why it fails**: CNCF announced Fluentd deprecation on 2025-10. Choosing Fluentd for a new deployment in 2026+ means adopting a deprecated tool with no future community investment, missing Fluent Bit's OTLP native output, and incurring Ruby runtime overhead (100+ MB vs Fluent Bit's 5–15 MB). +**Remediation**: Use Fluent Bit (CNCF Graduated, C/Rust, native OTLP output) as the edge DaemonSet log agent. Use OTel Collector for gateway aggregation, PII redaction, and routing. +**See also**: `signals/logs.md §6 Collector and Agent Options` + +### C.6 Prometheus receiver on multiple replicas without target allocator + +**Severity**: MEDIUM +**Why it fails**: When multiple gateway Collector replicas each run a `prometheus` receiver, every replica scrapes every target. This produces duplicate metrics in the TSDB backend and inflates series counts. +**Remediation**: Enable the OTel Operator's `spec.targetAllocator` to distribute scrape targets across replicas. Each replica scrapes a non-overlapping subset of targets. +**See also**: `transport/collector-topology.md §4 Component Placement Reference` + +### C.7 Collector self-metrics not scraped by independent instance + +**Severity**: MEDIUM +**Why it fails**: If the Collector scrapes its own `:8888/metrics` endpoint and the Collector fails, the metrics that would reveal the failure are also lost. The pipeline's health is invisible at the moment of failure. 
+**Remediation**: Scrape Collector self-metrics from a separate Prometheus instance or second Collector instance. Expose `otelcol_*` metrics at `service.telemetry.metrics.address: 0.0.0.0:8888` with `level: detailed`.
+**See also**: `meta-observability.md §Section A1`
+
+---
+
+## D — Sampling & Retention
+
+### D.1 Head-based sampling on multi-service call paths
+
+**Severity**: HIGH
+**Why it fails**: Head-based sampling makes a sampling decision at the trace root. If a downstream service independently samples out, its spans are dropped — the reconstructed trace has gaps. Tracing backends show incomplete traces that mislead incident investigation.
+**Remediation**: Use tail-based sampling in the gateway tier for multi-service systems. Propagate `traceparent` on every hop regardless of local sampling decision. The gateway buffers all spans and decides after the trace is complete.
+**See also**: `transport/sampling-recipes.md §1`, `transport/sampling-recipes.md §6 Pitfalls`
+
+### D.2 Missing `loadbalancing` exporter before tail sampler
+
+**Severity**: HIGH
+**Why it fails**: Without consistent-hash routing, spans for the same trace arrive at different gateway replicas. Each replica's `tail_sampling` processor sees an incomplete trace and makes wrong retention decisions — high-value traces are dropped, low-value traces are retained.
+**Remediation**: Deploy `loadbalancing` exporter (with `routing_key: traceID`) in the tier upstream of `tail_sampling`. Use a headless Kubernetes Service so the exporter resolves per-pod DNS.
+**See also**: `transport/sampling-recipes.md §2`, `transport/collector-topology.md §6`
+
+### D.3 Audit logs not stored in WORM (mutable audit storage)
+
+**Severity**: CRITICAL
+**Why it fails**: SOC 2 CC7.2, PCI DSS Requirement 10, and HIPAA §164.312(b) require tamper-evident, immutable audit storage. Mutable audit logs can be deleted or altered, nullifying compliance evidence and enabling concealment of unauthorized actions.
+**Remediation**: Apply S3 Object Lock in **Compliance** mode (not Governance), GCS retention policy with locked bucket, or Azure Immutable Blob Storage at bucket creation time. Set 7-year retention as a baseline. Do not use Governance mode — it allows privileged override.
+**See also**: `signals/audit.md §5 Immutable WORM Storage`
+
+### D.4 Audit log retention below regulatory minimum
+
+**Severity**: HIGH
+**Why it fails**: HIPAA requires 6-year retention; PCI DSS requires 1 year of retention with the most recent 3 months immediately available; SOC 2 audit periods are typically 12 months. Flushing audit logs before the minimum period is a direct compliance violation discoverable during any audit.
+**Remediation**: Use a 7-year baseline with automated lifecycle policy (S3 → Glacier Deep Archive after 90 days). Set WORM Object Lock at write time — it cannot be applied retroactively. Monitor compliance via an `audit_log_retention_days` metric alert.
+**See also**: `signals/audit.md §7 7-Year Retention Policy`, `meta-observability.md §Section F Alert 5`
+
+### D.5 No tamper evidence on audit trail
+
+**Severity**: HIGH
+**Why it fails**: WORM prevents deletion but does not prove that records were not silently modified or that records are not missing. An audit trail without a hash chain or Merkle root anchoring cannot satisfy forensic integrity requirements for SOC 2 or ISO/IEC 27001.
+**Remediation**: Implement per-event hash chain: `event_N.hash = SHA256(event_N.payload + event_{N-1}.hash)`. Anchor daily Merkle root to an external transparency log (rekor/sigstore). Run automated chain verification weekly.
+**See also**: `signals/audit.md §6 Tamper Evidence`
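+
+A minimal sketch of D.5's recurrence in TypeScript using Node's `crypto` module. The `AuditEvent` shape and the genesis value are illustrative assumptions; Merkle-root anchoring to rekor is out of scope here:
+
+```typescript
+import { createHash } from "node:crypto";
+
+interface AuditEvent { payload: string; hash?: string } // illustrative shape
+
+const GENESIS = "0".repeat(64); // assumed starting value for the chain
+
+// event_N.hash = SHA256(event_N.payload + event_{N-1}.hash)
+function chainEvents(events: AuditEvent[]): AuditEvent[] {
+  let prev = GENESIS;
+  for (const e of events) {
+    e.hash = createHash("sha256").update(e.payload + prev).digest("hex");
+    prev = e.hash;
+  }
+  return events;
+}
+
+// Recompute the chain: any modified, reordered, or missing event
+// breaks every hash from that point forward.
+function verifyChain(events: AuditEvent[]): boolean {
+  let prev = GENESIS;
+  return events.every((e) => {
+    const expected = createHash("sha256").update(e.payload + prev).digest("hex");
+    prev = expected;
+    return e.hash === expected;
+  });
+}
+```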
+
+### D.6 Kubernetes audit logs routed to operational log store
+
+**Severity**: HIGH
+**Why it fails**: Operational log stores (Loki, Elasticsearch) are mutable and have short retention policies. Kubernetes API audit logs — which record secret access, RBAC mutations, and cluster-admin actions — require WORM storage with multi-year retention to satisfy PCI DSS Requirement 10.
+**Remediation**: Route K8s audit logs via a separate pipeline to the WORM cold tier (S3 Glacier Deep Archive with Object Lock). Tag with `source: k8s_apiserver`. Do not co-mingle with operational logs.
+**See also**: `signals/audit.md §9 Kubernetes Audit Logs`
+
+### D.7 Decision wait too short in tail sampler
+
+**Severity**: MEDIUM
+**Why it fails**: If `decision_wait` is shorter than the p99 inter-service latency, spans from slow downstream services arrive after the sampling decision is finalized. Those spans are dropped unconditionally, making tail sampling silently incomplete.
+**Remediation**: Set `decision_wait` to exceed your p99 cross-service latency (typically 30–60 s for synchronous calls). For cross-region deployments, account for additional propagation delay. Monitor `otelcol_processor_queued_retry_send_queue_length` for buffer pressure.
+**See also**: `transport/sampling-recipes.md §6 Pitfalls`
+
+---
+
+## E — Release & Deployment
+
+### E.1 No release markers in telemetry
+
+**Severity**: HIGH
+**Why it fails**: Without `service.version` on spans, metrics, and logs — and without a deployment event at release time — there is no way to correlate a metric anomaly or error spike to a specific deploy. Incident investigation degrades to git bisect guesswork.
+**Remediation**: Set `service.version` on the OTel Resource at SDK init (injected via CI as `OTEL_RESOURCE_ATTRIBUTES`). Emit a structured deployment event at release time and pipeline it to Grafana as a vertical annotation line.
+**See also**: `boundaries/release.md §9 Release Markers`
+
+### E.2 Canary analysis without SLI metric
+
+**Severity**: HIGH
+**Why it fails**: Promoting a canary without a Flagger `MetricTemplate` or Argo Rollouts `AnalysisTemplate` means the promotion decision is manual and intuition-based. Regressions in error rate or latency ship silently to 100% of traffic.
+**Remediation**: Gate every canary on at minimum: error rate (max +0.5% allowed) and p99 latency (max +50ms allowed). Reference PromQL SLI queries from `boundaries/slo.md` as the metric source.
+**See also**: `boundaries/release.md §10 Canary Analysis Metric Suite`
+
+### E.3 Symbol upload not automated in CI (mobile / web)
+
+**Severity**: HIGH
+**Why it fails**: Without automated dSYM / ProGuard / source map upload on every release, production crashes produce unreadable minified stack traces. Incident investigation time for mobile/web crashes increases by 30–60 minutes per incident.
+**Remediation**: Add symbol upload as a mandatory CI step, gated on the same pipeline step as container image push or app store submission. Use Sentry CLI, `fastlane-plugin-sentry`, or Gradle Sentry plugin. Never store symbols in git LFS.
+**See also**: `layers/L7-application/crash-analytics.md §9 CI Integration`
+
+### E.4 GitOps drift unalerted
+
+**Severity**: MEDIUM
+**Why it fails**: When the cluster state diverges from the git manifest (OutOfSync in Argo CD, stalled Flux reconcile), the cluster is running unintended configuration. This silently breaks observability contracts — dashboards may reference metrics from a previous config version.
+**Remediation**: Alert on `argocd_app_info{sync_status="OutOfSync"} == 1` and `gotk_reconcile_condition{type="Ready",status="False"} == 1`. Treat drift as an incident, not a warning.
+**See also**: `boundaries/release.md §7 GitOps Engines`
+
+### E.5 Feature flag evaluation not observed
+
+**Severity**: MEDIUM
+**Why it fails**: Without `feature_flag.*` span attributes on every flag evaluation, A/B experiment effects are invisible. A latency regression introduced by a new feature variant cannot be attributed to the flag without telemetry.
+**Remediation**: Emit `feature_flag.key`, `feature_flag.variant`, and `feature_flag.provider_name` as span attributes on every evaluation (OTel `feature_flag.*` semconv, Experimental tier). Track `feature_flag_evaluation_total` and error rate delta per variant.
+**See also**: `boundaries/release.md §6 Observing Feature Flag Evaluations`
+
+### E.6 Rollback without post-mortem audit trail
+
+**Severity**: MEDIUM
+**Why it fails**: A rollback event without an audit record severs the learning loop. The same deployment failure repeats because there is no evidence of what was deployed, who authorized it, and what metric triggered rollback.
+**Remediation**: Record every rollback in the audit log with: `actor.type`, `action: rollback`, `service.version` (both from and to), `event.outcome`, and `trace_id`. Cross-reference to the triggering alert.
+**See also**: `signals/audit.md §3 Audit Event Categories`, `boundaries/release.md §11`
+
+---
+
+## F — Security & Compliance
+
+### F.0 Audit pipeline supply-chain integrity unverified
+
+**Severity**: CRITICAL
+**Why it fails**: If the Collector binary, Falco rules, or sigstore/rekor clients shipped into the audit pipeline are tampered with, every downstream "immutable" audit record inherits the compromise. WORM storage guarantees nothing about the ingestor's trustworthiness.
+**Remediation**: Sign all audit-pipeline container images with cosign (sigstore). Pin Falco rule SHA digests in Helm values. Verify rekor transparency-log entries for rule provenance. Deploy an admission controller (e.g., Kyverno + `verifyImages`) that rejects unsigned images from the observability namespace. Cross-ref `signals/audit.md §8 Falco` and sigstore documentation.
+**See also**: `signals/audit.md`, sigstore.dev
+
+### F.1 `traceparent` stripped at service boundary
+
+**Severity**: HIGH
+**Why it fails**: Stripping `traceparent` on outbound calls silently breaks distributed trace continuity. The downstream service starts a new root trace, making it impossible to correlate a customer-reported request across services. MTTR increases by 15–30 minutes per incident.
+**Remediation**: Every outbound HTTP/gRPC call MUST forward `traceparent`. Configure the OTel SDK auto-instrumentation to inject the header automatically. Test with an integration test that asserts the `traceparent` header on every outbound call.
+**See also**: `standards.md §2.2 W3C Trace Context`
+
+### F.2 Mixed propagators without normalization at ingress
+
+**Severity**: HIGH
+**Why it fails**: When B3 (Zipkin), AWS X-Ray, and W3C headers coexist without a composite propagator at the ingress gateway, spans from different origins appear as disconnected root spans in the trace backend. Waterfall correlation is impossible across cloud or vendor boundaries.
+**Remediation**: Configure the ingress gateway with a composite propagator that extracts all known formats (tracecontext, b3multi, awsxray, datadog) and emits only W3C downstream.
+**See also**: `boundaries/cross-application.md §3 Propagator Matrix`
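+
+One way to express F.2's remediation ("extract everything, inject only W3C") with the OTel JS API. The imports are the standard propagator packages; a Datadog-format propagator from its vendor package would slot into the extractor list the same way:
+
+```typescript
+import { Context, TextMapGetter, TextMapPropagator, TextMapSetter, propagation } from "@opentelemetry/api";
+import { CompositePropagator, W3CTraceContextPropagator } from "@opentelemetry/core";
+import { B3Propagator, B3InjectEncoding } from "@opentelemetry/propagator-b3";
+import { AWSXRayPropagator } from "@opentelemetry/propagator-aws-xray";
+
+// Ingress tier: extract every known header format, emit only W3C downstream.
+class IngressPropagator implements TextMapPropagator {
+  private extractor = new CompositePropagator({
+    propagators: [
+      new W3CTraceContextPropagator(),
+      new B3Propagator({ injectEncoding: B3InjectEncoding.MULTI_HEADER }),
+      new AWSXRayPropagator(),
+    ],
+  });
+  private injector = new W3CTraceContextPropagator();
+
+  extract(ctx: Context, carrier: unknown, getter: TextMapGetter): Context {
+    return this.extractor.extract(ctx, carrier, getter);
+  }
+  inject(ctx: Context, carrier: unknown, setter: TextMapSetter): void {
+    this.injector.inject(ctx, carrier, setter);
+  }
+  fields(): string[] {
+    return this.extractor.fields();
+  }
+}
+
+propagation.setGlobalPropagator(new IngressPropagator());
+```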
+
+### F.3 `request_id` not exposed to frontend users
+
+**Severity**: MEDIUM
+**Why it fails**: Without a user-visible `request_id` in error banners, customer support cannot correlate a user-reported error to backend traces. Support agents must rely on user-provided timestamps and symptoms — escalations that should take 2 minutes take 20.
+**Remediation**: Return `x-request-id` in every HTTP response header. Display it in frontend error banners. Customer support uses this ID; engineers use it to pivot to `trace_id` in the log system.
+**See also**: `boundaries/cross-application.md §7 request_id to trace_id Integration`
+
+### F.4 `service.namespace` not set on services
+
+**Severity**: MEDIUM
+**Why it fails**: Without `service.namespace`, a system with 80+ microservices presents a flat, unordered list in trace backends and dashboards. Domain-level KPIs (e.g., payment-domain error rate vs. inventory-domain error rate) are impossible to compute.
+**Remediation**: Assign `service.namespace` per DDD bounded context at deployment time via `OTEL_RESOURCE_ATTRIBUTES` or OTel Collector `resource` processor. Align namespaces with domain boundaries, not team names.
+**See also**: `boundaries/cross-application.md §5 DDD Bounded Context`
+
+### F.5 New `trace_id` generated on DLQ replay
+
+**Severity**: MEDIUM
+**Why it fails**: Generating a new `trace_id` at dead letter queue replay severs the forensic chain from the original failure span to the replay event. Root cause analysis of repeated failures requires tracing back to the originating request.
+**Remediation**: Re-inject the original `traceparent` and `causation_id` from the failed message headers when replaying. Use a span link — not parent-child — to connect replay span to the original trace.
+**See also**: `boundaries/cross-application.md §8 Idempotency and Event-Driven Trace Lineage`
+
+---
+
+## G — Frontend / Mobile
+
+### G.1 3rd-party script loaded without CSP monitoring
+
+**Severity**: HIGH
+**Why it fails**: Third-party scripts loaded without a Content Security Policy are an unmonitored XSS and supply-chain attack vector. A compromised CDN script executes in the user's browser with full page access. LCP regressions from script load delays are also invisible without attribution.
+**Remediation**: Add `Content-Security-Policy` header with `report-to` endpoint. Pipe CSP violation reports to the log backend and alert on new `blocked-uri` origins. Pin script hashes with Subresource Integrity (`integrity="sha384-..."`).
+**See also**: `layers/L7-application/web-rum.md §6 Third-Party Scripts and CSP`
+
+### G.2 Source maps not uploaded to error vendor
+
+**Severity**: HIGH
+**Why it fails**: Minified production bundles produce unreadable stack traces: `at t.<anonymous> (bundle.min.js:1:74821)`. Without source maps, engineers cannot identify the failing line of code. Incident investigation for frontend crashes is impossible.
+**Remediation**: Upload source maps to the error tracking vendor on every CI release pipeline step, before the release is considered complete. Use Sentry CLI or equivalent. Never store source maps in git LFS — use vendor symbol storage.
+**See also**: `layers/L7-application/web-rum.md §9 Error Tracking`, `layers/L7-application/crash-analytics.md §3` + +### G.3 FID still reported in dashboards after March 2024 + +**Severity**: MEDIUM +**Why it fails**: First Input Delay (FID) was removed from Core Web Vitals in March 2024 and replaced by Interaction to Next Paint (INP). Dashboards still reporting FID mislead SLO reviews — FID scores passing does not mean the INP SLO is met. +**Remediation**: Replace FID with INP in all dashboards and OpenSLO definitions. SLI target: INP p75 ≤ 200 ms. Use `web-vitals` JS library v4.x which provides `onINP`. +**See also**: `layers/L7-application/web-rum.md §2 Core Web Vitals` + +### G.4 `propagateTraceHeaderCorsUrls` / `allowedTracingUrls` not configured + +**Severity**: MEDIUM +**Why it fails**: Browser CORS preflight rejects injection of the `traceparent` header to origins not listed in the SDK allowlist. Client-to-server trace correlation silently breaks — frontend traces appear disconnected from backend traces. +**Remediation**: Add all API origin patterns to `FetchInstrumentation`'s `propagateTraceHeaderCorsUrls` (OTel JS) or Datadog RUM's `allowedTracingUrls`. Test in a browser network inspector to confirm the header is present. +**See also**: `layers/L7-application/web-rum.md §5 Client-to-Server Error Correlation` + +### G.5 Missing `traceparent` injection on mobile outbound HTTP + +**Severity**: MEDIUM +**Why it fails**: Without `traceparent` on outbound requests from the mobile app, the mobile user session is invisible in backend distributed traces. Customer-reported mobile errors cannot be linked to backend spans. +**Remediation**: Configure the mobile SDK's HTTP interceptor at initialization time (not at individual call sites). Verify with a network proxy tool (Charles, mitmproxy) that `traceparent` appears on all outbound API calls. +**See also**: `layers/L7-application/mobile-rum.md §9 W3C Trace Context Propagation` + +### G.6 No event TTL on mobile offline queue + +**Severity**: MEDIUM +**Why it fails**: Stale events queued on-device for hours or days eventually upload when the network reconnects. Events with device timestamps that are 24+ hours old mislead dashboards and SLO calculations — a crash from two days ago appears as a current incident. +**Remediation**: Set an event TTL of 24–72 hours on the mobile offline queue. Drop events that exceed the TTL before upload, not after. Log dropped event counts as a metric for monitoring queue health. +**See also**: `layers/L7-application/mobile-rum.md §3 Offline-First Queuing` + +### G.7 No release marker for crash correlation on mobile + +**Severity**: MEDIUM +**Why it fails**: Without `service.version` set on every crash event and without a release marker event at deploy time, a crash rate spike cannot be attributed to a specific app version. Investigation requires manual version comparison across crash groups. +**Remediation**: Set `service.version` as a custom key on every crash report. Automate release marker events at submission time. Use the release marker to draw vertical lines on CFR trend charts for before/after comparison. +**See also**: `layers/L7-application/crash-analytics.md §4 Release Tracking Integration` + +--- + +## H — Network / BGP / Clock + +### H.1 NTP drift left unmonitored + +**Severity**: HIGH +**Why it fails**: Distributed traces depend on synchronized clocks. 
When two nodes diverge by more than 100 ms, waterfall charts show child spans starting before their parent — engineers chase phantom race conditions instead of real bugs. MTTR increases by hours. +**Remediation**: Emit `node_clock_drift_ms` from every host (chrony textfile collector or node exporter). Alert when drift exceeds 100 ms for 5 minutes. Run `chronyc makestep` to force resync. For financial / telco workloads requiring sub-ms precision, use PTP (IEEE 1588). +**See also**: `standards.md §6 Clock Discipline`, `meta-observability.md §Section B` + +### H.2 Own-ASN BGP hijack left unmonitored + +**Severity**: HIGH +**Why it fails**: A BGP prefix hijack diverts traffic to a rogue AS, which may cause a complete outage or a silent man-in-the-middle attack. Without MOAS detection, hijacks go undetected for hours or days while traffic is silently stolen. +**Remediation**: Deploy BGPalerter or ARTEMIS with MOAS detection for your ASN prefixes. Subscribe to Cloudflare Radar alerts for your prefixes. Validate RPKI ROA records at your RIR and enable ROV enforcement on border routers. +**See also**: `layers/L3-network.md §6.4 Security Observability` + +### H.3 PMTUD black hole left uncorrected + +**Severity**: HIGH +**Why it fails**: When firewalls block ICMP Type 3 Code 4 ("Fragmentation Needed"), PMTUD fails silently. Large TCP transfers stall while health checks (small packets) pass — masking the problem. Services appear healthy in monitors while bulk data transfers time out. +**Remediation**: Enable MSS clamping at VPN/tunnel endpoints. Allow ICMP Type 3 Code 4 through security groups. Verify the fix with `ping -M do -s 1472 <destination>`. Cross-reference UDP MTU constraints for StatsD pipelines. +**See also**: `layers/L3-network.md §4 PMTUD`, `transport/udp-statsd-mtu.md §2` + +### H.4 RPKI-ROV not configured on advertised prefixes + +**Severity**: HIGH +**Why it fails**: An IP prefix announced without a valid RPKI ROA record is marked "Not Found" by downstream validators — not "Invalid", but also not cryptographically anchored. Rogue AS announcements for your prefix are undetectable by validators, increasing hijack risk. +**Remediation**: Create ROA records at your RIR (ARIN, RIPE NCC, APNIC) for all advertised prefixes. Enable ROV enforcement on border routers to drop or de-prefer RPKI Invalid routes. +**See also**: `layers/L3-network.md §6.4 RPKI-ROV` + +### H.5 Connection pool observability absent + +**Severity**: HIGH +**Why it fails**: Pool queue saturation causes application latency spikes that are invisible in TCP metrics alone. Neither retransmit rate nor error rate spikes until connection timeouts fire — engineers investigate network issues while the actual problem is a saturated database pool. +**Remediation**: Instrument connection pool size, wait time, and timeout counters at the application layer (not only at the TCP layer). Alert on pool utilization > 80% before timeouts occur. +**See also**: `layers/L4-transport.md §3.3 Common Pitfalls` + +### H.6 QUIC adoption without HTTP/3 trace tooling validation + +**Severity**: HIGH +**Why it fails**: Enabling QUIC without verifying that OTel SDKs emit `network.transport: quic` and `network.protocol.version: "3"` creates a transport-layer blind spot. UDP-based QUIC flows are invisible in TCP metrics (`/proc/net/tcp`), leaving L4 observability dark. +**Remediation**: Add a canary assertion in staging: verify `network.transport: quic` appears in spans and that Envoy access logs show QUIC connection IDs. 
Confirm tooling (Beyla QUIC uprobe) before production rollout. +**See also**: `layers/L4-transport.md §7 QUIC / HTTP3 Transport Semantics` + +### H.7 eBPF agent deployed without kernel/capability preflight + +**Severity**: HIGH +**Why it fails**: Beyla or Pixie DaemonSets on incompatible kernels (< 4.14) or without CAP_BPF fail silently, providing no operator-visible error or metric. The observability gap is discovered only during an incident. +**Remediation**: Add an `initContainer` that asserts kernel version (`uname -r >= 4.14`) and CAP_BPF presence. Exit non-zero if requirements are unmet. This surfaces the incompatibility during deployment, not during an incident. +**See also**: `layers/L4-transport.md §5.2 Kernel and Privilege Requirements` + +--- + +## I — As-Code & GitOps + +### I.1 Production dashboards edited directly in UI + +**Severity**: HIGH +**Why it fails**: UI-edited dashboards have no version history, no rollback path, no peer review, and no audit trail. SOC 2 change-management controls require an audit-traceable change process for detection and response configuration. A dashboard overwritten by mistake cannot be recovered without a backup. +**Remediation**: Version all dashboards in git using Grafonnet (Jsonnet) or Terraform Grafana provider. Apply via CI/CD only. Gate on PR review and linting (`jsonnetfmt --test`). Treat the git history as the audit trail. +**See also**: `observability-as-code.md §1 Why Observability-as-Code` + +### I.2 Alert thresholds hardcoded per-environment without parameterization + +**Severity**: MEDIUM +**Why it fails**: Hardcoded thresholds in per-environment alert YAML files diverge silently over time. Production runs a different error-rate threshold than staging, masking regressions that staging was supposed to catch. +**Remediation**: Parameterize alert thresholds as Jsonnet or Terraform variables. Derive environment-specific values from a shared defaults map. Apply the same alert code to all environments with environment-scoped variable overrides. +**See also**: `observability-as-code.md §2 Dashboards-as-Code` + +### I.3 SLO definitions stored only in the vendor UI + +**Severity**: MEDIUM +**Why it fails**: SLO definitions stored only in Datadog, Grafana Cloud, or Honeycomb UIs are not version-controlled, not peer-reviewed, and cannot be reconstructed after a vendor migration. SLO drift goes undetected. +**Remediation**: Define SLOs in OpenSLO YAML, committed to git. Apply via sloth or the vendor's Terraform provider. The git commit history is the audit trail for SLO changes. +**See also**: `observability-as-code.md`, `boundaries/slo.md` + +--- + +## Z — Cross-cutting + +### Z.1 `trace_id` missing from log records + +**Severity**: HIGH +**Why it fails**: Without `trace_id` and `span_id` on every log record, log-trace join during incident forensics fails. Engineers cannot pivot from a log error to the distributed trace waterfall, adding 15–30 minutes to MTTR per incident. +**Remediation**: Inject `trace_id` and `span_id` into every log record via the OTel SDK context hook (Python: `structlog`; Java: Logback `OpenTelemetryAppender`; Node.js: `pino-opentelemetry-transport`). Assert in integration tests. +**See also**: `signals/logs.md §7 Trace ID Injection Rules`, `incident-forensics.md §2.3` + +### Z.2 `service.version` missing from resource attributes + +**Severity**: HIGH +**Why it fails**: Without `service.version` on every signal, before/after comparison across a release is impossible. 
Canary analysis, SLO delta calculation, and post-incident release attribution all fail. +**Remediation**: Set `service.version` on the OTel Resource at SDK initialization. Inject via CI as `OTEL_RESOURCE_ATTRIBUTES=service.version=${GIT_SHA}`. Never set it per-signal — the Resource is the single source. +**See also**: `incident-forensics.md §2.1 Resource Attributes`, `boundaries/release.md §9` + +### Z.3 Pipeline delivery ratio unmonitored + +**Severity**: HIGH +**Why it fails**: If the OTel Collector is silently dropping 10% of traces, every SLO dashboard and alert is built on incomplete data. The pipeline degradation is invisible until SLO violations appear — at which point on-call cannot distinguish real incidents from telemetry gaps. +**Remediation**: Alert when `sum(rate(otelcol_exporter_sent_spans[5m])) / sum(rate(otelcol_receiver_accepted_spans[5m])) < 0.99` for 5 minutes. This is the single most important meta-observability alert. +**See also**: `meta-observability.md §Section A6`, `meta-observability.md §Section F Alert 1` + +### Z.4 Tenant ID absent from multi-tenant telemetry + +**Severity**: HIGH +**Why it fails**: In a multi-tenant system, telemetry without `tenant.id` makes per-tenant SLO computation, chargeback, and incident isolation impossible. All tenants are indistinguishable in dashboards — a single noisy tenant can mask SLO violations for the entire fleet. +**Remediation**: Propagate `tenant.id` via W3C Baggage from the API gateway through all downstream services. Emit it on every span, log record, and metric data point. Apply top-N cap (≤ 1000) when used as a metric label. +**See also**: `incident-forensics.md §2.3`, `boundaries/multi-tenant.md` + +### Z.5 Incident forensics without 6-dimension MRA attributes + +**Severity**: HIGH +**Why it fails**: Missing Minimum Required Attributes (`service.name`, `service.namespace`, `service.version`, `deployment.environment`, `cloud.region`, `k8s.pod.name`) break the 6-dimension narrowing flow — Code / Service / Layer / Host / Region / Infra pivots fail silently. +**Remediation**: Enforce MRA completeness at CI via an OTel attribute coverage gate. Set all resource attributes via `OTEL_RESOURCE_ATTRIBUTES` or the OTel Collector `resource` processor. Validate with `otelcol` debug exporter in staging before production rollout. +**See also**: `incident-forensics.md §2 Minimum Required Attributes` + +### Z.6 Cost dashboard accessible to all engineers without RBAC + +**Severity**: MEDIUM +**Why it fails**: Per-tenant cost data reveals revenue tier, contract value, and resource consumption patterns. Exposing this to all engineers violates least-privilege access principles and may constitute a data breach under GDPR if cost data is linked to identifiable customers. +**Remediation**: Separate cost dashboards by role: Finance sees full cost by tenant; Platform Engineering sees cost by namespace/workload; Application Engineering sees only their own service. Apply Grafana folder permissions or OPA policies. +**See also**: `signals/cost.md §9 Privacy & Access Control` + +--- + +## Contribution Protocol + +When adding an entry from another doc: + +1. Place it in the correct section (A–Z) by primary concern. +2. If the same anti-pattern appears in multiple source files, one canonical entry only — list all sources in "See also". +3. Use the format: `### {Section}.{n} {Pattern name}` / `**Severity**` / `**Why it fails**` / `**Remediation**` / `**See also**`. +4. Order within each section: CRITICAL → HIGH → MEDIUM → LOW. +5. 
All cross-references use relative paths from the `resources/` root.
+6. No forward references to planned but unwritten files.
diff --git a/.agents/skills/oma-observability/resources/boundaries/cross-application.md b/.agents/skills/oma-observability/resources/boundaries/cross-application.md
new file mode 100644
index 0000000..8d0bc5e
--- /dev/null
+++ b/.agents/skills/oma-observability/resources/boundaries/cross-application.md
@@ -0,0 +1,330 @@
+---
+otel_spec: "1.x (stable API/SDK)"
+otel_semconv: "1.27.0 (2024-11)"
+specs:
+  - "W3C Trace Context: Level 1 Recommendation 2020-02-06; Level 2 Candidate Recommendation"
+  - "W3C Baggage: Recommendation 2022-12-22"
+---
+
+# Cross-Application Observability Boundary
+
+## 1. Scope
+
+This file governs trace context, request correlation, causation chains, and baggage propagation across application boundaries — including service-to-service, cross-cloud, and cross-mesh calls.
+
+Design decision D3 absorbs two formerly separate documents into this file:
+
+- **Propagators** (formerly `propagators.md`): per-ecosystem header formats and normalization rules for W3C, B3, AWS X-Ray, GCP, Azure, Datadog, Cloudflare, Istio/Envoy, and Linkerd.
+- **DDD bounded context** (formerly `multi-domain.md`): `service.namespace` as the domain grouping attribute for multi-service architectures.
+
+Cross-references to other files in this skill use the `../` prefix convention.
+
+---
+
+## 2. 4-Layer Correlation Model
+
+Distributed systems require five distinct correlation identifiers across four layers (the causation layer carries both `causation_id` and `idempotency_key`), each serving a different audience and cardinality need. Using only `trace_id` conflates concerns; using only `request_id` loses async chain visibility.
+
+| Layer | Identifier | Propagation mechanism | Cardinality | Primary audience |
+|-------|-----------|----------------------|-------------|-----------------|
+| Infra / Trace | `trace_id` / `span_id` | W3C `traceparent` header | 1 per trace | Platform engineers, trace backends |
+| Correlation | `request_id` (`x-request-id`) | HTTP header + log field `http.request.id` | 1 per request | Customer support, error banners, frontend users |
+| Causation | `causation_id` | Message headers (Kafka, SQS, RabbitMQ) | 1 per event chain | Async workflow lineage, CQRS / event sourcing |
+| Causation | `idempotency_key` | HTTP header (`Idempotency-Key`) or message header | 1 per business operation | Duplicate-processing guard; distinct from causation |
+| Business | `user.id`, `tenant.id`, `order.id` | W3C Baggage (`baggage` header) | Per business entity | Product analytics, cross-cutting dashboards |
+
+> **Causation vs. idempotency are distinct** — `causation_id` expresses event ancestry (what triggered this event) and is often unique per edge in an event chain; `idempotency_key` is a business-level deduplication token shared by all retries of the same operation. Using a single header for both breaks dedup logic during retry if the causation chain diverges. Implement as two separate attributes / headers.
+
+### 2.1 trace_id
+
+Auto-generated by the OTel SDK at the trace root. 128-bit, encoded as 32 lowercase hex characters. Carried in `traceparent`. This is an **internal platform identifier** — do not expose it directly to end users because it reveals internal infrastructure topology. Its role is to be the carrier that links all spans in one distributed trace.
+
+### 2.2 request_id
+
+Human-facing correlation ID.
Must appear in: + +- Every outbound HTTP response header (`x-request-id`) +- Every log record as a structured field (`http.request.id` span attribute) +- User-visible error banners on the frontend (support ticket anchor) + +The frontend generates a UUIDv4 `request_id` and sends it on every outbound request. Middleware sets the span attribute and logs the field. Customer support agents hand users a `request_id` from the error banner; platform engineers query logs to retrieve `trace_id` and pivot to the distributed trace waterfall. + +Cross-ref `../incident-forensics.md §MRA` for the support-to-trace lookup flow. + +### 2.3 causation_id / idempotency_key + +Event-driven and async workflows break the synchronous call chain. A Kafka consumer processing a payment event may run minutes after the HTTP request that produced it. `causation_id` links the new trace back to the originating event chain without forcing a parent-child span relationship. + +- HTTP POST: set `Idempotency-Key` request header → record as span attribute +- Kafka producer: write `causation_id` into Kafka message header +- Kafka consumer: extract `causation_id` → create OTel span link (not parent-child) to the originating span + +Span links are the correct OTel primitive for async causation. Cross-ref `../signals/traces.md §Messaging patterns`. + +### 2.4 Business context via Baggage + +Business identifiers (`user.id`, `tenant.id`, `order.id`, `feature.variant`) propagate as W3C Baggage key-value pairs. They flow through every service on the call path and appear as span attributes and log fields, enabling cross-cutting dashboard filters (e.g., "all traces for tenant ACME in the last hour"). + +Baggage carries PII risk. See Section 4 for trust-boundary filtering rules. + +--- + +## 3. Propagator Matrix + +The recommended default for all new deployments is **W3C Trace Context**. At ingress or multi-cloud boundaries, extract all known propagator formats and emit W3C downstream — normalization at the edge prevents split-brain traces inside the cluster. + +| Ecosystem | Header(s) | OTel propagator | Normalization notes | +|-----------|-----------|----------------|---------------------| +| **W3C Trace Context** (default) | `traceparent`, `tracestate`, `baggage` | `tracecontext`, `baggage` | L1 Rec 2020-02-06; L2 CR adds random-id flag. Mandatory baseline for all new services. | +| **B3** (Zipkin legacy) | `x-b3-traceid`, `x-b3-spanid`, `x-b3-parentspanid`, `x-b3-sampled` | `b3multi` / `b3` (single-header) | Use multi-header format. Keep as secondary propagator for Zipkin-legacy services; migrate to W3C for new work. | +| **AWS X-Ray** | `X-Amzn-Trace-Id` (`Root=1-<ts>-<hex>;Parent=<id>;Sampled=<0\|1>`) | `awsxray` | AWS X-Ray SDK supports W3C output since 2022. Prefer W3C; use X-Ray propagator only for Lambda or services that require X-Ray console trace linking. | +| **GCP Cloud Trace** | `traceparent` (W3C preferred), legacy `X-Cloud-Trace-Context` | `gcp` propagator | GCP supports both formats. Emit W3C; accept legacy for backward compat. | +| **Azure Monitor** | `traceparent` (W3C preferred), legacy `Request-Id` | W3C preferred | Application Insights SDK v3+ defaults to W3C. Treat `Request-Id` as read-only legacy input only. | +| **Datadog APM** | `x-datadog-trace-id`, `x-datadog-parent-id`, `x-datadog-sampling-priority` | `datadog` propagator | Configure Datadog Agent to accept W3C input; use OTel Datadog exporter for span forwarding. 
| +| **Cloudflare Workers** | `cf-ray` + W3C `traceparent` | Map `cf-ray` → baggage | `cf-ray` is not a trace ID. Map it to `baggage: cf.ray=<value>` for correlation. Emit W3C `traceparent` on Worker egress calls. | +| **Istio / Envoy** | W3C `traceparent` + Envoy internal `x-request-id`, `x-ot-span-context` (legacy) | W3C (OTLP tracer, Envoy 1.29+) | `x-ot-span-context` is deprecated as of Envoy 1.29. Cross-ref `../layers/mesh.md §Propagator headers`. | +| **Linkerd** | `l5d-ctx-trace`, `l5d-ctx-span`, `l5d-ctx-parent`, `l5d-ctx-deadline` | `linkerd` propagator | Linkerd headers are not W3C-compatible. Translate to W3C at the mesh boundary gateway. Cross-ref `../layers/mesh.md §Cross-mesh compatibility rule`. | + +### 3.1 Multi-propagator strategy at ingress + +Configure the ingress gateway or API gateway collector with a composite propagator that extracts all known formats. After extraction, emit only W3C downstream: + +``` +Composite extract order (ingress): + 1. tracecontext (W3C — primary) + 2. b3multi (Zipkin legacy) + 3. awsxray (AWS Lambda / API Gateway) + 4. datadog (Datadog-instrumented upstream) + +Inject order (egress, internal): + 1. tracecontext (W3C — only format emitted internally) + 2. baggage (W3C Baggage — always) +``` + +This pattern ensures that a request arriving from an AWS Lambda (X-Ray header) or a Datadog-instrumented partner API is absorbed into the W3C trace context and propagated uniformly to all internal services. + +--- + +## 4. Baggage Rules and PII Guidance + +W3C Baggage propagates to every service on the call path. The W3C Baggage specification §Security states: + +> "Application owners should either ensure that no proprietary or confidential information is stored in baggage, or ensure baggage is not present in requests that cross trust boundaries." + +The OTel Baggage API spec does not independently repeat this warning — the normative source is the W3C parent specification. Implementers MUST read both. + +Cross-ref `../signals/privacy.md §Baggage rules` for the enforcement implementation. + +### 4.1 Trust-boundary rule + +At every egress trust boundary (API gateway, external webhook, third-party vendor API), apply a baggage filter: + +- **Default**: strip all baggage unless the receiving service is on the explicit allowlist. +- **Allowlist**: maintained per-environment; requires security team sign-off to add entries. +- **Internal-to-internal**: baggage passes through without filtering (intra-cluster calls within the same trust zone). + +### 4.2 Allowed and prohibited baggage values + +| Allowed (safe at any trust level) | Prohibited (must never appear in baggage) | +|-----------------------------------|------------------------------------------| +| `tenant.id` (opaque identifier) | `user.email` | +| `user.tier` (tier name, not PII) | Authentication tokens or session IDs | +| `feature.variant` | Credit card numbers or PAN | +| `deployment.sha` | Passwords or API keys | +| `region.hint` | Any field classified as PII under GDPR / PIPA | + +### 4.3 Enforcement point + +The mesh ingress gateway is the primary enforcement point. Configure Envoy's Lua filter or an OTel Collector `attributes` processor at the gateway to drop prohibited keys. Cross-ref `../layers/mesh.md §Baggage scrubbing`. + +--- + +## 5. 
DDD Bounded Context via service.namespace
+
+OpenTelemetry resource attributes `service.name` and `service.namespace` map directly to DDD concepts:
+
+| OTel attribute | DDD concept | Example |
+|----------------|-------------|---------|
+| `service.namespace` | Bounded context (domain group) | `payments`, `inventory`, `identity` |
+| `service.name` | Individual microservice | `checkout-api`, `stock-api`, `auth-service` |
+
+### 5.1 Why service.namespace matters
+
+Without `service.namespace`, a system with 80+ microservices presents a flat list in trace backends and dashboards. Operators cannot distinguish domain-level KPIs from service-level noise.
+
+With `service.namespace`, dashboards can be scoped:
+
+- `service.namespace = payments` → filter all payment-domain spans, compute payment-domain SLI
+- `service.namespace = inventory` → separate inventory burn-rate alert from payments alert
+
+### 5.2 Configuration
+
+Set `service.namespace` as an OTel resource attribute at deployment time, not at instrumentation time:
+
+```yaml
+# OTel Collector resourcedetection / resource processor
+processors:
+  resource:
+    attributes:
+      - key: service.namespace
+        value: payments
+        action: upsert
+      - key: service.name
+        value: checkout-api
+        action: upsert
+```
+
+Alternatively, set it via an environment variable that the SDK's Environment Resource Detector picks up:
+
+```
+OTEL_RESOURCE_ATTRIBUTES=service.namespace=payments,service.name=checkout-api
+```
+
+### 5.3 Anti-pattern: flat service registry
+
+One hundred services without namespace grouping cause cognitive overload in the trace backend service map. Namespace assignment is a required step in any multi-service deployment. Assign namespaces aligned with DDD bounded contexts, not with team names or infrastructure regions.
+
+---
+
+## 6. Cross-Cloud Trace Continuity
+
+When a request crosses cloud provider boundaries (e.g., App A on AWS calls App B on GCP calls App C on-premises), trace continuity requires W3C `traceparent` on every hop.
+
+| Provider | W3C support status |
+|----------|--------------------|
+| AWS X-Ray SDK | W3C output supported since 2022 |
+| GCP Cloud Trace | W3C `traceparent` preferred; legacy `X-Cloud-Trace-Context` still accepted |
+| Azure Monitor | W3C `traceparent` preferred; Application Insights SDK v3+ default |
+| Jaeger (self-managed) | W3C Trace Context since Jaeger v1.35 |
+
+Collector bridge strategy: deploy an OTel Collector at each cloud egress point. The Collector normalizes incoming vendor headers to W3C before forwarding to the next cloud's ingress. This ensures that a trace rooted in AWS survives the GCP boundary and terminates in an on-premises Jaeger backend with an unbroken span chain.
+
+Cross-ref `../transport/collector-topology.md §Multi-cluster/regional` for Collector deployment patterns.
+
+---
+
+## 7. request_id to trace_id Integration
+
+The support escalation flow depends on reliable linkage between the user-visible `request_id` and the internal `trace_id`.
+
+**Flow:**
+
+1. Frontend generates UUIDv4 `request_id` and sends it on every outbound request.
+2. Backend middleware extracts `request_id`, sets span attribute `http.request.id`, and writes it as a structured log field.
+3. On error, the frontend displays the `request_id` to the user (never `trace_id` — the latter leaks internal topology).
+4. Customer support receives the `request_id` from the user's error screenshot.
+5.
Platform engineer queries the log system: `http.request.id = "<request_id>"` → retrieves `trace_id` from the matching log record.
+6. Platform engineer opens the trace waterfall in the trace backend using `trace_id`.
+
+Cross-ref `../incident-forensics.md §MRA` for the full Minimum Required Attributes (MRA) workflow.
+
+---
+
+## 8. Idempotency and Event-Driven Trace Lineage
+
+Idempotency keys and causation IDs are first-class observability primitives in event-driven architectures.
+
+**HTTP pattern (Stripe-style):**
+
+- Client sends `Idempotency-Key: <uuid>` header on POST.
+- Server records the key as span attribute `http.idempotency_key`.
+- On duplicate detection, the existing response is returned; the `trace_id` of the original attempt is logged alongside the duplicate attempt's `trace_id`.
+
+**Kafka / SQS pattern:**
+
+- Producer writes `causation_id` (UUIDv4) and original `traceparent` into message headers.
+- Consumer extracts both headers before creating the `CONSUMER` span.
+- Consumer span links to the producer span via `span.addLink(producerSpanContext)`.
+- The link — not parent-child — preserves trace lineage without forcing synchronous ordering assumptions.
+
+**DLQ replay:**
+
+- When replaying from a dead letter queue, re-inject the original `traceparent` and `causation_id` from the failed message headers. Generating a new `trace_id` at replay severs the forensic chain.
+
+Cross-ref `../signals/traces.md §Messaging patterns` for span link implementation details.
+
+---
+
+## 9. Trust Boundary Patterns
+
+| Boundary type | Trace behavior | Baggage behavior |
+|---------------|---------------|-----------------|
+| Internal service to internal service (same cluster) | Forward `traceparent` unchanged | Forward baggage unchanged |
+| Internal service to external API gateway (outbound) | Create new CLIENT span; include `traceparent` only if vendor is on allowlist | Strip all baggage unless vendor is on explicit allowlist |
+| B2B webhook (outbound to partner) | Start a new trace; log original `trace_id` as annotation in the outbound span only | Do not propagate baggage |
+| Third-party vendor API call | Do not propagate `traceparent` unless vendor explicitly supports it | Do not propagate baggage |
+| External partner calling inbound | Extract `traceparent` if present; validate and sanitize | Strip baggage at ingress gateway; reattach only allowlisted keys |
+
+---
+
+## 10. Context Inheritance in Non-Interactive Sessions
+
+CI/CD pipelines, agent workflows, and background jobs need trace context for end-to-end visibility across automated steps.
+
+Use the `TRACEPARENT` environment variable for child-process context inheritance. Tooling that implements this environment-variable carrier convention (for example `otel-cli` and CI instrumentation wrappers) reads `TRACEPARENT` and applies it as the parent context for the first span created in that process. Reading `TRACEPARENT` is not part of the stable OTel SDK specification (the Environment Resource Detector reads only resource variables such as `OTEL_RESOURCE_ATTRIBUTES`), so verify that your SDK or wrapper supports it before relying on it.
+
+```bash
+# CI/CD pipeline: propagate trace context to child build steps
+export TRACEPARENT="00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01"
+# Child process (e.g., test runner) inherits the parent span context
+```
+
+This pattern enables a single trace to span from the CI trigger event through build, test, deploy, and smoke-test verification steps. Cross-ref `../SKILL.md` for the TRACEPARENT env var description in the skill architecture.
+
+---
+
+## 11.
Matrix Coverage + +These cells from `../matrix.md` are primarily driven by this file: + +| Layer | Boundary | Signal | Symbol | Primary artifact | +|-------|----------|--------|--------|-----------------| +| mesh | cross-application | traces | ✅ | W3C `traceparent` in mesh proxy; zero-code injection | +| L7-application | cross-application | traces | ✅ | OTel SDK W3C propagation; DDD `service.namespace` grouping | +| L7-application | cross-application | privacy | ✅ | Baggage trust-boundary filtering at API gateway | +| L7-application | cross-application | audit | ✅ | Correlation across audit events via `trace_id` + `request_id` | +| L3-network | cross-application | traces | ⚠️ | L3 packets carry no trace context; tagging at egress only | +| L4-transport | cross-application | traces | ⚠️ | TCP is not trace-native; trace context begins at mesh or L7 | + +--- + +## 12. Anti-Patterns + +The following extend `../anti-patterns.md §Cross-application`. + +| Anti-pattern | Consequence | Remediation | +|---|---|---| +| Mixed propagators without normalization at ingress | Broken traces at mesh or cloud boundary; spans appear as disconnected root spans | Apply composite propagator at ingress gateway; normalize to W3C downstream | +| Baggage carrying PII crossing a trust boundary | W3C Baggage spec §Security violation; PII exposed to untrusted downstream services | Strip baggage at egress gateway; use allowlist; never put PII in baggage | +| `request_id` not exposed to frontend | Customer support cannot correlate user-reported errors to backend traces | Return `x-request-id` in every response; display in error banners | +| `service.namespace` not set | 100+ services appear as a flat unordered list; domain-level KPIs are impossible | Set `service.namespace` per DDD bounded context in OTel resource attributes | +| B3-only propagation without W3C fallback | Traces break when crossing any non-Zipkin boundary (GCP, AWS, Azure) | Add `tracecontext` as primary propagator; keep `b3multi` as secondary for legacy | +| New `trace_id` generated on DLQ replay | Original failure span is orphaned; forensic chain from root cause to replay is severed | Re-inject original `traceparent` and `causation_id` from failed message headers | + +--- + +## References + + + +Internal cross-references: + +- `../standards.md` — normative semconv stability tiers and W3C spec versions +- `../matrix.md` — full 112-cell coverage map (cross-application boundary row) +- `../layers/mesh.md` — propagator headers per mesh; baggage scrubbing at ingress gateway +- `../signals/traces.md` — OTel span data model, messaging span links, baggage security +- `../signals/privacy.md` — PII classification, baggage PII redaction rules +- `../signals/audit.md` — audit event correlation across application boundaries +- `../transport/collector-topology.md` — multi-cluster and multi-region Collector bridge patterns +- `../incident-forensics.md` — request_id to trace_id lookup flow (MRA section) + +### Primary sources + +- W3C Trace Context Level 1 Recommendation: <https://www.w3.org/TR/trace-context/> +- W3C Trace Context Level 2 Candidate Recommendation: <https://www.w3.org/TR/trace-context-2/> +- W3C Baggage Recommendation: <https://www.w3.org/TR/baggage/> +- OTel Baggage API specification: <https://opentelemetry.io/docs/specs/otel/baggage/api/> +- AWS X-Ray trace header: <https://docs.aws.amazon.com/xray/latest/devguide/xray-concepts.html#xray-concepts-tracingheader> +- GCP Cloud Trace context propagation: 
<https://cloud.google.com/trace/docs/trace-context> +- Datadog trace context propagation: <https://docs.datadoghq.com/tracing/trace_collection/trace_context_propagation/> +- Linkerd distributed tracing: <https://linkerd.io/2.15/features/distributed-tracing/> diff --git a/.agents/skills/oma-observability/resources/boundaries/multi-tenant.md b/.agents/skills/oma-observability/resources/boundaries/multi-tenant.md new file mode 100644 index 0000000..a42f679 --- /dev/null +++ b/.agents/skills/oma-observability/resources/boundaries/multi-tenant.md @@ -0,0 +1,308 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +specs: + - "W3C Baggage: Recommendation 2022-12-22" + - "GDPR: Regulation (EU) 2016/679; KR PIPA: amended 2023" +notes: + - "OpenCost: CNCF Incubating (advanced 2024-10-31)" +--- + +# Multi-Tenant Observability + +## 1. Scope + +Multi-tenant observability covers the collection, routing, isolation, attribution, and residency of telemetry signals in a B2B SaaS platform where multiple customer tenants share underlying infrastructure. + +**In scope:** +- Tenant isolation strategy across four tiers (soft / routing / dedicated-collector / dedicated-backend) +- Tenant attribute propagation via W3C Baggage +- Per-tenant sampling policies +- Per-tenant retention schedules +- Cost attribution per tenant (chargeback / showback) +- Data residency routing for GDPR (EU) and PIPA (KR) tenants + +**Out of scope (related but distinct):** +- Cross-service propagation mechanics — see `cross-application.md` (propagators, baggage rules) +- FinOps unit economics and OpenCost metric surface — see `../signals/cost.md` +- PII redaction and anonymization rules per tenant — see `../signals/privacy.md` + +--- + +## 2. OTel Attribute Conventions for Tenant + +OpenTelemetry Semantic Conventions do not include a stable `tenant.*` group as of semconv 1.27.0. The attributes below are **custom application-defined** attributes. Follow the naming convention from `../standards.md §3` and prefix with the domain noun to avoid collisions with future OTel semconv additions. + +| Attribute | Type | Example | Status | +|-----------|------|---------|--------| +| `tenant.id` | string | `"acme-corp"` | Custom (not OTel Stable) — use consistently across all signals | +| `tenant.tier` | string enum | `"free"` / `"pro"` / `"enterprise"` | Custom — drives sampling and routing decisions | +| `tenant.region` | string | `"eu-west-1"` / `"ap-northeast-2"` | Custom — drives data residency routing | + +**Naming rationale:** dot-separated namespace (`tenant.*`) matches OTel semconv style and avoids the underscore ambiguity common in Prometheus label names. Do not use `customer_id`, `org_id`, or `account_id` for the same concept — pick one canonical key and propagate it everywhere. + +--- + +## 3. Four-Tier Isolation Strategy + +Most B2B SaaS organizations apply a mix of tiers: enterprise tenants get Tier 3 or 4, pro gets Tier 2, and free gets Tier 1. Select the highest tier required by the tenant's compliance obligations. + +| Tier | Description | Relative Cost | Isolation Strength | Compliance Fit | +|------|-------------|---------------|--------------------|----------------| +| 1. Soft | Shared collector + shared backend; tenants are separated only by `tenant.id` label filtering in dashboards and queries | Low | Weak — no pipeline isolation; noisy-neighbor risk | Basic B2B without data separation requirements | +| 2. 
Routing | Shared collector pool; `routing_connector` or `tail_sampling` sub-policies split pipelines by tenant tier; still shared backend | Medium | Medium — pipeline isolation; shared storage | Regulated tiers with data processing agreements | +| 3. Dedicated collector | Per-tenant collector instance in a dedicated Kubernetes namespace; isolates ingestion and processing; shared or per-region backend | High | Strong — ingestion isolated; namespace-level blast radius | Enterprise tenants, HIPAA, ISO 27001 requirements | +| 4. Dedicated backend | Per-tenant observability backend project or account (e.g., separate Grafana org, separate Datadog account, separate GCP project) | Highest | Strongest — full stack isolation from ingestion to storage | Highest compliance obligations (FedRAMP, SOC 2 Type II per tenant, GDPR Art. 28 sub-processor separation) | + +**Routing connector alpha caveat:** Tier 2 using `routing_connector` is subject to the alpha stability warning documented in `../transport/sampling-recipes.md §4`. For production Tier 2 deployments, prefer `tail_sampling` with `and` sub-policies (stable) over `routing_connector` (alpha as of 2025). + +--- + +## 4. Tenant ID Propagation + +Tenant context must be carried across service boundaries so every span, metric, and log record emitted by any service is attributable to its originating tenant. + +**Mechanism:** W3C Baggage (`baggage` header, Recommendation 2022-12-22). Set `tenant.id` and `tenant.tier` as baggage entries at the ingress gateway. All downstream services read from the OTel Baggage API and apply these values as resource or span attributes. + +``` +Ingress Gateway + → set baggage: tenant.id=acme-corp, tenant.tier=enterprise + ↓ +Service A (reads baggage → sets span attribute tenant.id) + ↓ +Service B (reads baggage → sets span attribute tenant.id) +``` + +**Trust-boundary warning:** W3C Baggage is visible to every service in the propagation chain, including third-party or external services. The W3C Baggage specification (https://www.w3.org/TR/baggage/) explicitly notes that baggage values cross trust boundaries. Carrying `tenant.id` to external egress endpoints leaks customer account existence information. + +Rule: strip or validate `tenant.*` baggage entries at the egress gateway before forwarding to any external third-party endpoint. Internal propagation only. Cross-ref `../signals/privacy.md §Common PII in Telemetry` for baggage PII rules. + +--- + +## 5. Per-Tenant Sampling + +Different tenant tiers justify different sampling rates. Enterprise tenants have SLA obligations and full debugging requirements; free tenants justify only ambient visibility. 
+
+**Per-tier sampling targets:**
+
+| Tenant Tier | Trace Sampling Rate | Rationale |
+|-------------|---------------------|-----------|
+| `enterprise` | 100% | SLA obligations, full debugging capability, compliance audit trail |
+| `pro` | 20% | Representative sample, cost-controlled |
+| `free` | 2% | Ambient visibility only |
+
+**Recommended configuration — `tail_sampling` with `and` sub-policies (stable, production-safe):**
+
+```yaml
+processors:
+  tail_sampling:
+    decision_wait: 30s
+    num_traces: 100000
+    expected_new_traces_per_sec: 1000
+    policies:
+      - name: enterprise
+        type: and
+        and:
+          and_sub_policy:
+            - name: tier-check
+              type: string_attribute
+              string_attribute:
+                key: tenant.tier
+                values: ["enterprise"]
+            - name: probabilistic
+              type: probabilistic
+              probabilistic:
+                sampling_percentage: 100
+
+      - name: pro
+        type: and
+        and:
+          and_sub_policy:
+            - name: tier-check
+              type: string_attribute
+              string_attribute:
+                key: tenant.tier
+                values: ["pro"]
+            - name: probabilistic
+              type: probabilistic
+              probabilistic:
+                sampling_percentage: 20
+
+      - name: free-baseline
+        type: probabilistic
+        probabilistic:
+          sampling_percentage: 2   # catches free tier and unmatched traffic
+```
+
+Cross-ref `../transport/sampling-recipes.md §4` for the full tenant-aware sampling recipe including `routing_connector` Option A (alpha) and the combined error + cost + tenant four-policy example.
+
+---
+
+## 6. Per-Tenant Retention
+
+Retention schedules must be enforced per tier. Hot storage is fast-query; warm is compressed but queryable; cold is archival with retrieval latency.
+
+| Tier | Hot | Warm | Cold |
+|------|-----|------|------|
+| Enterprise | 90 days | 1 year | 3 years |
+| Pro | 30 days | 90 days | — |
+| Free | 7 days | 30 days | — |
+
+**Implementation:** Apply Kubernetes-native or backend-native lifecycle policies keyed on the `tenant.id` and `tenant.tier` labels. For shared backends (Tier 1–2), use label-based TTL rules or index lifecycle management (e.g., OpenSearch ISM policies, Loki retention rules, Thanos compactor retention). For dedicated backends (Tier 3–4), set backend project-level retention per tenant.
+
+Cross-ref `../meta-observability.md §Retention Matrix` for the full retention policy table covering all seven signals.
+
+---
+
+## 7. Cost Attribution
+
+Tenant-level cost attribution enables chargeback (billing tenants for their resource consumption) and showback (internal reporting without billing).
+
+**Kubernetes workload labeling:** Tag every pod at deploy time with `tenant.id` as a Kubernetes label. OpenCost reads workload labels and produces `opencost_workload_cost_total{tenant_id="acme-corp"}` automatically.
+
+```yaml
+# Kubernetes Pod template label (applied via Helm values or admission webhook)
+metadata:
+  labels:
+    tenant.id: "acme-corp"
+    tenant.tier: "enterprise"
+```
+
+**Telemetry cost attribution:** The observability bill itself (e.g., Datadog RUM sessions per tenant, Honeycomb events per tenant) must also be attributed. Instrument the collector pipeline with throughput counters per tenant to apportion the observability bill proportionally.
+
+**Cardinality constraint:** `tenant.id` used as a metric label must be capped at the top-N active tenant count (100–1,000 tenants is safe for most TSDBs). Beyond this threshold, bucket overflow tenants under an `"other"` label to prevent metric series explosion. Cross-ref `../meta-observability.md §Cardinality Guardrails`.
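+
+As a minimal sketch of the top-N cap, the two Prometheus recording rules below keep per-tenant series for the 100 busiest tenants and roll everything else into a single aggregate. The metric (`http_requests_total`) and label (`tenant_id`) names are illustrative assumptions, not part of this skill's required schema:
+
+```yaml
+groups:
+  - name: tenant-cardinality-cap
+    interval: 1m
+    rules:
+      # Per-tenant request rate, kept only for the top 100 tenants.
+      - record: tenant:request_rate5m:topn
+        expr: topk(100, sum by (tenant_id) (rate(http_requests_total[5m])))
+      # Everything outside the top 100, collapsed into one unlabeled
+      # series (the "other" bucket).
+      - record: tenant:request_rate5m:other
+        expr: >
+          sum(rate(http_requests_total[5m]))
+          - sum(topk(100, sum by (tenant_id) (rate(http_requests_total[5m]))))
+```
+
+`topk` membership can churn between evaluations, so dashboards built on the `topn` rule should tolerate tenants entering and leaving the top-N set.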
+ +Cross-ref `../signals/cost.md §4 Cost Attribution by Dimension` for FinOps unit economics (per-tenant PromQL formulas, OpenCost architecture, and FOCUS spec). + +--- + +## 8. Data Residency + +GDPR Chapter V (https://gdpr-info.eu/chapter-5/) restricts transfers of personal data outside the EU/EEA to countries or organizations that provide adequate protection. Korean PIPA (https://www.pipa.go.kr) applies equivalent restrictions for Korean resident data. + +**Routing rules:** + +| Tenant Region | Collector Placement | Backend Placement | Cross-Region Allowed? | +|---------------|--------------------|--------------------|----------------------| +| EU (`eu-*`) | EU-region edge collector | EU-region backend only | No — GDPR Chapter V | +| KR (`ap-northeast-2`) | KR-region edge collector | KR-region backend only | No — PIPA | +| US (`us-*`) | US-region collector | US-region or global backend | Yes (to non-EU/KR) | +| Other | Regional or global collector | Regional or global backend | Yes (check bilateral agreements) | + +**Topology:** deploy per-region edge collectors that aggregate locally and export only to backends in the same region. No cross-region OTLP export for EU or KR tenants. + +``` +EU Tenants → EU Edge Collector → EU Backend (e.g., eu-west-1 Grafana Cloud) +KR Tenants → KR Edge Collector → KR Backend (e.g., ap-northeast-2 region) +US Tenants → US Edge Collector → US Backend or global aggregator +``` + +Route by `tenant.region` at the ingress gateway before data enters the collector pipeline. Do not allow EU or KR tenant telemetry to flow through a non-compliant region, even transiently. + +**Source-of-truth rule (critical)**: `tenant.region` MUST be resolved from an internal, server-side authoritative source — tenant registry service, organization metadata table, or IdP claim stamped at session start. It MUST NOT be trusted from client-supplied input (HTTP header, baggage, query string, or JWT claim the client itself controls). A misconfigured or malicious tenant could otherwise self-declare a non-EU/KR region and bypass residency routing. Enforce at the ingress gateway: reject requests where a client-declared `tenant.region` disagrees with the registry lookup keyed on `tenant.id`. In practice, strip any inbound `tenant.region` attribute and re-attach the registry-sourced value before the Collector pipeline accepts the span/log. + +Cross-ref `../transport/collector-topology.md §7 Federated / Multi-Cluster` for the multi-region edge topology diagram. Cross-ref `../signals/privacy.md §2 Regulatory Drivers` for GDPR and PIPA penalty context and `../signals/privacy.md §Cross-border transfer` for PII-specific cross-border rules. + +--- + +## 9. Tenant Onboarding and Offboarding + +**Onboarding checklist:** +1. Provision `tenant.id` and `tenant.tier` as Kubernetes labels on all tenant workloads (via Helm values or admission webhook). +2. Create per-tenant routing rule if Tier 2+. +3. Provision per-tenant collector namespace if Tier 3+. +4. Create per-tenant backend project or organization if Tier 4. +5. Create per-tenant dashboard folder with RBAC rules (see §10). +6. Register tenant in cardinality allowlist (top-N cap enforcement). + +Automate steps 1–6 as code; cross-ref `../observability-as-code.md` for provisioning patterns. + +**Offboarding — GDPR Art. 
17 Right to Erasure:** + +When a tenant terminates their contract, all telemetry data containing `tenant.id` must be deleted across every storage tier (hot, warm, cold) and every signal (metrics, logs, traces, profiles, cost records, audit records). This is a legal obligation under GDPR Art. 17, not an engineering convenience. + +Offboarding procedure: +1. Trigger deletion job across all backends scoped to `tenant.id`. +2. Remove tenant from cardinality allowlist and routing rules. +3. Deprovision collector namespace (Tier 3) or backend account (Tier 4). +4. Emit an audit event recording the erasure action, timestamp, and operator identity. + +Cross-ref `../signals/audit.md` for audit event schema for offboarding erasure events. + +--- + +## 10. Dashboard Isolation + +| Tier | Dashboard Isolation Mechanism | +|------|-------------------------------| +| Tier 1 | Grafana folder per tenant; dashboard variables filter by `tenant.id` label | +| Tier 2 | Grafana folder per tenant; Grafana RBAC restricts folder access by team | +| Tier 3 | Grafana organization per tenant; or Honeycomb environment per tenant | +| Tier 4 | Dedicated backend instance; tenant admin is org owner in their own account | + +**RBAC rule:** a tenant admin identity claim must be mapped to a Grafana or backend role that scopes data access strictly to that tenant's `tenant.id`. Cross-org or cross-tenant data leakage via dashboard query is a compliance violation. + +Cross-ref `../signals/privacy.md §Backend RBAC` for Grafana RBAC configuration patterns and OPA policy rules for query-level tenant isolation. + +--- + +## 11. Noisy Neighbor Protection + +Shared infrastructure (Tier 1 and Tier 2) is vulnerable to one high-volume tenant degrading the observability pipeline for all other tenants. + +**Controls:** + +| Control | Scope | Mechanism | +|---------|-------|-----------| +| Per-tenant ingress rate limit | Collector receiver level | `ratelimiter` extension (alpha) or `filter` processor tied to per-tenant token-bucket state; pair with `memory_limiter` as backpressure | +| Per-tenant cardinality quota | Metrics pipeline | Top-N series cap per `tenant.id`; overflow bucketed as `"other"` | +| Circuit breaker on ingress | Collector pipeline | `memory_limiter` processor per-tenant pipeline; shed load when memory exceeds threshold | +| Per-tenant queue depth limit | Exporter queue | `sending_queue` max size per tenant pipeline (Tier 2 routing) | + +Apply rate limits at the first collector tier (agent or edge). A tenant exceeding its quota must receive a well-defined error (e.g., OTLP `ResourceExhausted` gRPC status) rather than silently dropping data. + +--- + +## 12. Matrix Cells — Multi-Tenant Row + +Quick navigation for multi-tenant boundary cells in `../matrix.md`: + +| Layer | Signal | Status | Artifact | +|-------|--------|--------|----------| +| L3-network | metrics | ✅ | VPC flow logs per tenant; egress bytes attributed by source CIDR mapped to `tenant.id` | +| L7-application | traces | ✅ | W3C Baggage carries `tenant.id`; all spans tagged at ingress | +| L7-application | cost | ✅ | OpenCost workload attribution via `tenant.id` pod label; per-tenant PromQL aggregation | +| L7-application | privacy | ✅ | Per-tenant PII redaction rules; per-tenant residency routing | +| L7-application | audit | ✅ | Per-tenant audit trail; erasure events on offboarding | + +--- + +## 13. 
Anti-Patterns + +The following are candidates for `../anti-patterns.md §Multi-Tenant`: + +| Anti-Pattern | Impact | Correction | +|-------------|--------|------------| +| `tenant.id` as metric label without top-N cap | Cardinality explosion in TSDB; ingestor OOM; query timeouts for all tenants | Enforce top-N cap (100–1,000); bucket overflow as `"other"`; cross-ref `../meta-observability.md §Cardinality Guardrails` | +| `tenant.id` in W3C Baggage crossing trust boundaries | Tenant account existence leaks to third-party services; GDPR personal data transfer without legal basis | Strip `tenant.*` baggage at egress gateway before forwarding to any external endpoint | +| Shared backend for regulated tiers (Tier 1 for compliance tenants) | Co-mingled data violates data processing agreements; one breach affects all tenants | Upgrade regulated tenants to Tier 3 or 4; apply isolation tier based on contractual obligation, not cost convenience | +| Cross-region OTLP export for EU or KR tenants | GDPR Chapter V violation; personal data transfer to non-adequate third country; regulatory fine risk | Route EU/KR telemetry to region-local backends exclusively; enforce at ingress gateway by `tenant.region` | +| No tenant offboarding erasure process | GDPR Art. 17 violation; deleted tenant data persists in hot/warm/cold tiers and backup snapshots | Implement automated erasure job scoped by `tenant.id` across all storage tiers; emit audit event per erasure | + +--- + +## Cross-References + +| Topic | File | +|-------|------| +| Baggage propagation mechanics and trust-boundary rules | `cross-application.md` | +| Full FinOps cost attribution and OpenCost metric surface | `../signals/cost.md` | +| PII redaction, anonymization, and GDPR/PIPA regulatory detail | `../signals/privacy.md` | +| Tenant-aware sampling recipes (routing_connector, tail_sampling) | `../transport/sampling-recipes.md §4` | +| Multi-cluster and regional collector topology | `../transport/collector-topology.md §7` | +| Retention matrix for all seven signals | `../meta-observability.md §Retention Matrix` | +| Cardinality guardrails and top-N cap | `../meta-observability.md §Cardinality Guardrails` | +| Dashboard RBAC and query-level tenant isolation | `../signals/privacy.md §Backend RBAC` | +| Audit event schema for offboarding erasure | `../signals/audit.md` | +| Observability-as-code provisioning for tenant onboarding | `../observability-as-code.md` | +| Full 112-cell coverage matrix | `../matrix.md` | diff --git a/.agents/skills/oma-observability/resources/boundaries/release.md b/.agents/skills/oma-observability/resources/boundaries/release.md new file mode 100644 index 0000000..76b75a3 --- /dev/null +++ b/.agents/skills/oma-observability/resources/boundaries/release.md @@ -0,0 +1,372 @@ +# Release Boundary + +## 1. Scope + +The release boundary is the **temporal boundary between versions of a service** — the window +where new code enters production and either stabilizes or is rolled back. + +Covers: progressive delivery strategies, Flagger canary analysis, Argo Rollouts, OpenFeature +feature flags, GitOps engine reconcile observability, and Kubernetes operator reconcile metrics. + +Out of scope: SLI/SLO definitions and error budget math (see `slo.md`); dashboard layout +and code (see `../observability-as-code.md`); post-incident timelines (see `../incident-forensics.md`). + +This boundary **consumes** SLI/SLO metrics as promotion gates; it does not define SLOs. + +--- + +## 2. 
Progressive Delivery Strategies + +| Strategy | Traffic split | Blast radius | Metric feedback speed | When to use | +|----------|--------------|-------------|----------------------|-------------| +| **Blue/Green** | Full switch | High | Fast (all traffic) | Low-frequency, reversible deploys | +| **Canary** | Gradual % shift | Low to high | Progressive | Most stateless services | +| **A/B testing** | Cohort split | Low | Slow (UX signal) | Product/UX variant evaluation | + +Choose based on: blast radius tolerance × metric feedback speed. Canary is the default +recommendation for services with Prometheus SLI coverage. + +--- + +## 3. Flagger (CNCF Graduated, part of Flux) + +Source: <https://flagger.app> + +Flagger automates canary promotion and rollback decisions using Prometheus, Datadog, or +NewRelic metric queries. It integrates natively with Flux and supports any service mesh or +ingress (Istio, Linkerd, Nginx, Contour). + +**Feedback loop:** Flagger samples metrics on a configurable interval (30s–60s). If the +metric breaches the threshold for N consecutive samples (`failureThreshold`), it rolls back. +Promotion requires all metric checks to pass for the full `iterations` count. + +```yaml +# Flagger Canary CR — canary analysis referencing PromQL SLI +apiVersion: flagger.app/v1beta1 +kind: Canary +metadata: + name: checkout + namespace: prod +spec: + targetRef: + apiVersion: apps/v1 + kind: Deployment + name: checkout + progressDeadlineSeconds: 600 + service: + port: 8080 + analysis: + interval: 60s # sample every 60 seconds + threshold: 5 # max failed checks before rollback + maxWeight: 50 # max canary traffic weight (%) + stepWeight: 10 # increment per successful check + metrics: + - name: error-rate + templateRef: + name: error-rate + namespace: flagger-system + thresholdRange: + max: 1 # fail if error rate > 1% + interval: 60s + - name: latency-p99 + templateRef: + name: latency + namespace: flagger-system + thresholdRange: + max: 500 # fail if p99 > 500ms + interval: 60s +--- +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: error-rate + namespace: flagger-system +spec: + provider: + type: prometheus + address: http://prometheus.monitoring:9090 + query: | + sum(rate(http_requests_total{ + namespace="{{ namespace }}", + service="{{ target }}", + status=~"5.." + }[{{ interval }}])) + / + sum(rate(http_requests_total{ + namespace="{{ namespace }}", + service="{{ target }}" + }[{{ interval }}])) * 100 +``` + +Cross-ref `slo.md §3` for the full SLI PromQL patterns used as the canary threshold source. + +--- + +## 4. Argo Rollouts + +Source: <https://argoproj.github.io/rollouts> + +Argo Rollouts integrates with Argo CD for GitOps-native progressive delivery. Supports +canary, blue-green, and experiment steps. `AnalysisTemplate` CRD gates promotion with +metric-based pass/fail. 
+
+```yaml
+# Argo Rollout — canary with AnalysisTemplate reference
+apiVersion: argoproj.io/v1alpha1
+kind: Rollout
+metadata:
+  name: checkout
+spec:
+  replicas: 5
+  strategy:
+    canary:
+      steps:
+        - setWeight: 20
+        - pause: { duration: 2m }
+        - analysis:
+            templates:
+              - templateName: error-rate-check
+        - setWeight: 50
+        - pause: { duration: 5m }
+        - analysis:
+            templates:
+              - templateName: error-rate-check
+      canaryService: checkout-canary
+      stableService: checkout-stable
+---
+apiVersion: argoproj.io/v1alpha1
+kind: AnalysisTemplate
+metadata:
+  name: error-rate-check
+spec:
+  metrics:
+    - name: error-rate
+      interval: 60s
+      successCondition: result[0] < 1.0   # < 1% error rate
+      failureLimit: 3
+      provider:
+        prometheus:
+          address: http://prometheus.monitoring:9090
+          query: |
+            sum(rate(http_requests_total{
+              job="checkout-canary",
+              status=~"5.."
+            }[5m]))
+            /
+            sum(rate(http_requests_total{
+              job="checkout-canary"
+            }[5m])) * 100
+```
+
+---
+
+## 5. Feature Flags via OpenFeature (CNCF Graduated, 2024-11)
+
+Source: <https://openfeature.dev> | CNCF graduation: <https://www.cncf.io/projects/openfeature/>
+
+OpenFeature defines a vendor-agnostic SDK specification for feature flag evaluation, plus **OFREP** (the OpenFeature Remote Evaluation Protocol) for evaluating flags against a remote backend. The SDK core is provider-agnostic; teams swap providers without changing application code.
+
+**Supported providers:** LaunchDarkly, Flagsmith, GrowthBook, Unleash, ConfigCat, Harness FF.
+
+**Evaluation context** carries targeting attributes:
+
+| Attribute | Example | Use |
+|-----------|---------|-----|
+| `user.id` | `usr_abc123` | Per-user targeting |
+| `tenant.id` | `org_xyz` | Tenant-gated rollout |
+| `environment` | `production` | Environment guard |
+| `app.version` | `2.4.1` | Version-gated flag |
+
+Integration with Flagger/Rollouts: use feature flags to gate traffic routing at the application layer while Flagger controls infrastructure-level traffic weight.
+
+---
+
+## 6. Observing Feature Flag Evaluations
+
+OTel semantic conventions: `feature_flag.*` (Experimental as of semconv 1.27.0).
+
+Key span attributes emitted on each flag evaluation:
+
+```
+feature_flag.key = "checkout-v2-enabled"
+feature_flag.variant = "on"
+feature_flag.provider_name = "flagsmith"
+```
+
+Example span attributes (JSON log-compatible):
+
+```json
+{
+  "name": "feature_flag.evaluation",
+  "attributes": {
+    "feature_flag.key": "checkout-v2-enabled",
+    "feature_flag.variant": "on",
+    "feature_flag.provider_name": "flagsmith",
+    "user.id": "usr_abc123",
+    "tenant.id": "org_enterprise"
+  }
+}
+```
+
+**Metrics to track per flag:**
+
+| Metric | Labels | Purpose |
+|--------|--------|---------|
+| `feature_flag_evaluation_total` | `flag`, `variant`, `tenant` | Variant distribution |
+| Error rate delta per variant | `flag`, `variant` | A/B regression detection |
+| Latency p99 delta per variant | `flag`, `variant` | Performance regression |
+
+For dashboards, cross-ref `../observability-as-code.md` for the flag evaluation panel template.
+
+---
+
+## 7. GitOps Engines
+
+### 7.1 Argo CD
+
+Source: <https://argo-cd.readthedocs.io>
+
+Declarative Kubernetes sync using app-of-apps pattern.
Reconcile metrics exposed via `/metrics`: + +| Metric | Type | Description | +|--------|------|-------------| +| `argocd_app_info` | Gauge | App metadata; sync status label | +| `argocd_app_sync_total` | Counter | Sync operations by phase | +| `argocd_app_reconcile_bucket` | Histogram | Reconcile latency distribution | + +Drift detection alert (cluster state ≠ git manifest): + +```promql +argocd_app_info{sync_status="OutOfSync"} == 1 +``` + +### 7.2 Flux (CNCF Graduated) + +Source: <https://fluxcd.io> — Flagger ships as part of the Flux ecosystem. + +| Metric | Type | Description | +|--------|------|-------------| +| `gotk_reconcile_duration_seconds` | Histogram | Per-controller reconcile latency | +| `gotk_reconcile_condition` | Gauge | `ready` or `stalled` per resource | + +Stalled resource alert: + +```promql +gotk_reconcile_condition{type="Ready",status="False"} == 1 +``` + +Cross-ref `../signals/metrics.md` for Prometheus scrape config snippets for both engines. + +--- + +## 8. Kubernetes Operator Reconcile Observability + +Every operator built on `controller-runtime` exposes `/metrics` automatically. + +**Built-in metrics:** + +| Metric | Type | Description | +|--------|------|-------------| +| `controller_runtime_reconcile_total` | Counter | Reconcile attempts by result | +| `controller_runtime_reconcile_errors_total` | Counter | Failed reconcile loops | +| `controller_runtime_reconcile_time_seconds` | Histogram | Reconcile loop duration | + +Monitor CRD readiness via `.status.conditions[]`: + +```promql +# Alert on reconcile error spike +rate(controller_runtime_reconcile_errors_total[5m]) > 0.1 +``` + +Custom CRDs must emit reconcile metrics. Pattern: wrap reconcile logic in `ObservedGeneration` +guard and expose a `status.conditions` entry per managed resource. + +--- + +## 9. Release Markers in Telemetry + +- `service.version` (OTel Stable) — set on every span, metric data point, and log record + via the OTel Resource at SDK initialization. Never patch per-signal. +- Emit a **deployment event** at release time: + +```json +{ + "event.name": "deployment", + "service.name": "checkout", + "service.version": "2.4.1", + "deployment.environment": "production", + "timestamp": "2026-04-21T10:00:00Z" +} +``` + +- Pipeline this event to Grafana → vertical annotation line on all timeseries dashboards. +- **Correlation rule:** incidents within ≤30 minutes of a deployment event are flagged as + release suspects. Cross-ref `../incident-forensics.md §Scenario C` for the triage playbook. + +--- + +## 10. Canary Analysis Metric Suite + +| Signal | Source | Gate | +|--------|--------|------| +| SLI compliance | `slo.md` PromQL | Pass/fail per iteration | +| Error rate delta (new vs prev) | `http_requests_total` | Max +0.5% allowed | +| Latency p99 delta | `http_request_duration_seconds_bucket` | Max +50ms allowed | +| Crash-Free Rate delta | Cross-ref `../layers/L7-application/crash-analytics.md` | Mobile/web only | +| Conversion / revenue delta | Product analytics | Product team domain; not in Flagger | + +--- + +## 11. Rollback and Hotfix Pipeline + +| Trigger | Mechanism | Action | +|---------|-----------|--------| +| Metric breach (N consecutive) | Flagger / Argo Rollouts automated | Instant rollback to stable | +| Manual override | `kubectl argo rollouts abort` or Flagger annotation | Operator-initiated | +| Hotfix deploy | Standard pipeline, skip canary analysis steps if authorized | Requires audit entry | + +Manual override always remains possible regardless of automated analysis state. 
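+
+As a sketch of the hotfix row above, and assuming Flagger's `skipAnalysis` field on the Canary spec (verify against your Flagger version), an authorized hotfix can bypass the metric gates without deleting the Canary resource:
+
+```yaml
+# Emergency hotfix: promote the next revision without canary analysis.
+# Revert to skipAnalysis: false once the hotfix lands, and record the
+# override in the audit log.
+apiVersion: flagger.app/v1beta1
+kind: Canary
+metadata:
+  name: checkout
+  namespace: prod
+spec:
+  skipAnalysis: true
+```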
+ +Post-mortem audit trail required for every rollback event. +Cross-ref `../signals/audit.md` for the required audit log fields. + +--- + +## 12. Matrix.md Cells (release row) + +| Layer | Signal | Status | Detail | +|-------|--------|--------|--------| +| L7 × release | metrics | ✅ | Canary SLI delta; error rate and latency per version | +| L7 × release | logs | ✅ | Deployment events with `service.version` | +| L7 × release | traces | ✅ | `service.version` tagging on every span | +| L7 × release | profiles | ⚠️ | Regression comparison v(new) vs v(prev); tooling-dependent | +| L7 × release | cost | ⚠️ | Cost delta per variant requires OpenCost label propagation | +| mesh × release | metrics | ⚠️ | Canary routing rules observable via Envoy stats if mesh present | + +--- + +## 13. Anti-Patterns (candidates for `../anti-patterns.md §Section E Release & Deployment`) + +| Anti-pattern | Problem | Fix | +|-------------|---------|-----| +| No release markers in telemetry | Cannot correlate deploys to incidents | Set `service.version` on OTel Resource; emit deployment event | +| Canary analysis without SLI metric | Promotion is blind; regressions ship silently | Add MetricTemplate/AnalysisTemplate referencing PromQL SLI | +| Feature flag evaluation not observed | Variant effect invisible; A/B regression undetected | Emit `feature_flag.*` span attributes on every evaluation | +| GitOps drift unalerted | Ghost state; cluster diverges from git manifest silently | Alert on `argocd_app_info{sync_status="OutOfSync"}` or `gotk_reconcile_condition{status="False"}` | +| Rollback without post-mortem audit trail | No learning loop; same failure repeats | Record rollback event in audit log; cross-ref `../signals/audit.md` | + +--- + +## References + +- Flagger: <https://flagger.app> +- Argo Rollouts: <https://argoproj.github.io/rollouts> +- OpenFeature: <https://openfeature.dev> +- CNCF OpenFeature incubating announcement: <https://cncf.io/blog/2023/12/> +- Flux (includes Flagger): <https://fluxcd.io> +- Argo CD: <https://argo-cd.readthedocs.io> +- OTel feature_flag semconv: <https://opentelemetry.io/docs/specs/semconv/feature-flags/> +- controller-runtime metrics: <https://book.kubebuilder.io/reference/metrics> diff --git a/.agents/skills/oma-observability/resources/boundaries/slo.md b/.agents/skills/oma-observability/resources/boundaries/slo.md new file mode 100644 index 0000000..6554066 --- /dev/null +++ b/.agents/skills/oma-observability/resources/boundaries/slo.md @@ -0,0 +1,214 @@ +# SLO Boundary + +## 1. Scope + +SLO (Service Level Objective) is the contractual reliability target bounding acceptable error rates +and response times. Covers: SLI definition, SLO math, error budget, burn-rate alerts, tool options. + +Out of scope: dashboard config (`observability-as-code.md`); progressive delivery gates (`release.md`); +pipeline SLOs (`../meta-observability.md §Section F`). + +--- + +## 2. Terminology + +| Term | Definition | Example | +|------|-----------|---------| +| **SLI** | Measurable property of a service | 99.2% of requests returned 2xx in past 30d | +| **SLO** | Internal reliability target | 99.9% availability, 28-day rolling window | +| **SLA** | Customer contract with consequences | SLO − buffer; breach triggers credits | +| **Error budget** | `100% − SLO`; headroom before breach | 0.1% = 43.2 min/month at 99.9% SLO | + +Rule: SLA target < SLO target. Never set SLA = SLO (no operations buffer). + +--- + +## 3. 
SLI Selection + +| Method | Best for | Signals | +|--------|---------|---------| +| Golden Signals (Google SRE) | Any service | Latency, Traffic, Errors, Saturation | +| RED | Request-serving | Rate, Errors, Duration | +| USE | Resources | Utilization, Saturation, Errors | + +Cross-ref `../signals/metrics.md §SLI` for PromQL sketches per method. + +--- + +## 4. SLO Math + +Availability SLI (28-day rolling window): + +```promql +sum(rate(http_requests_total{status=~"2..|3.."}[28d])) + / sum(rate(http_requests_total[28d])) +``` + +Latency SLI (fraction of requests under 300ms): + +```promql +sum(rate(http_request_duration_seconds_bucket{le="0.3"}[28d])) + / sum(rate(http_request_duration_seconds_count[28d])) +``` + +--- + +## 5. OpenSLO Spec + +Source: <https://openslo.com> — community-driven, not CNCF. Vendor-neutral YAML adopted by Sloth, Pyrra, Nobl9. + +```yaml +apiVersion: openslo.com/v1 +kind: SLO +metadata: + name: checkout-availability +spec: + service: checkout + sloType: Request-Based + indicator: + spec: + ratioMetric: + good: + metricSource: + type: Prometheus + spec: + query: sum(rate(http_requests_total{service="checkout",status=~"2..|3.."}[{{.Window}}])) + total: + metricSource: + type: Prometheus + spec: + query: sum(rate(http_requests_total{service="checkout"}[{{.Window}}])) + objectives: + - target: 0.999 + timeWindow: + - duration: 28d + isRolling: true +``` + +--- + +## 6. Tool Options (as of 2026-Q2) + +| Tool | Type | CNCF | Output | +|------|------|------|--------| +| Sloth | OSS CLI + k8s operator | No | PrometheusRule CRDs | +| Pyrra | OSS CLI + k8s operator | No | PrometheusRule CRDs | +| Grafana SLO | Grafana Cloud product | No | Cloud-native alerts | +| Nobl9 | Commercial SaaS | No | Multi-backend | +| Google Cloud Service Monitoring | GCP-native | No | GCP alerts | + +Cross-ref `../vendor-categories.md §OSS Full-Stack`. Sloth and Pyrra are the recommended OSS path +for Kubernetes-native GitOps deployments. + +--- + +## 7. Burn-Rate Alert Design + +### 7.1 Problem + +A simple error-budget-exhausted alert fires after damage is done. Multi-window burn-rate alerts +detect fast-burning incidents while they still have budget remaining. + +Source: <https://sre.google/workbook/alerting-on-slos/> + +### 7.2 Multi-Window Tiers (99.9% SLO) + +| Tier | Budget consumed | Long window | Short window | Multiplier | Action | +|------|----------------|------------|-------------|-----------|--------| +| Fast burn | 2% in 1h | 1h | 5m | 14.4× | Page immediately | +| Slow burn | 5% in 6h | 6h | 30m | 6× | Create ticket | + +**Multiplier derivation:** `budget_fraction / (window / 720h)` +- Fast: `2% / (1h/720h) = 14.4` +- Slow: `5% / (6h/720h) = 6` + +### 7.3 Fast Burn PromQL + +```promql +( + (1 - sum(rate(http_requests_total{status=~"2..|3.."}[1h])) + / sum(rate(http_requests_total[1h]))) + > (1 - 0.999) * 14.4 +) +and +( + (1 - sum(rate(http_requests_total{status=~"2..|3.."}[5m])) + / sum(rate(http_requests_total[5m]))) + > (1 - 0.999) * 14.4 +) +``` + +### 7.4 Slow Burn PromQL + +```promql +( + (1 - sum(rate(http_requests_total{status=~"2..|3.."}[6h])) + / sum(rate(http_requests_total[6h]))) + > (1 - 0.999) * 6 +) +and +( + (1 - sum(rate(http_requests_total{status=~"2..|3.."}[30m])) + / sum(rate(http_requests_total[30m]))) + > (1 - 0.999) * 6 +) +``` + +The short window gates the long window: long window detects sustained burns; short window +suppresses false positives from transient spikes. + +--- + +## 8. 
Error Budget Policy + +| Budget remaining | Action | +|-----------------|--------| +| > 50% | Normal feature velocity | +| 25–50% | Review reliability vs feature ratio | +| < 25% | Reliability sprint | +| 0% (exhausted) | Freeze deploys until budget recovers | + +Cross-ref `release.md` — Flagger and Argo Rollouts use the SLI error rate as canary promotion gate; +the same threshold enforces budget-aware deployment freeze. + +--- + +## 9. Cross-Integration + +| File | Integration | +|------|------------| +| `release.md` | Flagger/Argo use SLI metric for canary promotion; SLO failure = rollback | +| `observability-as-code.md` | OpenSLO YAML in Git; Sloth/Pyrra generate PrometheusRule CRDs in CI | +| `../meta-observability.md §Section F` | Pipeline has its own burn-rate alerts for computation lag | + +--- + +## 10. Matrix Coverage (slo row) + +| Layer | Signal | Status | Detail | +|-------|--------|--------|--------| +| L7-application | metrics | ✅ | SLI calculation from HTTP/gRPC counters and histograms | +| L7-application | logs | ⚠️ | Burn-rate source when metrics unavailable; higher latency | +| L7-application | traces | ⚠️ | Critical path traces complement metrics-based SLI | +| mesh | metrics | ✅ | Golden signals from Envoy; zero-code instrumentation | + +--- + +## 11. Anti-Patterns (candidates for `../anti-patterns.md §Section D`) + +| Anti-pattern | Problem | Fix | +|-------------|---------|-----| +| SLO without burn-rate alert | Alert fires after budget exhausted | Add multi-window PrometheusRule | +| Burn-rate without multi-window | False alarms (short) or slow detection (long) | Gate long window with short window | +| SLO without error budget policy | No action framework for budget consumption | Define policy table (§8) | +| Customer SLA = SLO | No buffer; any breach triggers penalty | Set SLA target below SLO target | + +--- + +## References + +- Google SRE Workbook — Alerting on SLOs: <https://sre.google/workbook/alerting-on-slos/> +- OpenSLO specification: <https://openslo.com> +- Sloth: <https://github.com/slok/sloth> +- Pyrra: <https://github.com/pyrra-dev/pyrra> +- Grafana SLO plugin: <https://grafana.com/grafana/plugins/grafana-slo-app> diff --git a/.agents/skills/oma-observability/resources/checklist.md b/.agents/skills/oma-observability/resources/checklist.md new file mode 100644 index 0000000..c6c8336 --- /dev/null +++ b/.agents/skills/oma-observability/resources/checklist.md @@ -0,0 +1,134 @@ +# oma-observability Checklist + +> Consolidated verification checklist. Run before shipping observability changes to production. +> Each item: action — criterion — priority. +> Source files live under `resources/` from the skill root. 
+ +## Priority legend + +- **P0**: block ship — do not merge or deploy without this +- **P1**: must-before-prod — complete before promoting to production +- **P2**: this-sprint — schedule and close within the current sprint + +--- + +## Section 1 — Setup validation (pre-commit / pre-merge) + +- [ ] Pin OTel spec version and semconv version in `standards.md` header — must match deployed SDK version (P0) +- [ ] Define `service.name`, `service.namespace`, `service.version` on every instrumented service OTel Resource — all three attributes present and non-empty (P0) +- [ ] Define `deployment.environment`, `cloud.provider`, `cloud.region`, `cloud.availability_zone` on every OTel Resource — required for region-dimension pivot in incident forensics (P0) +- [ ] Define `host.id`, `k8s.pod.name`, `k8s.node.name`, `k8s.cluster.name`, `container.id` on every Kubernetes workload Resource — required for server-dimension pivot (P0) +- [ ] Configure W3C Trace Context (`traceparent`) as default propagator — every outbound HTTP/gRPC call must forward the header; confirm by checking `trace_id` consistency across service log streams (P0) +- [ ] Verify W3C Baggage carries only allowed keys — `tenant.id`, feature flag state, deployment SHA, region hint are allowed; user email, session tokens, and credentials are prohibited per `standards.md §W3C Baggage` (P0) +- [ ] Strip or validate inbound baggage at ingress gateway — external callers must not inject arbitrary baggage into internal services (P0) +- [ ] Use only stable semconv groups (`service.*`, `host.*`, `cloud.*`, `k8s.*`, `http.*`, `db.*`, `network.*` core, `error.*`) for production SLO inputs — Development-tier attributes (`tls.*`, `network.connection.*`) must not feed SLOs (P0) +- [ ] Configure `memory_limiter` processor as the first processor in every Collector pipeline — `limit_percentage: 75`, `spike_limit_percentage: 20` (P0) +- [ ] Place `memory_limiter` before `batch` in pipeline order — prevents OOM crash under burst (P0) +- [ ] Validate that OTLP transport choice matches topology — gRPC port 4317 for pod-to-pod; HTTP port 4318 for browser SDKs and proxy-traversal paths (P1) +- [ ] Verify UDP StatsD datagrams fit within path MTU — max 1472 B on standard Ethernet IPv4; use Unix Domain Socket (`unixgram`) for same-host paths (P1) +- [ ] Version all observability artifacts in git — dashboards, alert rules, SLO definitions, and Collector configs must be applied via CI/CD, not edited in the UI (P1) + +--- + +## Section 2 — Pre-production readiness + +- [ ] Confirm matrix coverage gaps are identified and acknowledged — every uncovered cell in `matrix.md` has an explicit N/A rationale or a remediation plan (P1) +- [ ] Set cardinality budget per service — alert at 80% of budget (default 5 000 series/service); configure `count({job="<service>"}) > 4000` alert (P1) +- [ ] Allow-list metric attributes explicitly in the OTel SDK View — use `attribute_keys` to prevent unbounded label sets; never let raw `http.url`, `user.id`, `request.id`, `trace.id`, or `error.message` appear as metric labels (P0) +- [ ] Normalize high-cardinality route labels — apply `transform` processor to replace numeric path segments with `/_` (e.g., `/users/42` → `/users/_`) (P1) +- [ ] Define per-tenant sampling policy when multi-tenant — enterprise tenants may require 100% error retention + dedicated pipeline; free-tier tenants use probabilistic baseline (P1) +- [ ] Configure four-tier tenant isolation strategy — select Tier (soft label / routing / dedicated-collector / 
dedicated-backend) per tenant compliance obligation and document the decision (P1) +- [ ] Propagate `tenant.id` via W3C Baggage on all spans, logs, and metrics in multi-tenant services — use the canonical key `tenant.id`; do not use `customer_id`, `org_id`, or `account_id` as synonyms (P1) +- [ ] Verify `tenant.region` drives data residency routing — EU and KR tenants must have telemetry routed to compliant regional Collector pipelines before reaching a cross-region backend (P0) +- [ ] Configure two-tier Collector topology for Kubernetes — DaemonSet agent (hostmetrics, filelog, kubeletstats, k8sattributes) forwarding to Deployment gateway (batch, tail_sampling, exporters) (P1) +- [ ] Deploy loadbalancing exporter upstream of tail_sampling processor — consistent hash by `trace_id` ensures complete traces arrive at the same gateway replica (P0) +- [ ] Confirm `exception.type`, `exception.message`, `exception.stacktrace`, `code.function`, `code.filepath`, `code.lineno` are populated on every ERROR span — use `span.recordException(e)` for atomicity (P0) +- [ ] Confirm IP address logging is masked or hashed before long-term retention — IP addresses are personal data under GDPR Art. 4(1) and PIPA; apply prefix truncation or HMAC+salt at pipeline ingress (P0) + +--- + +## Section 3 — Production operations + +- [ ] Scrape OTel Collector self-metrics (`otelcol_*`) from a separate Prometheus instance or second Collector — Collector failures must not destroy their own observability (P0) +- [ ] Alert on pipeline delivery ratio dropping below 99% — `(rate(otelcol_exporter_sent_spans[5m]) / rate(otelcol_receiver_accepted_spans[5m])) < 0.99` sustained for 5 minutes (P0) +- [ ] Alert on exporter send failures above 1% — `rate(otelcol_exporter_send_failed_spans[5m]) / rate(otelcol_exporter_sent_spans[5m]) > 0.01` for 5 minutes (P0) +- [ ] Alert on `otelcol_receiver_refused_spans > 0` sustained for 2 minutes — indicates queue full or parse failures (P0) +- [ ] Alert on Collector heap usage exceeding 75% of container memory limit — `otelcol_process_runtime_heap_alloc_bytes > 0.75 * container_limit` (P0) +- [ ] Configure NTP or chrony on all host VMs and Kubernetes nodes — clock drift must stay below 100 ms for reliable trace waterfall ordering (P0) +- [ ] Alert on node clock drift exceeding 100 ms — `node_clock_drift_ms > 100` for 5 minutes triggers waterfall inversion risk (P0) +- [ ] Emit `node_clock_drift_ms` from each host via chrony textfile collector or node exporter — required for the clock drift alert to fire (P1) +- [ ] Configure tail_sampling policy with 100% error + 100% high-latency + 5-10% probabilistic baseline — gateway Collector only; `decision_wait: 30s` to buffer all spans (P1) +- [ ] Apply exporter retry configuration — `initial_interval: 5s`, `max_interval: 30s`, `max_elapsed_time: 300s`, `queue_size: 1000` to handle transient backend unavailability without infinite queue growth (P1) +- [ ] Scrape Fluent Bit self-metrics at `:2020/api/v1/metrics/prometheus` — monitor `fluentbit_output_errors_total` alongside `otelcol_*` for unified pipeline health view (P1) +- [ ] Publish a meta-observability Grafana dashboard — must include pipeline delivery ratio panels, receiver accepted vs refused, exporter sent vs failed, queue depth, heap usage, clock drift heatmap, and top cardinality metrics table (P1) +- [ ] Enforce retention policy per signal — metrics: 15d full-res / 90d 5m / 2y 1h; operational logs: 7d / 30d / 90d; sampled traces: 30d; profiles: 14d (P1) + +--- + +## Section 4 — Incident 
forensics readiness + +- [ ] Verify all MRA resource attributes are present on every signal before production — `service.name`, `service.namespace`, `service.version`, `deployment.environment`, `cloud.*`, `host.id`, `k8s.*`, `container.id` (P0) +- [ ] Confirm `trace_id` appears on every structured log record — log-trace join is impossible without it; set via OTel SDK current span context (P0) +- [ ] Confirm `span_id` appears on every structured log record alongside `trace_id` — required for sub-trace log correlation (P0) +- [ ] Inject `request_id` at the API gateway — propagate as `x-request-id` header; log on every record; expose to end user for support tickets (P1) +- [ ] Verify metric exemplars are enabled and linked to `trace_id` — required for metric-to-trace pivot in Step 2 of the 6-dimension narrowing flow (P1) +- [ ] Confirm the 6-dimension narrowing flow can execute in under 15 minutes — run a tabletop drill: symptom capture → trace_id acquisition → region → server → service → layer → code → cross-signal validation (P1) +- [ ] Validate vendor query patterns are documented for all backends in use — Honeycomb, Datadog, Grafana Tempo, Jaeger, Sentry, Elastic query syntax must be accessible during an incident (P1) +- [ ] Test log-trace join by `trace_id` across at least two services in the production environment — the join must return matching records within the same request's span (P1) +- [ ] Confirm release marker log events include `service.name`, `service.version`, `deployment.environment`, and deployment SHA — without markers, Step 5 release correlation in the forensics playbook is blind (P0) + +--- + +## Section 5 — Compliance & Audit + +- [ ] Store audit logs in WORM immutable storage — S3 Object Lock, GCS Object Hold, or Azure Immutable Blob Storage; minimum retention 7 years (P0) +- [ ] Configure audit log retention alert — `audit_log_retention_days < 2555` (7 years) triggers a critical alert via scheduled compliance check (P0) +- [ ] Include all six mandatory audit event categories — authentication, authorization, data access, administrative, security events, system events — before claiming SOC 2 or ISO 27001 compliance (P0) +- [ ] Carry required attributes on every audit event — `user.id` (pseudonymized), `actor.type`, `action`, `resource.type`, `resource.id`, `event.outcome` (P0) +- [ ] Implement PII redaction at pipeline ingestion — apply Collector `transform` processor to scrub or hash PII fields before they reach any storage backend (P0) +- [ ] Enforce GDPR storage limitation — raw personal data must not exceed the retention period justified by the processing purpose; default operational logs ≤ 7 days raw (P0) +- [ ] Separate audit and privacy pipelines — same-pipeline storage would create contradictory retention requirements (WORM vs. erasable); keep distinct pipelines per `signals/audit.md` Design Decision D5 (P0) +- [ ] Sign a Data Processing Agreement (DPA) with every observability vendor that receives personal data — required before routing production telemetry to SaaS backends (P0) +- [ ] Enforce cross-region data residency for EU and KR tenants — EU tenant telemetry must not transit or rest outside EEA without adequacy decision; KR tenants subject to PIPA § 29 (P0) +- [ ] Store pseudonymization keys separately from pseudonymized telemetry data — GDPR Art. 
32 requires independent access control and audit trail for the key store (P1) +- [ ] Verify `user.id` and `user.email` are never used as metric labels — cardinality explosion and PII in TSDB are a double violation; use `filter` processor to drop any datapoint carrying these attributes (P0) + +--- + +## Section 6 — SLO & Release gate + +- [ ] Define SLO with error budget policy for every user-facing service — SLO target, SLA target (SLA < SLO), 28-day rolling window, and burn-rate alert thresholds (P1) +- [ ] Implement multi-window burn-rate alerts — fast burn: 2% budget in 1 hour; slow burn: 5% budget in 6 hours; both required to avoid alert fatigue and missed slow leaks (P1) +- [ ] Store SLO definitions in OpenSLO YAML in git — treated as code; PR review required for any SLO target change (P1) +- [ ] Select SLI source from stable signals — mesh RED metrics or L7 application metrics preferred; log-based SLI is acceptable only as a fallback for un-instrumented services (P1) +- [ ] Configure canary analysis (Flagger or Argo Rollouts) to use SLI metrics as promotion gates — success rate threshold and latency p99 threshold both required (P1) +- [ ] Add memory saturation metric as a custom Flagger analysis gate — OOMKilled pods may not fail the success rate threshold before crashing; `container_memory_working_set_bytes` gate prevents silent OOM canary progression (P1) +- [ ] Emit `service.version` on all metrics, logs, and spans at deploy time — required for before/after comparison in canary analysis and in Step 3c of incident forensics (P0) +- [ ] Emit a structured release marker event at every deployment — must carry `service.name`, `service.version`, `deployment.environment`, deployment strategy, and deployment SHA (P0) +- [ ] Record SLO burn-rate threshold crossings as immutable audit events — provides compliance evidence for availability breach; cross-ref `signals/audit.md` (P1) +- [ ] Define rollback criteria and document the rollback procedure — include metric recovery verification steps in the runbook (P1) + +--- + +## Section 7 — Recovery + +- [ ] Document Collector queue backpressure remediation runbook — steps: identify via `otelcol_processor_queued_retry_send_queue_length`, scale gateway Collector replicas, check vendor rate limits (P1) +- [ ] Document clock drift remediation procedure — `chronyc makestep` for immediate re-sync; escalation to PTP for sub-millisecond requirements; check NTP source reachability if drift persists (P1) +- [ ] Document cardinality bomb mitigation procedure — identify offending metric with `topk(10, count({__name__!=""}) by (__name__))`, add `filter` processor to drop offending label, or `transform` processor to remove the label value (P1) +- [ ] Document Collector OOM recovery path — verify `memory_limiter` is configured; if OOM still reached, increase container memory limit or reduce `limit_percentage`; never disable `memory_limiter` (P1) +- [ ] Document vendor fallback plan for observability backend outage — identify secondary backend or local Collector buffering strategy; `sending_queue` with `queue_size` provides short-term buffering (P2) +- [ ] Document PII incident response procedure — GDPR Art. 
33 breach notification to supervisory authority within 72 hours; identify affected telemetry stores, scope the affected data subjects, and engage DPO (P0) +- [ ] Document audit log retention violation response — verify S3/GCS Object Lock policy; confirm `retention_period` in log backend matches 7y; restore from backup if records were prematurely deleted (P0) +- [ ] Test Flagger auto-rollback by injecting a synthetic error rate above the failure threshold in a staging environment — verify rollback completes before the `progressDeadlineSeconds` expires (P2) +- [ ] Validate that `memory_limiter` refused spans alert fires before Collector OOM — run a controlled load test; confirm `otelcol_processor_memory_limiter_refused_spans > 0` alert fires and data loss stops short of crash (P2) + +--- + +## Contribution Protocol + +- Append items under the section that best fits the intent (setup / pre-prod / ops / forensics / compliance / SLO / recovery) +- Format: `- [ ] {action verb} {specific target} — {acceptance criterion} (priority: P0/P1/P2)` +- Include a `cross-ref` comment pointing to the source file using a path relative to the `resources/` root when the item derives from a specific section +- Mark priority: P0 (block ship) | P1 (must before prod) | P2 (this sprint) +- Deduplicate before adding — search existing items by keyword before inserting +- CTO review required for any change to P0 items in Sections 1, 5, or 6 diff --git a/.agents/skills/oma-observability/resources/examples.md b/.agents/skills/oma-observability/resources/examples.md new file mode 100644 index 0000000..3e3ab25 --- /dev/null +++ b/.agents/skills/oma-observability/resources/examples.md @@ -0,0 +1,449 @@ +# oma-observability Examples + +> End-to-end walkthroughs demonstrating the skill's value. +> Each scenario ties multiple files together into a single executable story. + +--- + +## Scenario 1 — Greenfield: OSS Full-Stack on Kubernetes + +**Situation:** Series A startup, new k8s cluster. Team wants OTel-native metrics + logs + traces + Grafana +dashboards at zero licensing cost. + +**Intent:** `setup` + +**Files referenced:** `vendor-categories.md §(a)`, `transport/collector-topology.md §2`, `observability-as-code.md §5`, `checklist.md` + +### Walkthrough + +1. Invoke `/oma-observability "set up OTel stack on k8s"`. +2. Intent classifier routes to `setup`. +3. `vendor-categories.md §(a) OSS Full-Stack` selects **Grafana LGTM+** (Mimir, Loki, Tempo, Grafana) + for a team already familiar with Grafana dashboards. +4. `transport/collector-topology.md §2 Two-Tier Hybrid` determines the deployment pattern: + DaemonSet agent per node + Deployment gateway. 
+
+**Agent CRD — DaemonSet (node-level collection):**
+
+```yaml
+apiVersion: opentelemetry.io/v1beta1
+kind: OpenTelemetryCollector
+metadata:
+  name: otel-agent
+  namespace: observability
+spec:
+  mode: daemonset
+  config:
+    receivers:
+      hostmetrics:
+        collection_interval: 30s
+        scrapers: { cpu: {}, memory: {}, filesystem: {} }
+      filelog:
+        include: [/var/log/pods/*/*/*.log]
+        include_file_path: true
+      kubeletstats:
+        collection_interval: 20s
+        auth_type: serviceAccount
+    processors:
+      memory_limiter:  # mandatory: first processor in every pipeline
+        limit_mib: 400
+        check_interval: 1s
+      k8sattributes:
+        extract:
+          metadata: [k8s.pod.name, k8s.namespace.name, k8s.node.name]
+    exporters:
+      otlp:
+        endpoint: otel-gateway.observability.svc:4317
+    service:
+      pipelines:
+        metrics:
+          receivers: [hostmetrics, kubeletstats]
+          processors: [memory_limiter, k8sattributes]
+          exporters: [otlp]
+        logs:
+          receivers: [filelog]
+          processors: [memory_limiter, k8sattributes]
+          exporters: [otlp]
+```
+
+**Gateway CRD — Deployment (batching, `mode: deployment`, 2 replicas):**
+
+```yaml
+# Key fields only — full config in observability-as-code.md §5
+spec:
+  mode: deployment
+  replicas: 2
+  config:
+    receivers:
+      otlp: { protocols: { grpc: { endpoint: 0.0.0.0:4317 } } }
+    processors:
+      memory_limiter: { check_interval: 1s, limit_percentage: 75 }
+      batch: {}
+    exporters:
+      otlphttp/tempo: { endpoint: http://tempo.observability.svc:4318 }
+      prometheusremotewrite: { endpoint: http://mimir.observability.svc:9009/api/v1/push }
+      loki: { endpoint: http://loki.observability.svc:3100/loki/api/v1/push }
+    service:
+      pipelines:
+        traces: { receivers: [otlp], processors: [memory_limiter, batch], exporters: [otlphttp/tempo] }
+        metrics: { receivers: [otlp], processors: [memory_limiter, batch], exporters: [prometheusremotewrite] }
+        logs: { receivers: [otlp], processors: [memory_limiter, batch], exporters: [loki] }
+```
+
+5. Collector config is committed to git and applied via Argo CD app-of-apps per `observability-as-code.md §7`.
+6. Validation checklist (from `checklist.md §1, §3`):
+   - [ ] `memory_limiter` is the first processor in every pipeline
+   - [ ] NTP drift < 100 ms on all nodes (`chronyc tracking`)
+   - [ ] Cardinality budget set before first data ingest
+   - [ ] Gateway has `≥ 2` replicas with PodDisruptionBudget
+
+**Outcome:** 9 pods, 2 services, first trace visible in Grafana Tempo within 15 minutes of `kubectl apply`.
+
+---
+
+## Scenario 2 — Incident Forensics: Payment Service 5xx Spike
+
+**Situation:** "ap-northeast-2 결제 서비스 (payment service) 5xx spike at 14:20 UTC" — alert fires
+with no `trace_id` provided. On-call SRE must localize root cause in under 15 minutes.
+
+**Intent:** `investigate`
+
+**Files referenced:** `incident-forensics.md §3–§5 Scenario A`, `boundaries/slo.md §7`, `signals/traces.md`, `signals/logs.md §7`, `boundaries/release.md`
+
+### Walkthrough
+
+1. **Alert source:** multi-window burn-rate alert from `boundaries/slo.md §7` fires — 2 % budget
+   consumed in 1 h at 14.4× rate (rule sketched below). `severity: critical`.
+2. Invoke `/oma-observability --investigate "5xx spike in ap-northeast-2"`.
+3. Router routes to `incident-forensics.md §3 Six-Dimension Narrowing Flow`.
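+
+For context, the page that fired is the fast-burn rule from `boundaries/slo.md §7.3`,
+instantiated for this service (a sketch — the `service` label value is assumed from this
+scenario):
+
+```promql
+# 14.4× burn over 1h, gated by the 5m short window to suppress transient spikes
+(
+  (1 - sum(rate(http_requests_total{service="payments-checkout",status=~"2..|3.."}[1h]))
+     / sum(rate(http_requests_total{service="payments-checkout"}[1h])))
+  > (1 - 0.999) * 14.4
+)
+and
+(
+  (1 - sum(rate(http_requests_total{service="payments-checkout",status=~"2..|3.."}[5m]))
+     / sum(rate(http_requests_total{service="payments-checkout"}[5m])))
+  > (1 - 0.999) * 14.4
+)
+```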
+ +**Step 1 — Acquire `trace_id` via metric exemplar (`incident-forensics.md §2`):** + +```promql +histogram_quantile(0.99, + sum by (le, service_name) ( + rate(http_server_request_duration_seconds_bucket{ + deployment_environment="prod", + cloud_region="ap-northeast-2" + }[5m]) + ) +) +# Click a spike data point → Grafana Mimir exemplar panel shows trace_id link +``` + +Exemplar retrieved: `trace_id = 9a3f1c8e2b7d4e50a1c3f9d2b8e7a4c0`. + +**Step 2 — Six-dimension narrowing:** + +| Dimension | Finding | +|-----------|---------| +| Region | `cloud.region = ap-northeast-2` only. `ap-southeast-1` healthy. Single-region blast radius. | +| Server | Errors on 3 of 6 pods, all on `k8s.node.name = ip-10-0-2-11.ec2.internal`. | +| Service | Earliest ERROR span: `service.name = payments-checkout`, `service.version = v2.4.1`. | +| Layer | `span.kind = CLIENT`, `db.system = redis` — L7 DB call, not a network-layer issue. | +| Code | `exception.type = TimeoutException`, `exception.message = "Redis pool exhausted after 5000ms"`, `code.function = processPayment`, `code.lineno = 287`. | + +**Step 3 — Cross-signal validation (`incident-forensics.md §4`):** + +``` +# Grafana Tempo — TraceQL lookup +{ trace:id = "9a3f1c8e2b7d4e50a1c3f9d2b8e7a4c0" } + +# Filter logs by trace_id to find correlated warnings +# Loki LogQL +{service_name="payments-checkout"} | json | trace_id="9a3f1c8e2b7d4e50a1c3f9d2b8e7a4c0" +``` + +Logs reveal: `WARN: connection pool at capacity (256/256)` at 14:21. Redis +`redis_pool_active_connections / redis_pool_max_connections` metric at 100 %. + +**Step 4 — Release correlation:** + +`service.version = v2.4.1` deployed at 13:57 UTC — 25 minutes before incident. +Changelog: Redis connection pool reduced from 512 → 256 in this version. + +**Step 5 — Remediation:** + +Flagger canary analysis `successRate` already < 99 % → auto-rollback to `v2.4.0` triggers +per `boundaries/release.md §Flagger Canary CR`. Error rate returns to baseline in 3 minutes. +Deployment event chain preserved in `signals/audit.md` for post-mortem. + +--- + +## Scenario 3 — Multi-Tenant Cost Attribution + +**Situation:** B2B SaaS now has 50+ tenants. CFO asks: "Which tenants cost most to serve, and +how does our observability bill break down per customer?" + +**Intent:** `route` + `tune` + +**Files referenced:** `boundaries/multi-tenant.md §7`, `signals/cost.md §3–§4`, `vendor-categories.md §(f)`, `meta-observability.md §Cardinality Guardrails` + +### Walkthrough + +1. Tag every pod at deploy time with `tenant.id` — applied via Helm values or admission webhook + per `boundaries/multi-tenant.md §7`: + +```yaml +metadata: + labels: + tenant.id: "acme-corp" + tenant.tier: "enterprise" +``` + +2. Deploy OpenCost (CNCF Incubating) with Prometheus scrape per `signals/cost.md §3`: + +```yaml +# OTel Collector gateway — OpenCost scrape +receivers: + prometheus: + config: + scrape_configs: + - job_name: opencost + scrape_interval: 60s + static_configs: + - targets: ["opencost.opencost.svc.cluster.local:9003"] +``` + +3. Per-tenant cost PromQL (daily, `signals/cost.md §5`): + +```promql +sum by (tenant_id) ( + increase(opencost_workload_cost_total[24h]) + * on(namespace) group_left(tenant_id) + kube_namespace_labels +) +``` + +4. 
Cardinality guard — keep top-1 000 tenants labeled, bucket overflow as `"other"` to prevent
+   TSDB series explosion (`meta-observability.md §Cardinality Guardrails`, `boundaries/multi-tenant.md §13`):
+
+```yaml
+# OTel Collector transform processor — cardinality cap
+# (allow-list membership expressed as a regex match via OTTL IsMatch)
+processors:
+  transform/cardinality:
+    metric_statements:
+      - context: datapoint
+        statements:
+          - set(attributes["tenant_id"], "other")
+            where not(IsMatch(attributes["tenant_id"], "${env:TOP_TENANT_REGEX}"))
+```
+
+5. Per-tier sampling policy at the gateway to control observability bill per tenant
+   (`boundaries/multi-tenant.md §5`): `enterprise` = 100 %, `pro` = 20 %, `free` = 2 %.
+   Full `tail_sampling` YAML (with `and` sub-policies) lives in `transport/sampling-recipes.md §4`.
+
+6. Include the observability bill itself: instrument the collector pipeline with throughput counters
+   per `tenant.id` to apportion Grafana Cloud / Datadog usage proportionally
+   (`signals/cost.md §4`, `vendor-categories.md §(f)`).
+
+**Outcome:** Identified 3 free-tier tenants consuming 40 % of egress cost. Applied egress rate
+limit at collector ingress (`boundaries/multi-tenant.md §11 Noisy Neighbor Protection`).
+Those tenants were moved to a stricter free-tier cap, reducing monthly egress spend by 35 %.
+
+---
+
+## Scenario 4 — Migrating off Fluentd to Fluent Bit
+
+**Situation:** Legacy Fluentd DaemonSet is consuming 400 MB RAM per node vs. Fluent Bit's typical
+80 MB. The CNCF 2025-10 migration guide is now the normative reference. Team needs zero-downtime
+migration with log parity verification before decommission.
+
+**Intent:** `migrate`
+
+**Files referenced:** `vendor-categories.md §(h)`, `signals/logs.md §6`, `transport/collector-topology.md §2`, `observability-as-code.md §3`
+
+### Walkthrough
+
+1. Invoke `/oma-observability --migrate "Fluentd to Fluent Bit"`.
+2. Router routes to `vendor-categories.md §(h) Log Pipeline` → **Fluent Bit** (CNCF Graduated,
+   Fluentd-family successor with a leaner configuration model, C runtime, 5–15 MB RAM typical).
+
+**Migration phases (zero-downtime):**
+
+**Phase 1 — Dual-send:** Deploy Fluent Bit DaemonSet alongside the existing Fluentd DaemonSet.
+Fluent Bit ships logs to the same backend; Fluentd continues as primary. Confirm both streams
+reach the backend without duplicates by comparing log record counts.
+
+```ini
+# Fluent Bit DaemonSet — core pipeline (fluent-bit.conf)
+[INPUT]
+    Name              tail
+    Path              /var/log/pods/*/*/*.log
+    Parser            cri
+    Tag               kube.*
+    Mem_Buf_Limit     5MB
+
+[FILTER]
+    Name              kubernetes
+    Match             kube.*
+    Merge_Log         On
+    Labels            On
+
+[OUTPUT]
+    Name              opentelemetry
+    Match             kube.*
+    Host              otel-gateway.observability.svc
+    Port              4318
+    Logs_uri          /v1/logs
+```
+
+**Phase 2 — Parity verification:** query log record count per `service.name` in both pipelines
+over a 1-hour window. Expect < 0.1 % divergence accounting for timing windows.
+
+```promql
+# Fluent Bit throughput
+sum by (service_name) (rate(fluentbit_output_proc_records_total[1h]))
+
+# Fluentd throughput (comparison)
+sum by (service_name) (rate(fluentd_output_emit_records_total[1h]))
+```
+
+**Phase 3 — Decommission Fluentd:** once parity is confirmed over 24 h, drain the Fluentd
+DaemonSet and remove its resources. DaemonSets have no replica count, so `kubectl scale`
+does not apply — park the DaemonSet behind a non-matching nodeSelector, then delete it:
+
+```bash
+kubectl -n logging patch daemonset fluentd \
+  -p '{"spec":{"template":{"spec":{"nodeSelector":{"decommissioned":"true"}}}}}'
+kubectl delete daemonset fluentd -n logging
+```
+
+3. Commit Fluent Bit ConfigMap and DaemonSet manifests to git; apply via Argo CD per
+   `observability-as-code.md §7`.
+4. 
Validate with `otelcol validate` on the gateway config; run `promtool check rules` on any + log-based alerting rules. + +**Outcome:** Node-level log collection RAM drops from 400 MB (Fluentd/Ruby) to ~80 MB (Fluent Bit/C) +— a 5× reduction typical per the CNCF 2025-10 guide. CPU overhead at steady state also reduces by +~60 %. No log gaps observed during the dual-send window. + +--- + +## Scenario 5 — SLO with Burn-Rate Alerts (GitOps End-to-End) + +**Situation:** Platform team adopts the SLO practice for the checkout service. Target: 99.9 % +availability on a 28-day rolling window. Everything must be versioned in git and applied +via Argo CD — no manual UI edits. + +**Intent:** `alert` + +**Files referenced:** `boundaries/slo.md §5, §7, §8`, `observability-as-code.md §4, §6, §7`, `boundaries/release.md` + +### Walkthrough + +1. Define the SLO as OpenSLO YAML and commit to the observability repo (`boundaries/slo.md §5`): + +```yaml +# Key fields — full spec in boundaries/slo.md §5 +apiVersion: openslo.com/v1 +kind: SLO +metadata: + name: checkout-availability +spec: + service: checkout + sloType: Request-Based + objectives: + - target: 0.999 + window: 28d + indicator: + spec: + ratioMetric: + good: + metricSource: + type: Prometheus + spec: + query: > + sum(rate(http_requests_total{service="checkout",status=~"2..|3.."}[{{.Window}}])) + total: + metricSource: + type: Prometheus + spec: + query: sum(rate(http_requests_total{service="checkout"}[{{.Window}}])) +``` + +2. Generate PrometheusRule CRD via Sloth (`observability-as-code.md §4.2`): + +```bash +sloth generate -i sloth/checkout-slo.yaml | kubectl apply -f - +``` + +3. The generated `PrometheusRule` includes multi-window burn-rate alerts from `boundaries/slo.md §7` + and `observability-as-code.md §6`: + +```yaml +# Fast burn — 2 % budget in 1 h (14.4× rate), gated by 5 m short window → page +- alert: CheckoutSLOBurnRateFast + expr: | + (sum(rate(http_requests_total{service="checkout",status=~"5.."}[1h])) + / sum(rate(http_requests_total{service="checkout"}[1h]))) / (1 - 0.999) > 14.4 + and + (sum(rate(http_requests_total{service="checkout",status=~"5.."}[5m])) + / sum(rate(http_requests_total{service="checkout"}[5m]))) / (1 - 0.999) > 14.4 + for: 2m + labels: { severity: critical, slo: checkout-availability } + annotations: + summary: "Checkout SLO fast burn: budget exhausting in < 1h" + +# Slow burn — 5 % budget in 6 h (6× rate), gated by 30 m → ticket +# Full PromQL in observability-as-code.md §6 and boundaries/slo.md §7.4 +``` + +4. GitOps commit flow (`observability-as-code.md §7`): + +``` +git PR (checkout-slo.yaml + PrometheusRule) + → CI: promtool check rules alerts/checkout-slo-burn-rate.yaml + → CI: promtool test rules alerts/checkout_slo_test.yaml + → peer review (SRE CODEOWNERS approval) + → merge to main + → Argo CD sync → Prometheus Operator applies PrometheusRule to cluster +``` + +5. Validate via synthetic error injection: + +```bash +# Inject 20 % error rate for 3 minutes to trigger fast-burn alert +kubectl run error-injector --image=curlimages/curl --restart=Never -- \ + sh -c 'for i in $(seq 1 200); do curl -s -o /dev/null -w "%{http_code}" \ + http://checkout-api/error-inject; done' +``` + +Fast-burn alert fires in < 2 minutes. + +6. 
Error budget policy (`boundaries/slo.md §8`): + +| Budget remaining | Action | +|-----------------|--------| +| > 50 % | Normal feature velocity | +| 25–50 % | Review reliability vs feature ratio | +| < 25 % | Reliability sprint | +| 0 % exhausted | Deploy freeze until budget recovers | + +When budget reaches 0 %, Flagger canary promotion gates use the same SLI error rate to block +new deploys (`boundaries/release.md`). + +--- + +## Appendix: Vendor Query Cross-Reference + +From `incident-forensics.md §4` — trace lookup by `trace_id` across backends: + +| Vendor | Query syntax | +|--------|-------------| +| Grafana Tempo | `{ trace:id = "4bf92f3577b34da6a3ce929d0e0e4736" }` | +| Honeycomb | `trace:4bf92f3577b34da6a3ce929d0e0e4736` | +| Datadog APM | `@trace_id:4bf92f3577b34da6a3ce929d0e0e4736` | +| Jaeger HTTP API | `GET /api/traces/4bf92f3577b34da6a3ce929d0e0e4736` | +| Elastic APM | `trace.id: "4bf92f3577b34da6a3ce929d0e0e4736"` | +| Sentry | `trace_id:4bf92f3577b34da6a3ce929d0e0e4736` | + +--- + +## Contribution Protocol + +- Add new scenarios only if they demonstrate cross-file value spanning 3+ skill files. +- Each scenario must cite at least 3 other skill files by relative path from `resources/`. +- Scenarios are for human learning — keep them narrative, not exhaustive. +- Code and YAML snippets are illustrative excerpts; for authoritative full config see the + referenced files. +- Do not add future skill references — if a domain is out of scope, cite the external tool + from `SKILL.md §When NOT to use`. diff --git a/.agents/skills/oma-observability/resources/execution-protocol.md b/.agents/skills/oma-observability/resources/execution-protocol.md new file mode 100644 index 0000000..e87bbe4 --- /dev/null +++ b/.agents/skills/oma-observability/resources/execution-protocol.md @@ -0,0 +1,166 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +--- + +# Observability Agent - Execution Protocol + +## Step 0: Parse Query + +1. **Extract flags** from the query string: + - `--setup`, `--migrate`, `--investigate`, `--alert`, `--trace`, `--tune`, `--route`: force specific intent + - `--strict`: require Stable semconv only — reject Development/Experimental attributes + - `--multi-tenant`, `--multi-cloud`: force boundary dimension to `multi-tenant` + - `--layer=L3|L4|mesh|L7`: force specific layer focus + - `--signal=metrics|logs|traces|profiles|cost|audit|privacy`: force signal focus +2. **Detect keywords** (Korean and English) against `resources/intent-rules.md` trigger list +3. **Extract ambient context** from user message or project files: + - Target platform: k8s, serverless (Lambda/Cloud Run), VM, bare metal + - Scale tier: single-service, multi-service, multi-region + - Compliance requirements: SOC2, ISO 27001, GDPR, PIPA +4. **Log** resolved flags, detected language, and ambient context for transparency + +## Step 1: Classify Intent + +1. If a flag is present → use that intent directly; skip keyword matching +2. If no flag → apply keyword pattern matching from `resources/intent-rules.md` +3. If ambiguous or no match → default to `investigate + tune` in parallel +4. **Sparse-context gate**: even when an intent is matched (score ≥ 1), before proceeding to Step 2 check that ambient context is sufficient for the chosen intent. 
Minimum required context per intent: + - `investigate`: service name OR symptom category (error code, metric name) OR time window + - `setup` / `migrate`: target platform (k8s/serverless/VM) OR language/framework + - `alert`: SLI target or service name + - `trace`: at least one boundary hint (service hop, mesh, cloud) + - `tune`: signal type (metrics/logs/traces) OR problem (cost/cardinality/MTU) + - `route`: tenant OR region OR cloud axis + + If the minimum is not present, request clarification from the user before consuming reference material. A 1-2 keyword query that hits one intent keyword but carries no context (e.g., "metrics broken", "trace 안 돼") should prompt for service/symptom/scope rather than executing a playbook that cannot succeed. +5. Log selected intent and whether selection was `flag` or `auto`, plus any clarification requested + +Intent vocabulary: + +| Intent | Meaning | +|--------|---------| +| `setup` | First-time instrumentation or pipeline deployment | +| `migrate` | Moving from a deprecated or legacy tool | +| `investigate` | Incident analysis or anomaly root-cause | +| `alert` | SLO burn-rate or threshold alert authoring | +| `trace` | Distributed trace propagation and context design | +| `tune` | Transport, cardinality, sampling, or collector optimization | +| `route` | Signal routing, tenant isolation, or topology design | + +## Step 2: Matrix Navigation + +1. Based on (intent × layer × boundary × signal), identify relevant cells in `resources/matrix.md` +2. Collect file references for each cell marked as covered (checkmark) or conditional (warning marker) +3. Flag any N/A cells the user is asking about — redirect to an alternative dimension rather than producing a stub answer +4. Record the active (layer, boundary, signal) triple for use in Step 6 output header + +## Step 3: Route Dispatch + +Dispatch based on intent. Use the table below as the primary routing map, then apply intent-specific detail. 
+ +| Intent | Primary resource | Fallback | +|--------|-----------------|---------| +| `setup` | `resources/vendor-categories.md` → vendor-owned skill | `resources/standards.md` (OTel semconv) | +| `migrate` | CNCF 2025-10 guide + `resources/vendor-categories.md §(h) Log Pipeline` | OTel Collector bridge config | +| `investigate` | `resources/incident-forensics.md` (MRA + 6-dim localization) | `resources/signals/traces.md` + `resources/signals/logs.md` | +| `alert` | `resources/boundaries/slo.md` (burn-rate rules) | `resources/observability-as-code.md` | +| `trace` | `resources/boundaries/cross-application.md` (propagator matrix) | `resources/layers/mesh.md` (zero-code auto-instr) | +| `tune` | `resources/transport/` (4 files — see below) | `resources/meta-observability.md` (cardinality guardrails) | +| `route` | `resources/boundaries/multi-tenant.md` + `resources/transport/collector-topology.md` | `resources/boundaries/cross-application.md` | + +### setup intent +- Consult `resources/vendor-categories.md` — select category based on constraints (OSS vs commercial, high-cardinality, FinOps, profiling) +- Delegate to vendor-owned skill when one is installed (e.g., getsentry/sentry-sdk-setup, honeycombio/agent-skill, Dash0 otel-instrumentation, Datadog Labs dd-apm) +- If no matching vendor skill is installed → guide user to `/oma-search --docs` for vendor documentation + +### migrate intent +- Fluentd as source → apply CNCF 2025-10 deprecation guide; recommend Fluent Bit or OTel Collector +- Legacy APM as source → provide OTel bridge config patterns; reference `resources/vendor-categories.md §(h)` + +### investigate intent +- Invoke `resources/incident-forensics.md` full playbook (MRA + 6-dimension narrowing: code / service / layer / host / region / infra) +- Cross-reference signal files based on symptom category (latency, error rate, saturation, data loss) + +### alert intent +- Use `resources/boundaries/slo.md` for burn-rate calculation and multi-window alert rules +- Use `resources/observability-as-code.md` for PrometheusRule CRD and Alertmanager routing tree + +### trace intent +- Use `resources/boundaries/cross-application.md` for W3C Trace Context propagator matrix across cloud providers +- Use `resources/layers/mesh.md` for zero-code auto-instrumentation via service mesh + +### tune intent +- `resources/transport/udp-statsd-mtu.md` — UDP payload size thresholds and fragmentation risk +- `resources/transport/otlp-grpc-vs-http.md` — protocol selection by environment and firewall constraints +- `resources/transport/collector-topology.md` — DaemonSet vs sidecar vs gateway deployment patterns +- `resources/transport/sampling-recipes.md` — head-based vs tail-based sampling policy selection + +### route intent +- `resources/boundaries/multi-tenant.md` — tenant isolation strategies (attribute-based, pipeline-based, storage-based) +- `resources/transport/collector-topology.md` — routing topology for signal fan-out and load balancing + +## Step 4: Collect Reference Material + +1. Pull referenced file sections into working context based on Step 3 routing results +2. Check `resources/vendor-categories.md` timestamp — if older than one quarter, advise the user to verify against the CNCF landscape at https://landscape.cncf.io +3. For commercial vendor references, check whether a vendor-owned skill is installed locally before suggesting manual setup + +## Step 5: Validate Against Constraints + +1. Consult `resources/anti-patterns.md` — does the proposed approach violate any of the 18 items? +2. 
Consult `resources/checklist.md` — will this pass Pre-prod and Prod gates? +3. Run `resources/meta-observability.md` cardinality guardrail preview — flag any label dimension that risks unbounded growth +4. If `--strict` flag is set → reject any semconv attribute in Development or Experimental stability tier; cite stable alternative +5. If PII is involved → apply `resources/signals/privacy.md` redaction and sampling-aware baggage rules at collection, not only at storage +6. If `--multi-tenant` or `--multi-cloud` → apply `resources/boundaries/multi-tenant.md` isolation rules; verify data residency is explicit + +## Step 6: Present + +Format output as: + +``` +Intent: {intent} Mode: {auto|flag} +Layer: {L3|L4|mesh|L7} Boundary: {multi-tenant|cross-application|slo|release} Signal: {metrics|logs|traces|profiles|cost|audit|privacy} + +Primary recommendation: + - {file:section} — {1-line rationale} + +Secondary considerations: + - {file:section} — {caveat or anti-pattern hit} + +Delegation target: + - {vendor-owned skill name, or /oma-search command} + +Checklist items to verify: + - [ ] {item from checklist.md} + - [ ] {item} +``` + +Example output for "setup OTel stack on k8s": + +``` +Intent: setup Mode: auto +Layer: L7 + mesh Boundary: cross-application Signal: metrics + logs + traces + +Primary recommendation: + - resources/transport/collector-topology.md §Two-tier hybrid — DaemonSet + Deployment gateway + - resources/vendor-categories.md §OSS Full-Stack — Grafana LGTM+ (2026-Q2) + +Secondary considerations: + - resources/transport/sampling-recipes.md §Tail-based — consistent routing via loadbalancing exporter + - resources/anti-patterns.md §Section C — avoid sidecar as default on k8s; use DaemonSet + +Delegation target: + - No single vendor skill (OSS self-host); use oma-tf-infra for Terraform + /oma-search for component docs + +Checklist items to verify: + - [ ] memory_limiter processor placed before batch processor in pipeline + - [ ] NTP synced on all nodes (< 100 ms drift) + - [ ] cardinality budget set per service before enabling high-cardinality labels + - [ ] Fluentd replaced or bridged — CNCF 2025-10 deprecation in effect +``` + +## On Error + +See `resources/checklist.md §7 Recovery` for recovery steps. diff --git a/.agents/skills/oma-observability/resources/incident-forensics.md b/.agents/skills/oma-observability/resources/incident-forensics.md new file mode 100644 index 0000000..e6e1c10 --- /dev/null +++ b/.agents/skills/oma-observability/resources/incident-forensics.md @@ -0,0 +1,344 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +--- + +# Incident Forensics Playbook + +## 1. Purpose + +**Goal**: given a production incident, locate root cause across 6 dimensions — code, service, layer, host, region, infra — in under 15 minutes, using the MELT+P+cost+audit+privacy signals defined across this skill tree. + +The 6-dimension narrowing flow ties every signal file together through one executable playbook. No signal exists in isolation; correlation across signals is what separates diagnosis from guesswork. + +**In scope**: any incident surfaced by an alert, a user report, or an anomaly detected in metrics, traces, or logs. +**Out of scope**: incident response workflow (on-call rotation, escalation, postmortem tooling) — use PagerDuty, OpsGenie, or Grafana OnCall for those. + +Cross-skill entry point: `oma-debug` invokes this playbook on failure, pulling traces and logs by `request_id`. See the design document Integration table. + +--- + +## 2. 
Minimum Required Attributes (MRA) + +Every span, log record, and metric data point MUST carry the attributes below before reaching production. Missing attributes break the narrowing flow at the step that depends on them. + +Semconv stability tiers follow `resources/standards.md §3`. Attributes marked Stable are safe for production SLOs and alerting. Attributes marked Development must not be used as SLO inputs. + +### 2.1 Resource attributes — every signal + +These attributes identify the workload that emitted the signal. They must be set on the OTel Resource, not on individual spans or log records. + +| Attribute | Semconv group | Stability | Example value | +|-----------|--------------|-----------|---------------| +| `service.name` | `service.*` | Stable | `payment-service` | +| `service.namespace` | `service.*` | Stable | `checkout` | +| `service.version` | `service.*` | Stable | `v2.4.1` | +| `deployment.environment` | `deployment.*` | Stable | `prod` | +| `cloud.provider` | `cloud.*` | Stable | `aws` | +| `cloud.region` | `cloud.*` | Stable | `ap-northeast-2` | +| `cloud.availability_zone` | `cloud.*` | Stable | `ap-northeast-2a` | +| `host.id` | `host.*` | Stable | `i-0a1b2c3d4e5f` | +| `k8s.pod.name` | `k8s.*` | Stable | `payment-7d9f4b-xk2rp` | +| `k8s.node.name` | `k8s.*` | Stable | `ip-10-0-1-45.ec2.internal` | +| `k8s.cluster.name` | `k8s.*` | Stable | `prod-ap2-cluster` | +| `container.id` | `container.*` | Stable | `a3f7c2d1e9b0...` (first 12 chars) | + +Note: for non-Kubernetes hosts use `host.id`; for Kubernetes workloads set both `k8s.pod.name` and `k8s.node.name`. `host.id` is not redundant — it maps to the underlying EC2/GCE instance for host-level pivot. + +### 2.2 Error span additional attributes + +Error spans (those where `status.code = ERROR`) MUST additionally carry code and exception attributes to enable the Code dimension pivot. + +| Attribute | Semconv group | Stability | Example value | +|-----------|--------------|-----------|---------------| +| `code.function` | `code.*` | Stable | `processPayment` | +| `code.filepath` | `code.*` | Stable | `src/payments/processor.ts` | +| `code.lineno` | `code.*` | Stable | `287` | +| `exception.type` | `exception.*` | Stable | `TimeoutException` | +| `exception.message` | `exception.*` | Stable | `Redis pool exhausted after 5000ms` | +| `exception.stacktrace` | `exception.*` | Stable | full stack trace string | + +Set these via the OTel SDK `span.recordException(e)` call, which populates all three `exception.*` attributes atomically. + +### 2.3 Structured log mandatory fields + +Every structured log record MUST carry these correlation keys. Without `trace_id`, log-trace join is impossible. Without `request_id`, user-support correlation is impossible. + +| Field | Source | Example value | +|-------|--------|---------------| +| `trace_id` | OTel SDK (current span context) | `4bf92f3577b34da6a3ce929d0e0e4736` | +| `span_id` | OTel SDK (current span context) | `00f067aa0ba902b7` | +| `request_id` | `x-request-id` header (injected at API gateway) | `req_7Kp2mNxQr` | +| `tenant.id` | W3C Baggage propagation (if multi-tenant) | `tnt_A9cBx3` | + +`tenant.id` is required only for multi-tenant services. Propagate via W3C Baggage per `resources/standards.md §W3C Baggage`. Do NOT carry user email or session tokens in baggage — see anti-patterns below. + +### 2.4 Propagation requirement + +W3C `traceparent` MUST propagate through every outbound HTTP and gRPC call. Stripping it silently is anti-pattern #1 in `resources/anti-patterns.md`. 
Verify propagation at every service boundary by checking that `trace_id` is consistent across services in a single request's log stream. + +--- + +## 3. Six-Dimension Narrowing Flow + +Execute the steps in order. Each step narrows the blast radius before the next. Do not skip steps — jumping to Code before confirming Region wastes time on red herrings. + +**Time budget**: 15 minutes total. Suggested split: symptom capture (1 min), trace_id acquisition (2 min), dimension pivots (8 min), cross-signal validation (3 min), release correlation + action (1 min). + +### Step 1 — Symptom capture + +Collect the following before touching any query tool: + +- **Time window**: incident start (± 2 min buffer) and current time. +- **Initial clue**: which of these does the reporter provide? + - `trace_id` — fastest path; jump directly to Step 3. + - `request_id` — search logs: `request_id = <value>` → extract `trace_id`. + - User/tenant identifier — search logs: `tenant.id = <value>` in the time window. + - Alert only — proceed to Step 2 via metrics. +- **Symptom class**: error rate spike, latency spike, OOM/crash, or silent data corruption? + +### Step 2 — Acquire trace_id + +If no `trace_id` was provided in Step 1, acquire one from metrics via exemplar: + +1. Open the alerting metric (e.g., `http_server_request_duration_seconds` p99 or `http_requests_total` error rate) for the incident time window. +2. Locate a data point near the spike peak. Most modern metric backends (Grafana Mimir, Prometheus with exemplars, Datadog APM metrics) attach a trace exemplar to high-value data points. +3. Click the exemplar → copy the `trace_id`. +4. If the backend does not expose exemplars, filter error logs by `deployment.environment = prod` and the time window, then extract `trace_id` from the first matching log record. + +### Step 3 — Narrow by dimension (coarse to fine) + +With `trace_id` in hand, execute pivots in this order: + +#### 3a. Region / Infra (`cloud.region`, `k8s.cluster.name`) + +Filter the trace or log stream by region first. Multi-region incidents are rarer than single-region ones; confirming scope is cheap. + +- Query: filter spans or logs where `cloud.region != ap-northeast-2` (or whichever region the alert fired for). If traces from other regions are healthy, the blast radius is single-region. +- If the issue is infra-wide (all regions): pivot to `k8s.cluster.name` to check whether a specific cluster is affected. + +#### 3b. Server (`host.id`, `k8s.pod.name`, `k8s.node.name`) + +Within the confirmed region, check whether the failure is pod-wide or node-wide. + +- A single pod failing repeatedly suggests a pod-level issue (OOM, file descriptor leak, bad canary rollout to that pod). +- Multiple pods on the same `k8s.node.name` suggests a node-level issue (disk pressure, noisy neighbor, kernel panic). +- Uniformly distributed across nodes suggests a service-wide or downstream issue. + +#### 3c. Service (`service.name`, `service.version`) + +With the server scope established, identify the service boundary where errors originate. + +- Inspect the trace waterfall: find the earliest span where `status.code = ERROR`. +- Record `service.name` and `service.version` from that span's resource attributes. +- If `service.version` changed recently (see Step 5), this is the primary suspect. + +#### 3d. 
Layer (`span.kind`, layer classification) + +Classify the failing span's network layer using `span.kind` and span name patterns: + +| `span.kind` value | Layer | Typical next pivot | +|---|---|---| +| `SERVER` or `CLIENT` with HTTP attributes | L7 application | code.function | +| `CLIENT` with `db.*` attributes | L7 DB call | traces.md DB patterns | +| `INTERNAL` with `messaging.*` | L7 messaging | check DLQ | +| eBPF-sourced span (Beyla/Pixie) | L4 transport | TCP retransmit metrics | +| Envoy/Istio proxy span | mesh | layers/mesh.md | + +For L3/L4 root causes (PMTUD black hole, TCP retransmit storm), the L7 error manifests as a connection timeout. The trace will show a CLIENT span with no child spans and a `net.peer.name` pointing to the downstream. Pivot to L3/L4 metrics from there. + +#### 3e. Code (`code.function`, `code.filepath`, `code.lineno`, `exception.stacktrace`) + +On the failing span, read the error attributes set by MRA §2.2: + +- `exception.type` and `exception.message` give the immediate cause. +- `exception.stacktrace` gives the call chain. +- `code.function` + `code.filepath` + `code.lineno` identify the exact source location without needing to search the codebase. + +Cross-check: if `exception.stacktrace` points to a library call (e.g., a Redis client timeout), the root cause is likely downstream resource exhaustion, not a code bug — continue to cross-signal validation. + +### Step 4 — Cross-signal validation + +With the failing span identified, validate the hypothesis across all available signals. Correlation is mandatory; without it, you may fix the wrong thing. + +| Signal | What to check | File reference | +|--------|--------------|----------------| +| metrics | Error rate and latency p99 trend at the confirmed `service.name` + `cloud.region`. Does the scale match the alert? | `signals/metrics.md` | +| traces | Full trace waterfall: where does latency accumulate? Where does `status.code` first become ERROR? | `signals/traces.md` | +| logs | Filter by `trace_id` across all services. Look for WARNING or ERROR logs from downstream services within ±5 seconds of the failing span's start time. | `signals/logs.md` | +| profiles | If available (Parca/Pyroscope): pull the CPU flame graph for the pod in the incident window. Look for unexpectedly hot functions or heap growth. | `signals/profiles.md` | + +All four signals should converge on the same root cause. If they diverge, re-examine the time window alignment (NTP drift is a common cause — see anti-patterns below). + +### Step 5 — Release correlation + +Compare the incident start time to recent deployments of the suspected `service.name`: + +| Window | Confidence of correlation | +|--------|--------------------------| +| `service.version` changed ≤ 30 minutes before incident | High — treat as primary hypothesis | +| ≤ 2 hours before incident | Medium — check changelog for risky changes | +| ≤ 24 hours before incident | Low — consider, but rule out other causes first | + +Release markers must be present as structured log events or metric annotations carrying `service.version`, `deployment.environment`, and the deployment SHA (propagated via `oma-scm`). Without release markers, this step is blind — see anti-patterns below. + +### Step 6 — Recover or rollback + +Based on Steps 1-5: + +- **Root cause identified, fix available**: deploy hotfix → verify metric recovery in the same time window. +- **Root cause identified, fix not immediate**: rollback to previous `service.version` → verify metric recovery. 
+- **Root cause identified in downstream service**: escalate to owning team with the `trace_id`, the failing span attributes, and the cross-signal evidence bundle. +- **Root cause unresolved after 15 minutes**: escalate with the narrowed blast radius (region/host/service confirmed) and all evidence collected so far. Do not continue solo investigation beyond this threshold without escalation. + +--- + +## 4. Vendor-Specific Query Examples + +Given a `trace_id` of `4bf92f3577b34da6a3ce929d0e0e4736`, the equivalent query in each backend: + +| Vendor | Interface | Query syntax | +|--------|-----------|-------------| +| Honeycomb | Query Builder | `trace:<id>` in the trace panel | +| Datadog APM | APM / Logs | `@trace_id:4bf92f3577b34da6a3ce929d0e0e4736` | +| Grafana Tempo | TraceQL | `{ trace:id = "4bf92f3577b34da6a3ce929d0e0e4736" }` | +| Jaeger | HTTP API | `GET /api/traces/4bf92f3577b34da6a3ce929d0e0e4736` | +| Sentry | Search | `trace_id:4bf92f3577b34da6a3ce929d0e0e4736` in Issues or Performance | +| SigNoz / OSS OTel | UI filter | trace_id filter field in Traces explorer | +| Elastic APM | Kibana | `trace.id: "4bf92f3577b34da6a3ce929d0e0e4736"` in Discover or APM | + +``` +# Honeycomb query builder (BubbleUp / Trace view) +trace:4bf92f3577b34da6a3ce929d0e0e4736 + +# Datadog logs query (also works in APM trace search) +@trace_id:4bf92f3577b34da6a3ce929d0e0e4736 + +# Grafana Tempo — TraceQL +{ trace:id = "4bf92f3577b34da6a3ce929d0e0e4736" } + +# Jaeger HTTP API (replace <host> with your Jaeger query host) +curl http://<host>:16686/api/traces/4bf92f3577b34da6a3ce929d0e0e4736 + +# Elastic (KQL in Discover or APM) +trace.id: "4bf92f3577b34da6a3ce929d0e0e4736" +``` + +For metric-to-trace pivot via exemplars (Step 2): + +``` +# Grafana Mimir / Prometheus with exemplars — PromQL +histogram_quantile(0.99, + sum by (le, service_name) ( + rate(http_server_request_duration_seconds_bucket{ + deployment_environment="prod", + cloud_region="ap-northeast-2" + }[5m]) + ) +) +# Click a data point spike → exemplar panel shows trace_id link +``` + +``` +# Datadog — metric to trace pivot +# In a metric graph, click a spike → "View related traces" opens APM with +# automatic time filter and service filter applied +``` + +--- + +## 5. Walkthrough Scenarios + +### Scenario A: ap-northeast-2 payment service 5xx spike + +Alert fires: `http_requests_total{status=~"5..", service_name="payment-service", cloud_region="ap-northeast-2"}` error rate crosses 5% burn-rate threshold. + +1. **Symptom capture**: time window 14:22-14:35 UTC. Alert only — no trace_id provided. Symptom class: error rate spike. +2. **Acquire trace_id**: open the alerting metric in Grafana Mimir. Click a spike data point at 14:24. Exemplar attached: `trace_id = 9a3f1c8e2b7d4e50a1c3f9d2b8e7a4c0`. +3. **Region pivot**: filter spans by `cloud.region = ap-northeast-2`. Spans from `ap-southeast-1` are healthy. Blast radius: single region. +4. **Server pivot**: error spans distributed across 3 of 6 pods, all on `k8s.node.name = ip-10-0-2-11.ec2.internal`. Potential node-level issue — note but continue to service pivot first. +5. **Service pivot**: earliest ERROR span is `service.name = payment-service`, `service.version = v2.4.1`. Downstream span to `redis-cache` shows CLIENT timeout with no response. +6. **Layer pivot**: `span.kind = CLIENT`, `db.system = redis`. Layer: L7 DB call. No L3/L4 trace artifacts; this is an application-layer Redis client timeout. +7. 
**Code pivot**: `exception.type = TimeoutException`, `exception.message = "Redis pool exhausted after 5000ms"`, `code.function = processPayment`, `code.filepath = src/payments/processor.ts`, `code.lineno = 287`. +8. **Cross-signal validation**: + - metrics: Redis connection pool saturation metric (`redis_pool_active_connections / redis_pool_max_connections`) at 100% from 14:21 UTC. + - logs: filter `trace_id = 9a3f1c8e2b7d4e50a1c3f9d2b8e7a4c0`. Log at 14:22: `WARN: connection pool at capacity (256/256)` from `payment-service`. + - traces: 8 concurrent `processPayment` spans all blocking on Redis CLIENT span — pool exhaustion pattern. + - profiles: not available for this service (profiling not yet instrumented). +9. **Release correlation**: `service.version = v2.4.1` deployed at 13:57 UTC — 25 minutes before incident. Release marker log entry found. Changelog shows connection pool size reduced from 512 to 256 in this version. +10. **Action**: rollback to `v2.4.0`. Error rate drops to baseline within 3 minutes. Hotfix: restore pool size to 512, add `redis_pool_wait_duration_seconds` histogram alert. + +### Scenario B: single tenant p99 latency spike + +Alert fires: p99 latency SLO burn-rate for `api-gateway` crosses 2x threshold. Reported by tenant support: `tenant.id = tnt_A9cBx3` experiencing slow responses. All other tenants are healthy. + +1. **Symptom capture**: time window 09:15-09:45 UTC. Tenant identifier provided: `tnt_A9cBx3`. Symptom class: latency spike, single tenant. +2. **Acquire trace_id**: search logs for `tenant.id = tnt_A9cBx3` in the window. First matching record: `trace_id = c7b2a1f4e8d3c9b0a2e4f1d7c3b8a5e2`. +3. **Region pivot**: `cloud.region = us-east-1` (this tenant's home region). Spans from other regions not affected. Blast radius: single region, single tenant. +4. **Server pivot**: error spans distributed evenly across all pods. Not a node-level issue. +5. **Service pivot**: latency spike originates in `service.name = tenant-config-service`, `service.version = v1.8.0` (unchanged for 2 weeks — not a release issue). +6. **Layer pivot**: `span.kind = CLIENT` with `db.system = postgresql`. Layer: L7 DB call. +7. **Code pivot**: no ERROR status — latency only. `span.duration = 812ms`. Span name: `SELECT tenant_config`. No exception attributes because the query succeeds, just slowly. +8. **Cross-signal validation**: + - metrics: `db_client_operation_duration_seconds` p99 for `tenant-config-service` at 800ms. Other services normal. + - traces: 47 sequential DB CLIENT spans within a single parent SERVER span. Each takes ~17ms. Total: 812ms. Classic N+1 query pattern. + - logs: `trace_id` filter shows no errors. Log at request start: `tenant.id = tnt_A9cBx3, config_keys_requested = 47`. + - profiles: Parca flame graph for `tenant-config-service` in the window. `tenantConfigLookup` accounts for 810ms of wall time. Call graph shows a loop issuing individual `findByKey` calls instead of a batch `findByKeys`. +9. **Release correlation**: no recent deployment. Root cause: N+1 query in `tenantConfigLookup` triggered by `tnt_A9cBx3` having 47 config keys (others have 5-12). The code path was never slow enough to surface with smaller tenants. +10. **Action**: fix `tenantConfigLookup` to use batch `SELECT ... WHERE key = ANY($1)`. Add a `db_query_count_per_request` histogram alert with threshold > 10 per span. Deploy and verify p99 for `tnt_A9cBx3` drops to < 50ms. 
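+For step 10, a sketch of the batched lookup that removes the N+1 pattern (asyncpg shown; table and column names are hypothetical): + +```python +# N+1 fix sketch: one round trip for all 47 keys instead of 47 serial queries. +import asyncpg + +async def tenant_config_lookup(pool: asyncpg.Pool, tenant_id: str, keys: list[str]) -> dict: +    async with pool.acquire() as conn: +        # Before: one findByKey query per key inside a loop (~17 ms each, serial). +        rows = await conn.fetch( +            "SELECT key, value FROM tenant_config WHERE tenant_id = $1 AND key = ANY($2)", +            tenant_id, keys, +        ) +    return {r["key"]: r["value"] for r in rows} +``` 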
+ +Reference: `boundaries/multi-tenant.md` for per-tenant pivot strategy; `signals/profiles.md` for Parca/Pyroscope flamegraph correlation. + +### Scenario C: OOM crash after deployment + +Alert fires: pod restart storm on `rendering-service` pods. `k8s.pod.name` shows 3 of 12 pods (25%) restarting with OOMKilled reason. The 3 affected pods are canary instances (Flagger progressive delivery). + +1. **Symptom capture**: time window 16:40-17:00 UTC. Alert only. Symptom class: OOM/crash on canary pods. +2. **Acquire trace_id**: OOMKilled pods produce no useful traces after crash. Pivot to release event first. +3. **Release correlation** (Step 5 first, by exception): release marker log entry at 16:35 UTC: `service.name = rendering-service`, `service.version = v3.1.0`, `deployment.strategy = canary`. 5 minutes before OOM storm — high confidence correlation. +4. **Region/Server pivot**: `cloud.region = eu-west-1`, `k8s.node.name` spread across 3 different nodes — not a node issue. All 3 OOM pods are on `service.version = v3.1.0`. Stable pods on `v3.0.9` are healthy. +5. **Layer pivot**: OOM is a host/runtime event, not an application span. No span to inspect. Pivot directly to profiles. +6. **Cross-signal validation**: + - metrics: `container_memory_working_set_bytes` for `v3.1.0` pods ramps from 200MB to 1.5GB (container limit) over 18 minutes. `v3.0.9` pods stable at 200MB. + - traces: no useful traces from crashed pods. Pre-crash traces show `renderTemplate` CLIENT spans with duration increasing over time. + - logs: pre-crash logs from `v3.1.0` pods: `WARN: template cache size = 412MB at 16:52`. Not present in `v3.0.9` pods. + - profiles: Pyroscope heap diff between `v3.1.0` (pod alive, approaching limit) and `v3.0.9`. New allocation in `renderTemplate` retaining 400MB — a template object cache introduced in `v3.1.0` is not evicting entries. +7. **Flagger auto-rollback check**: Flagger's canary analysis `successRate` threshold is 99%. OOMKilled pods were restarting before requests failed — success rate stayed above threshold. Flagger did NOT auto-rollback. +8. **Action**: manual rollback of the canary to `v3.0.9` by reverting the target Deployment (`kubectl rollout undo deployment/rendering-service`). OOM storm stops within 2 minutes. Hotfix: add LRU eviction to template cache with `maxSize = 50MB`. Adjust Flagger analysis to include memory saturation metric as a custom metric gate. + +Reference: `boundaries/release.md` for Flagger/Argo canary analysis configuration; `signals/profiles.md` for Pyroscope heap diff workflow; `layers/L7-application/crash-analytics.md` for mobile/native crash-specific forensics. + +--- + +## 6. Integration Points + +This file does not operate in isolation. The following skill files provide the detailed implementation guidance for each step of the narrowing flow. 
+ +| Step / Concern | Cross-reference | +|---|---| +| Per-tenant pivot (Scenario B, `tenant.id` baggage) | `boundaries/multi-tenant.md` — 4-tier isolation and per-tenant collector routing | +| Mesh trace continuity (mesh span injection, zero-code) | `layers/mesh.md` — Envoy span propagation, baggage scrubbing at ingress | +| Flamegraph acquisition (Scenario B profiles, Scenario C heap diff) | `signals/profiles.md` — Parca/Pyroscope per-tenant labeling, Pyroscope diff view | +| Canary/rollback gates (Scenario C Flagger analysis) | `boundaries/release.md` — Flagger + Argo Rollouts custom metric gates, GitOps | +| Mobile/native crash OOM forensics | `layers/L7-application/crash-analytics.md` — CFR, symbolication, crash-linked release tracking | +| Release marker injection | `boundaries/release.md` — deployment SHA → `service.version` + marker event via `oma-scm` | +| Semconv stability for MRA attributes | `resources/standards.md §3` — stability tier table | +| Full signal-layer-boundary coverage | `resources/matrix.md` — 112-cell coverage map | + +--- + +## 7. Anti-patterns + +Falling into these anti-patterns makes the narrowing flow impossible or misleading. Full list lives in `resources/anti-patterns.md`. + +| Anti-pattern | Consequence | Dimension blocked | +|---|---|---| +| Missing `trace_id` in structured logs | Log-trace join impossible; Step 4 cross-signal validation fails | All | +| `request_id` not injected at API gateway | User-support correlation impossible; Step 1 has no starting point | All | +| No release markers in log/metric stream | Step 5 release correlation is blind; rollback decision is guesswork | Service / release | +| NTP drift unmonitored on nodes | Waterfall chart timestamps are wrong; parent-before-child ordering unreliable; Step 3 pivots mislead | Layer / Code | +| PII in W3C Baggage across trust boundaries | `tenant.id` is acceptable; user email or session token in baggage is a GDPR/PIPA violation that becomes evidence in incident investigation records | All (compliance risk) | +| `user.id` or `user.email` as a metric label | High-cardinality metric explosion + PII in metric storage; cardinality guardrails in `resources/meta-observability.md` | Service | +| Canary Flagger analysis missing memory saturation gate | OOMKilled pods do not fail the success rate threshold; auto-rollback never triggers (Scenario C) | Service / infra | + +Normative anti-patterns reference: `resources/anti-patterns.md §Section Z` (items 17, 18 directly referenced above). diff --git a/.agents/skills/oma-observability/resources/intent-rules.md b/.agents/skills/oma-observability/resources/intent-rules.md new file mode 100644 index 0000000..a6a7bf2 --- /dev/null +++ b/.agents/skills/oma-observability/resources/intent-rules.md @@ -0,0 +1,325 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +--- + +# Intent Classification Rules + +## Purpose + +- Classify a user query into one of 7 intents to route to the right resources +- Used by Step 1 of `execution-protocol.md` +- Bilingual keyword matching (Korean + English) for `.agents/hooks/core/triggers.json` compatibility + +## Classification Priority + +1. **Override flags** — always win, skip classification entirely (e.g., `--investigate`, `--tune`) +2. **Keyword pattern matching** — scan query for intent-specific keywords +3. **Signal detection** — contextual clues (tool names, error messages, metric names, cloud regions) +4. 
**Fallback** — `investigate + tune` parallel when no clear signal + +## Override Flags + +| Flag | Forced Intent | +|------|--------------| +| `--setup` | `setup` | +| `--migrate` | `migrate` | +| `--investigate` | `investigate` | +| `--alert` | `alert` | +| `--trace` | `trace` | +| `--tune` | `tune` | +| `--route` | `route` | + +--- + +## The 7 Intents + +### Intent 1: `setup` + +Bootstrap a new observability pipeline or instrument a new service from scratch. +Use when the user is starting fresh: no existing pipeline, first-time SDK integration, or onboarding a new environment. + +#### Keywords + +| Language | Keywords | +|----------|----------| +| English | setup, install, bootstrap, instrument, configure, onboard, initialize, start, enable, add, integrate, begin, getting started, first time | +| Korean | 설치, 설정, 구성, 도입, 계측, 시작, 초기 구성, 셋업, 세팅, 추가, 연동, 처음, 시작하기, 온보딩 | +| Abbreviations / synonyms | init, SDK setup, OTel setup, collector setup, agent install | + +**Signals:** +- "How do I add OTel to ..." queries +- Service name + "for the first time" +- New environment or cluster name mentioned with no existing config + +#### Example Queries + +- "Set up OTel stack on our k8s cluster" +- "Instrument our Node.js backend with OTel" +- "쿠버네티스에 OTel 도입하고 싶어" +- "백엔드 계측 어떻게 시작해?" +- "How do I enable tracing on a new Spring Boot service?" + +#### Primary Route + +`resources/vendor-categories.md` → category selection; `resources/transport/collector-topology.md` for Kubernetes + +#### Secondary Considerations + +`resources/standards.md` for semconv requirements before first instrumentation commit + +--- + +### Intent 2: `migrate` + +Move from a legacy tool to a modern equivalent. Applies to logging agents, APM platforms, and deprecated CNCF projects. + +#### Keywords + +| Language | Keywords | +|----------|----------| +| English | migrate, migration, transition, move, replace, upgrade, modernize, switch, deprecate, port, convert, off-board, phase out | +| Korean | 마이그레이션, 이주, 이전, 전환, 교체, 업그레이드, 현대화, 옮기기, 대체, 포팅, 전환 작업, 탈피 | +| Abbreviations / synonyms | Fluentd → Fluent Bit, legacy APM → OTel, lift-and-shift, rip-and-replace | + +**Signals:** +- Source tool name (Fluentd, New Relic, Datadog, old APM) + destination intent +- "Replace", "away from", "moving off of" +- CNCF deprecation context (Fluentd 2025-10) + +#### Example Queries + +- "Migrating Fluentd to Fluent Bit per CNCF guidance" +- "Move from New Relic to OpenTelemetry" +- "Fluentd에서 Fluent Bit으로 이주 작업 중이야" +- "기존 APM을 OTel 기반으로 전환하고 싶어" +- "How do I port StatsD metrics to OTLP?" + +#### Primary Route + +`resources/vendor-categories.md §(h) Log Pipeline` (deprecation notes); CNCF 2025-10 Fluentd migration guide + +#### Secondary Considerations + +`resources/transport/otlp-grpc-vs-http.md` for destination protocol selection during migration + +--- + +### Intent 3: `investigate` + +Production incident or bug root-cause analysis. Use when something is broken, degraded, or unexpectedly slow right now. 
+ +#### Keywords + +| Language | Keywords | +|----------|----------| +| English | investigate, debug, diagnose, incident, outage, postmortem, forensics, root-cause, why, broken, failing, spike, degraded, 5xx, error rate, latency, slow, down, regression, N+1, slow query, connection pool, pool exhaustion, db span, memory leak, OOM, flame graph, stuck | +| Korean | 조사, 분석, 장애, 원인, 부검, 디버깅, 왜, 문제, 에러, 고장, 5xx, 에러율, 지연, 느림, 다운, 장애 분석, 원인 파악, N+1, 슬로우 쿼리, 커넥션 풀, 풀 고갈, 메모리 누수, OOM, 플레임 그래프 | +| Abbreviations / synonyms | RCA, postmortem, blameless review, p99 spike, tail latency, DB timeout | + +**Signals:** +- HTTP status codes (5xx, 4xx) or error rate numbers +- Cloud region name mentioned with a problem symptom +- Service name + "suddenly", "again", "just started" +- Time window phrase ("since 14:00", "after deploy") + +#### Example Queries + +- "5xx spike in ap-northeast-2, need to find root cause" +- "Why is checkout service p99 high?" +- "특정 테넌트만 느린데 왜 그런지 조사해줘" +- "결제 서비스 에러율 갑자기 올라감, 원인 분석" +- "Auth service started timing out after today's release — help" + +#### Primary Route + +`resources/incident-forensics.md` (MRA + 6-dimension localization flow) + +#### Secondary Considerations + +`resources/signals/traces.md`, `resources/signals/logs.md`, `resources/boundaries/multi-tenant.md` (if single-tenant degradation pattern) + +--- + +### Intent 4: `alert` + +Define alerts, SLO burn-rate rules, or monitor configuration. Use when the user wants to be notified proactively before or during an incident. + +#### Keywords + +| Language | Keywords | +|----------|----------| +| English | alert, alarm, notification, burn-rate, SLO, SLI, SLA, threshold, page, warn, monitor, PrometheusRule, alerting rule, error budget, firing | +| Korean | 알람, 알림, 경보, 임계치, SLO, SLA, 페이지, 경고, 번레이트, 모니터, 에러 버짓, 알람 설정, 발화 | +| Abbreviations / synonyms | burn rate alert, fast burn, slow burn, PD (PagerDuty), firing rule, recording rule | + +**Signals:** +- SLO/SLI/SLA acronyms +- "Alert when ...", "notify me if ..." +- PrometheusRule, Alertmanager, or burn-rate framing +- Error budget percentage mentioned + +#### Example Queries + +- "Set up SLO burn-rate alert for payment service" +- "Need alerts when error budget is burning fast" +- "결제 서비스 에러 버짓 번레이트 알람 설정" +- "PrometheusRule로 에러율 알람 만들어줘" +- "How do I configure a slow-burn SLO alert in Grafana?" + +#### Primary Route + +`resources/boundaries/slo.md`; `resources/observability-as-code.md` (PrometheusRule CRD, OpenSLO YAML) + +#### Secondary Considerations + +`resources/meta-observability.md §Section F` for meta-observability pipeline health alerts + +--- + +### Intent 5: `trace` + +Design or debug distributed tracing: propagators, baggage, cross-service context, or mesh trace continuity. + +#### Keywords + +| Language | Keywords | +|----------|----------| +| English | trace, tracing, propagator, traceparent, tracestate, baggage, span, distributed, context propagation, correlation, W3C, X-Amzn-Trace-Id, context break, missing spans | +| Korean | 트레이싱, 추적, 전파, traceparent, 스팬, 분산, 컨텍스트, 상관관계, 전파자, 트레이스 끊김, 스팬 누락 | +| Abbreviations / synonyms | W3C Trace Context, B3, Zipkin, X-Ray trace, OpenTelemetry tracing, end-to-end trace | + +**Signals:** +- "Trace breaks", "missing parent span", "trace not showing" +- Propagator format names (W3C, B3, X-Ray, Jaeger) +- Mesh or gateway name + trace context question +- Baggage field name mentioned + +#### Example Queries + +- "Our traces break at the Istio ingress — how to propagate context?" 
+- "Design baggage for multi-tenant trace correlation" +- "Istio 들어가면 트레이스가 끊어짐, 전파 어떻게 해?" +- "테넌트 ID baggage로 전파할 때 주의점" +- "How do I bridge AWS X-Ray trace headers into W3C Trace Context?" + +#### Primary Route + +`resources/boundaries/cross-application.md` (propagator matrix); `resources/layers/mesh.md` (zero-code auto-instrumentation) + +#### Secondary Considerations + +`resources/signals/privacy.md` (baggage PII rules — no user identifiers in traceparent/baggage without redaction) + +--- + +### Intent 6: `tune` + +Optimize performance, reduce cost, tame cardinality, configure sampling, or fix throughput bottlenecks in the telemetry pipeline. + +#### Keywords + +| Language | Keywords | +|----------|----------| +| English | tune, optimize, performance, throughput, cost, cardinality, sampling, budget, reduce, bandwidth, MTU, rate-limit, too much, expensive, overhead, head sampling, tail sampling, drop | +| Korean | 튜닝, 최적화, 성능, 처리량, 비용, 카디널리티, 샘플링, 예산, 줄이기, 대역폭, MTU, 너무 많음, 비쌈, 오버헤드, 헤드 샘플링, 테일 샘플링 | +| Abbreviations / synonyms | high cardinality, metric explosion, bill shock, DPM (data points per minute), ingest cost, backpressure | + +**Signals:** +- Cost number with "jumped", "tripled", "too high" +- Cardinality, label explosion, or DPM framing +- UDP MTU or OTLP throughput question +- Sampling ratio or rate-limit configuration + +#### Example Queries + +- "Datadog bill jumped 3x — need to reduce cardinality" +- "UDP statsd throughput is low at peak" +- "Datadog 비용 3배 뛰었음, 카디널리티 줄여야 해" +- "테일 샘플링 레시피 추천" +- "How do I set a cost-aware tail sampling policy for the checkout service?" + +#### Primary Route + +`resources/transport/` (all 4 files: `udp-statsd-mtu.md`, `otlp-grpc-vs-http.md`, `collector-topology.md`, `sampling-recipes.md`); `resources/meta-observability.md §Cardinality` + +#### Secondary Considerations + +`resources/vendor-categories.md` for alternative tool selection when current vendor is causing the cost spike + +--- + +### Intent 7: `route` + +Multi-tenant, multi-cloud, or multi-region telemetry routing, isolation, data residency, or federation. + +#### Keywords + +| Language | Keywords | +|----------|----------| +| English | route, routing, multi-tenant, multi-cloud, region, residency, isolation, segregation, gateway, fan-out, federation, data locality, GDPR residency, pipeline split, tenant routing | +| Korean | 라우팅, 멀티테넌트, 멀티클라우드, 리전, 데이터 거주, 격리, 분리, 페더레이션, 게이트웨이, 팬아웃, 테넌트 라우팅 | +| Abbreviations / synonyms | data sovereignty, geo-fencing, per-tenant collector, routing_connector, OTel Collector routing | + +**Signals:** +- Specific cloud region name + compliance or residency requirement +- "Tenant A vs Tenant B" pipeline separation +- Multiple collectors or clusters mentioned with routing context +- GDPR, PIPA, or data sovereignty regulation cited + +#### Example Queries + +- "Route tenant A telemetry to ap-northeast-2 only (GDPR residency)" +- "Federated collectors across 3 k8s clusters" +- "KR 데이터 거주 요건으로 리전별 Collector 분리" +- "엔터프라이즈 테넌트 전용 Collector 분리" +- "How do I fan-out logs to two different Loki stacks by tenant tier?" 
+ +#### Primary Route + +`resources/boundaries/multi-tenant.md`; `resources/transport/collector-topology.md` + +#### Secondary Considerations + +`resources/boundaries/cross-application.md` for trust boundary enforcement when routing crosses application domains + +--- + +## Ambiguous / Fallback + +When no intent is detected with confidence: + +- Default to `investigate + tune` in parallel (observability work is often both diagnosing a problem and reducing noise) +- Present both result sets; let the user pick +- If automation is critical (e.g., CI pipeline), require the user to pass an explicit flag + +**Ambiguity resolution examples:** + +| Query | Detected Intent | Reason | +|-------|----------------|--------| +| "OTel Collector 설정" | `setup` (or `tune` if already deployed) | "설정" matches setup; check for existing deployment context | +| "p99 높음, 왜 그럼?" | `investigate` | Problem symptom + causal question | +| "샘플링 어떻게 해?" | `tune` | Sampling = throughput/cost optimization | +| "테넌트 데이터 분리" | `route` | Isolation + tenant framing | +| "SLO 알람 설정" | `alert` | SLO + alarm keywords | +| "Fluentd 버리고 싶어" | `migrate` | Deprecation + replacement intent | +| "Istio 트레이스 전파" | `trace` | Mesh + trace context propagation | +| "OTel 뭔가 이상함" (no detail) | `investigate + tune` fallback | No specific signal; dispatch parallel | + +--- + +## Matching Algorithm + +1. Lower-case both query and keyword list +2. For each intent, count keyword hits (whole-word boundary match) +3. Pick the intent with the highest score +4. On a tie, apply tiebreak priority: `investigate` > `setup` > `tune` > `alert` > `trace` > `route` > `migrate` (ordered by business-incident impact) +5. If all scores are zero: fall back to `investigate + tune` parallel with a clarification prompt + +--- + +## Integration with Hooks + +- Keywords from each intent feed `.agents/hooks/core/triggers.json` for auto-detection at `UserPromptSubmit` +- Intent classification result is consumed by `resources/execution-protocol.md §Step 1` to select the primary resource file set +- Override flags (`--investigate`, etc.) bypass keyword scoring entirely and force the named intent diff --git a/.agents/skills/oma-observability/resources/layers/L3-network.md b/.agents/skills/oma-observability/resources/layers/L3-network.md new file mode 100644 index 0000000..8af6e41 --- /dev/null +++ b/.agents/skills/oma-observability/resources/layers/L3-network.md @@ -0,0 +1,281 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +--- + +# L3-Network Layer Observability + +## 1. Scope + +**In scope**: IP routing, VPC flow logs, ICMP (reachability and PMTUD), Path MTU Discovery, and — for organizations with their own Autonomous System Number (ASN) — BGP/BMP inter-AS routing monitoring. + +**Out of scope**: L1 physical (NIC, cable, IPMI) and L2 data-link (MAC, VLAN, STP). Those domains belong to vendor DCIM tooling (Nlyte, Sunbird, Device42). See `../SKILL.md §Out of Scope` for the full boundary declaration. + +**SaaS / cloud-native teams**: the primary artifact is cloud-provider VPC flow logs (Section 3). BGP/BMP (Section 6) applies only to ISPs, CDNs, and organizations that announce their own prefixes. SaaS-only teams using only VPC routing can skip Section 6. + +--- + +## 2. OTel Semantic Conventions Relevant to L3 + +The following attributes from the `network.*` group are **Stable** (semconv 1.27.0) and safe for production instrumentation. Reference: `../standards.md §3` for the full stability tier table. 
+ +| Attribute | Type | Example values | Notes | +|-----------|------|----------------|-------| +| `network.type` | string enum | `ipv4`, `ipv6` | Stable; always set on socket-level spans | +| `network.transport` | string enum | `tcp`, `udp`, `quic` | Stable; distinguish protocol at L4 framing | +| `network.local.address` | string | `10.0.1.5` | Stable; local IP of the socket | +| `network.local.port` | int | `443` | Stable; local port | +| `network.peer.address` | string | `192.0.2.10` | Stable; remote IP — treat as PII, see Section 7 | +| `network.peer.port` | int | `8080` | Stable; remote port | +| `network.protocol.name` | string | `http`, `grpc` | Stable; application protocol atop L3/L4 | + +Note: `network.connection.*` (e.g., `network.connection.type`, `network.connection.subtype`) are **Development** tier and should not be used in production SLOs. Reference: <https://opentelemetry.io/docs/specs/semconv/attributes-registry/network/> + +OTel does not define BGP-specific semconv. BGP monitoring uses a separate pipeline (Section 6) that is explicitly outside the OTLP ecosystem. + +--- + +## 3. Cloud VPC Flow Logs + +VPC flow logs capture IP-layer traffic metadata: source address, destination address, bytes, packets, protocol, and firewall action. They are the primary L3 observability artifact for cloud-native teams. + +### 3.1 Per-Cloud Collection Pipeline + +| Cloud | Flow log source | Recommended OTel receiver | Key config | +|-------|----------------|--------------------------|------------| +| AWS | VPC Flow Logs → CloudWatch Logs or S3 | `cloudwatchlogsreceiver` or `filelogreceiver` (S3 path) | Enable `pkt-srcaddr`, `pkt-dstaddr`, `bytes`, `packets`, `action`, `protocol` fields in flow log format v3+ | +| GCP | VPC Flow Logs → Cloud Logging | `googlecloudpubsubreceiver` (via Pub/Sub export sink) | Configure aggregation interval (5s–15min); lower interval increases cost | +| Azure | NSG Flow Logs v2 → Storage Account → Event Hubs | `kafkareceiver` (Event Hubs uses Kafka protocol) | Enable Traffic Analytics for enriched flow data | + +References: +- AWS: <https://docs.aws.amazon.com/vpc/latest/userguide/flow-logs.html> +- GCP: <https://cloud.google.com/vpc/docs/flow-logs> +- Azure: <https://learn.microsoft.com/azure/network-watcher/nsg-flow-logs-overview> + +### 3.2 Key Flow Log Fields + +| Field | AWS name | GCP name | Azure name | Notes | +|-------|----------|----------|------------|-------| +| Source IP | `srcaddr` | `src_ip` | `sourceAddress` | PII — see Section 7 | +| Destination IP | `dstaddr` | `dest_ip` | `destinationAddress` | PII — see Section 7 | +| Source port | `srcport` | `src_port` | `sourcePort` | | +| Destination port | `dstport` | `dest_port` | `destinationPort` | | +| Protocol | `protocol` | `protocol` | `protocol` | IANA protocol number | +| Bytes | `bytes` | `bytes_sent` | `bytesForwardedDenied` + `bytesForwardedAllowed` | Used for egress cost attribution | +| Packets | `packets` | `packets_sent` | `packetsForwardedAllowed` | | +| Firewall action | `action` | N/A (use firewall logs) | `trafficType` | `ACCEPT` / `REJECT` — audit signal | +| Start/end time | `start`, `end` | `start_time`, `end_time` | `startTime`, `endTime` | | + +### 3.3 Privacy Note + +`srcaddr` and `dstaddr` (and their equivalents) are IP addresses. Under GDPR Article 4(1) and PIPA, IP addresses linked to natural persons are personal data. Before long-term retention: + +- Truncate the last octet for IPv4 (e.g., `203.0.113.0/24` prefix preservation). 
+- Hash with a rotating salt (SHA-256 minimum) for per-flow pseudonymization. +- Do not store raw IP addresses in shared observability backends unless the tenant has explicit lawful basis. + +Cross-reference: `../signals/privacy.md §IP addresses` for the full masking decision tree. + +--- + +## 4. PMTUD — Path MTU Discovery + +### 4.1 Mechanism + +Path MTU Discovery (PMTUD, RFC 1191 for IPv4; RFC 8201 for IPv6) allows a sender to discover the maximum transmission unit supported along an end-to-end path without fragmentation. RFC 8899 defines Datagram PLPMTUD (DPLPMTUD), a transport-level variant used by QUIC and SCTP that does not rely on ICMP. + +IPv4 PMTUD relies on ICMP Type 3 Code 4 ("Fragmentation Needed, DF set") messages returned by intermediate routers when a packet is too large and the DF (Don't Fragment) bit is set. The sender reduces its segment size upon receiving this ICMP message. + +### 4.2 PMTUD Black Hole + +**Root cause**: firewall rules that block ICMP Type 3 Code 4 prevent the sender from receiving the path MTU signal. The sender continues transmitting oversized segments that are silently dropped. + +**Symptom pattern**: +- Small payloads (< 576 bytes) succeed; large payloads (> path MTU) stall or fail. +- TCP connection establishes (SYN/SYN-ACK are small) but bulk data transfer hangs. +- HTTP responses for large objects time out; health checks (small packets) pass. + +**Detection in flow logs**: look for sessions with normal packet counts but abnormally low byte counts per session; correlate with application-layer timeout logs. + +**Remediation options**: +1. **MSS clamping** (TCP Maximum Segment Size): configure at the firewall or router to clamp MSS to `interface_MTU - 40` (IPv4) or `interface_MTU - 60` (IPv6). This is the preferred fix for VPN and tunnel endpoints. +2. **Explicit MTU pinning**: set the outgoing interface MTU on the host to a conservative value (e.g., 1452 for PPPoE, 1422 for GRE over IPSec). +3. **ICMP passthrough**: allow ICMP Type 3 Code 4 through security groups and firewalls. Verify with: `ping -M do -s 1472 <destination>`. + +Cross-reference: `../transport/udp-statsd-mtu.md` for related UDP MTU thresholds (1472 IPv4, 1452 IPv6, 8192 UDS) that interact with the same path MTU constraints. + +--- + +## 5. ICMP Observability + +### 5.1 Active Monitoring Tools + +| Tool | Signal type | Use case | +|------|------------|---------| +| Blackbox Exporter (Prometheus) | ICMP echo probe | Synthetic availability from a vantage point | +| SmokePing | ICMP RTT time series | Long-term latency trend and packet-loss distribution | +| `ping` (scripted) | ICMP echo RTT | Ad hoc reachability during incident investigation | +| `traceroute` / `mtr` | ICMP TTL-exceeded path | Hop-by-hop latency and packet-loss localization | + +### 5.2 Cloud Provider ICMP Constraints + +| Cloud | ICMP behavior | Notes | +|-------|--------------|-------| +| AWS | ICMP echo blocked by default on security groups | Explicitly allow inbound ICMP Type 8 (echo request) for Blackbox Exporter probes to work | +| GCP | ICMP echo allowed by default in default VPC firewall rules | Review custom VPC firewall rules | +| Azure | ICMP echo allowed; NSG allows by default for same-VNet | Blocked at NSG if explicit deny rule added | + +ICMP Rate limits: all major clouds rate-limit ICMP responses from managed infrastructure (e.g., NAT gateways, load balancers). Do not interpret ICMP loss at these endpoints as a network fault; correlate with TCP-level application metrics. 
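+A minimal sketch of the scripted `ping` probe from the table above, applied to PMTUD verification during an incident (assumes Linux iputils `ping`; a 1472-byte payload targets a standard 1500-byte MTU): + +```python +# PMTUD spot check: probe with DF-flagged payloads of decreasing size. +# A result below 1472 on a nominal 1500-byte path suggests a tunnel in the +# path or a PMTUD black hole. +import subprocess + +def max_df_payload(host: str, sizes=(1472, 1452, 1422, 1372)) -> int | None: +    for size in sizes: +        result = subprocess.run( +            ["ping", "-M", "do", "-c", "3", "-s", str(size), host], +            capture_output=True, +        ) +        if result.returncode == 0: +            return size  # largest DF payload that survived the path +    return None  # even the smallest probe failed: likely a full ICMP block +``` 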
+ +### 5.3 ICMP in OTel + +OTel does not define ICMP-specific semconv. Use the Blackbox Exporter's native Prometheus metrics (`probe_success`, `probe_duration_seconds`, `probe_icmp_reply_hop_limit`) and scrape them via the OTel Collector `prometheusreceiver`. Tag probes with `network.peer.address` for correlation. + +--- + +## 6. BGP / Inter-AS Routing (Advanced Subsection) + +**Scope boundary**: this subsection applies to **ISPs, CDNs, and organizations that own and operate an Autonomous System (ASN)** and announce their own IP prefixes via BGP. SaaS-only teams that rely entirely on cloud-provider VPC routing can skip this subsection. + +OTel does not define BGP semconv. BGP monitoring is a parallel pipeline ecosystem documented here for completeness. It does not use OTLP as the transport. + +### 6.1 BGP Monitoring Protocol (BMP) + +BMP (RFC 7854, <https://www.rfc-editor.org/rfc/rfc7854>) is the standard protocol for exporting real-time BGP session state, RIB (Routing Information Base) snapshots, and route change events from a BGP speaker to a monitoring collector. + +**BMP exporters** (BGP speakers that support BMP): + +| Platform | BMP support | +|----------|------------| +| FRRouting (OSS) | Native BMP export (`bmp` config stanza) | +| Cisco IOS-XR | Native BMP (`router bgp ... bmp server`) | +| Juniper Junos | Native BMP (`protocols bgp monitoring-protocol bmp`) | +| BIRD 2.x | BMP via `bmp` protocol block | + +### 6.2 BMP Collectors and Aggregators + +| Tool | Role | Notes | +|------|------|-------| +| OpenBMP | Open-source BMP collector + PostgreSQL/ClickHouse backend | Reference implementation; supports MOAS detection | +| pmacct | Multi-purpose network accounting; BMP + BGP support | Used at large ISPs; outputs to Kafka, InfluxDB, Elasticsearch | +| SNAS (Streaming Network Analytics System) | BMP → Kafka pipeline; CAIDA-affiliated | Suitable for research and large-scale ISP deployments | +| GoBMP | Lightweight Go BMP collector | Good for parsing and forwarding to custom pipelines | + +### 6.3 Recommended Pipeline + +``` +BGP Speaker (FRR / IOS-XR / Junos) + | BMP (RFC 7854, TCP port 11019) + v +BMP Collector (OpenBMP / pmacct / GoBMP) + | Kafka topic: bgp.updates + v +Stream Processor (Kafka Streams / Flink) + | enrichment: RPKI ROA lookup, MOAS detection + v +ClickHouse (columnar storage for BGP RIB history) + | + v +Grafana (BGP topology dashboards, hijack alerts) +``` + +This pipeline is **not OTel**. It operates independently of OTLP. The two pipelines (OTel metrics/logs/traces and BGP BMP) share only the storage and visualization layer (ClickHouse + Grafana). + +### 6.4 Security Observability: Hijack and Route Leak Detection + +#### BGP Hijack — MOAS Detection + +A BGP hijack occurs when a rogue AS announces a prefix it does not legitimately own, diverting traffic. Multiple Origin AS (MOAS) detection identifies prefixes announced by more than one origin AS simultaneously. + +Detection: compare real-time BMP route updates against a known-good origin AS database (your own RPKI ROAs + IRR records). Alert on any observed origin AS that does not match the expected set. + +Tools: OpenBMP includes MOAS detection; BGPalerter (<https://github.com/nttgin/BGPalerter>) provides real-time alerting on MOAS, route leaks, and more-specific prefix announcements. + +#### Route Leaks + +A route leak occurs when a route received from one BGP neighbor is re-announced to another neighbor in violation of routing policy (e.g., customer routes leaked to a peer). 
Detection requires policy-aware analysis of AS_PATH attributes. + +Tool: ARTEMIS (<https://github.com/FORTH-ICS-INSPIRE/artemis>) provides automated detection and mitigation orchestration for both hijacks and route leaks. + +#### RPKI-ROV (Route Origin Validation) + +RPKI (Resource Public Key Infrastructure) and ROA (Route Origin Authorization) records bind a prefix to an authorized origin AS with a cryptographic signature. ROV (Route Origin Validation) is the process of checking incoming BGP announcements against the RPKI trust anchor. + +**Verify your own prefixes**: ensure all prefixes you announce have valid ROA records in your RIR (ARIN, RIPE NCC, APNIC, LACNIC, AFRINIC). An announced prefix without an ROA is marked "Not Found" by downstream validators — not "Invalid", but not cryptographically anchored either. + +**Enforce ROV on your router**: configure BGP to drop or de-prefer routes with RPKI Invalid status. Reference NIST SP 800-189 (<https://csrc.nist.gov/publications/detail/sp/800-189/final>) for deployment guidance. + +**Tools**: Cloudflare Radar RPKI dashboard (<https://radar.cloudflare.com/routing/rpki>) provides global visibility into ROA coverage and RPKI Invalid route counts by ASN. + +### 6.5 Public BGP Feeds for Baselining + +| Feed | Provider | Use | +|------|----------|-----| +| RIPE RIS (Routing Information Service) | RIPE NCC | Global BGP route collector data; MRT format | +| RouteViews | University of Oregon / CAIDA | Historical BGP table snapshots; MRT format | +| BGPStream | CAIDA | Streaming API over RIPE RIS + RouteViews; Python/C library | +| Cloudflare Radar | Cloudflare | Public BGP anomaly detection; hijack and outage notifications | + +Use these feeds to baseline what your prefixes look like to the global routing table and to cross-check your own RPKI ROA coverage. + +--- + +## 7. Matrix Cross-Reference (L3 Row) + +The L3 row of `../matrix.md` covers 28 cells (4 boundaries × 7 signals). The following summarizes the coverage status and primary artifacts: + +| Boundary | metrics | logs | traces | profiles | cost | audit | privacy | +|----------|---------|------|--------|----------|------|-------|---------| +| multi-tenant | ✅ per-tenant egress byte/packet counters from VPC flow logs | ✅ VPC flow stream tagged by tenant CIDR | ⚠️ trace-ID egress tagging only; no native L3 trace context | ❌ N/A | ⚠️ egress byte attribution as cost proxy | ✅ VPC flow audit trail tagged by tenant | ⚠️ IP addresses are PII (GDPR/PIPA) — mask before retention | +| cross-application | ✅ inter-VPC peering flow metrics | ✅ VPC flow logs across peering / transit gateway | ⚠️ socket 5-tuple correlation to L7 spans only | ❌ N/A | ⚠️ cross-VPC egress cost proxy | ✅ inter-VPC flow audit for SOC2 network controls | ⚠️ source/destination IPs crossing app boundary are PII candidates | +| slo | ❌ N/A — SLO belongs at L7 | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | +| release | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | + +Notes: +- **SLO cells**: all N/A. SLO error budgets are computed from L7 application metrics. If an L3 event (BGP hijack, PMTUD black hole) causes an SLO burn, the causal chain surfaces as an L7 error-rate spike first. Investigate via `../incident-forensics.md` 6-dimension localization. +- **Audit cells**: VPC flow logs with `ACCEPT`/`REJECT` actions satisfy SOC2 CC6.6 (network access control) audit evidence requirements. Store in WORM-compatible storage (S3 Object Lock, GCS retention policy, Azure immutable blob) per `../signals/audit.md`. 
+ +- **Privacy cells**: cross-reference `../signals/privacy.md §IP addresses` for the masking decision tree. The threshold for masking versus pseudonymization depends on the data's correlation potential with other identifiers. + +--- + +## 8. Anti-Patterns + +The following are candidates for `../anti-patterns.md §Section H` (L3-specific): + +| Anti-pattern | Risk | Remediation | +|-------------|------|------------| +| Own-ASN BGP hijack left unmonitored | Prefix hijack undetected for hours or days; traffic silently diverted | Deploy BGPalerter or ARTEMIS with MOAS detection; subscribe to Cloudflare Radar alerts for your prefixes | +| Raw IP addresses retained in logs without redaction | GDPR Article 5(1)(c) data minimization violation; 4% global turnover fine risk | Truncate last octet on ingest; use rotating-salt SHA-256 for pseudonymization; see `../signals/privacy.md` | +| RPKI-ROV not configured on advertised prefixes | Rogue origin AS can announce your prefixes without RPKI Invalid signal to downstream validators | Create ROA records at your RIR; enable ROV enforcement on border routers | +| PMTUD black hole left uncorrected | Large TCP transfers stall; health checks pass (small packets), masking the problem | Enable MSS clamping at VPN/tunnel endpoints; allow ICMP Type 3 Code 4 through firewalls | + +--- + +## 9. References + +Internal cross-references: + +- `../standards.md §3` — OTel semconv stability tiers +- `../matrix.md §Layer: L3-network` — full 28-cell coverage map +- `../signals/privacy.md §IP addresses` — IP address masking decision tree +- `../signals/audit.md` — WORM storage and SOC2 audit trail requirements +- `../signals/cost.md §egress attribution` — L3 egress byte cost proxy +- `../transport/udp-statsd-mtu.md` — UDP MTU thresholds (1472/1452/1432/8192/16K) +- `../incident-forensics.md` — 6-dimension localization when L3 event causes L7 SLO burn + +External references: + +- RFC 7854 — BGP Monitoring Protocol (BMP): <https://www.rfc-editor.org/rfc/rfc7854> +- RFC 1191 — Path MTU Discovery (IPv4): <https://www.rfc-editor.org/rfc/rfc1191> +- RFC 8201 — Path MTU Discovery for IPv6: <https://www.rfc-editor.org/rfc/rfc8201> +- RFC 8899 — Datagram PLPMTUD (used by QUIC/SCTP): <https://www.rfc-editor.org/rfc/rfc8899> +- RFC 6811 — BGP Prefix Origin Validation (RPKI-ROV): <https://www.rfc-editor.org/rfc/rfc6811> +- IANA ICMP Type registry: <https://www.iana.org/assignments/icmp-parameters/icmp-parameters.xhtml> +- IANA protocol numbers: <https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml> +- NIST SP 800-189 (RPKI deployment): <https://csrc.nist.gov/publications/detail/sp/800-189/final> +- Cloudflare Radar (public BGP + RPKI): <https://radar.cloudflare.com/routing> +- OTel network semconv registry: <https://opentelemetry.io/docs/specs/semconv/attributes-registry/network/> +- AWS VPC Flow Logs: <https://docs.aws.amazon.com/vpc/latest/userguide/flow-logs.html> +- GCP VPC Flow Logs: <https://cloud.google.com/vpc/docs/flow-logs> +- Azure NSG Flow Logs: <https://learn.microsoft.com/azure/network-watcher/nsg-flow-logs-overview> diff --git a/.agents/skills/oma-observability/resources/layers/L4-transport.md b/.agents/skills/oma-observability/resources/layers/L4-transport.md new file mode 100644 index 0000000..927136e --- /dev/null +++ b/.agents/skills/oma-observability/resources/layers/L4-transport.md @@ -0,0 +1,299 @@ +--- +otel_semconv: "1.27.0 (2024-11)" +--- + +# L4 Transport Layer Observability + +> **DISAMBIGUATION — read before proceeding.** +> +> This file covers 
**observing the OSI Layer 4 (TCP/UDP/QUIC) transport in your application systems** — +> retransmits, RTT, connection lifecycle, eBPF-based socket instrumentation, and QUIC/HTTP3 semantics. +> +> This file is **NOT** about the observability pipeline transport: +> - `../transport/udp-statsd-mtu.md` — StatsD payload sizing over UDP +> - `../transport/otlp-grpc-vs-http.md` — OTLP exporter protocol choice +> - `../transport/collector-topology.md` — OTel Collector deployment topologies +> - `../transport/sampling-recipes.md` — tail/head sampling strategies +> +> The word "transport" is overloaded. When a cross-reference says `transport/`, it refers to the +> observability pipeline. When it says "L4", "TCP/UDP/QUIC", or this filename, it refers to this file. + +--- + +## 1. Scope + +OSI Layer 4 observability for production systems: TCP, UDP, QUIC/HTTP3, eBPF auto-instrumentation, +and connection lifecycle. Out of scope: L5 session (gRPC/WebSocket → L7), L6 full TLS inspection +(use Wireshark/vendor tooling), security-focused eBPF (Cilium Tetragon/Falco → security skill). +See `../standards.md §5` for the authoritative OSI boundary decision. + +--- + +## 2. L4 Semconv (OTel) + +Source: <https://opentelemetry.io/docs/specs/semconv/attributes-registry/network/> + +| Attribute | Values | Stability | Notes | +|-----------|--------|-----------|-------| +| `network.transport` | `tcp`, `udp`, `unix`, `pipe`, `quic` | **Stable** | Set on spans and metrics describing a socket | +| `network.protocol.name` | `http`, `grpc`, `amqp`, … | **Stable** | Application protocol over the transport | +| `network.protocol.version` | `1.1`, `2`, `3` | **Stable** | `3` = HTTP/3 over QUIC | +| `network.peer.address` | IP or hostname | **Stable** | Apply IP-masking before retention — `../signals/privacy.md §IP addresses` | +| `network.connection.state` | `established`, `close_wait`, … | **Development** | Do not build SLOs against this; schema may change | +| `network.connection.type` | `wifi`, `cell`, … | **Development** | Mobile/RUM only; flag in production use | + +`network.connection.*` attributes are Development tier (`../standards.md §3`). Use them in test +environments and tolerate breaking changes before promoting to production dashboards. + +--- + +## 3. 
TCP Observability + +### 3.1 Key Metrics + +| Metric | Type | Measures | Source | +|--------|------|----------|--------| +| `tcp_retransmits_total` | Counter | Retransmit segments; rate spike = congestion or loss | hostmetrics, eBPF | +| `tcp_rtt_ms` (p50/p99) | Histogram | Round-trip time per connection | eBPF socket stats | +| `tcp_active_connections` | Gauge | In-flight established connections; growth = pool leak | `/proc/net/sockstat` | +| `tcp_time_wait_connections` | Gauge | TIME_WAIT count; high under short-lived connection churn | `/proc/net/sockstat` | +| `tcp_syn_queue_drops_total` | Counter | SYN queue overflow; SYN flood or low `tcp_max_syn_backlog` | netlink / eBPF | +| `tcp_close_wait_connections` | Gauge | CLOSE_WAIT growth = server not closing half-open connections | hostmetrics | + +```promql +# Retransmit rate per node +rate(tcp_retransmits_total[5m]) + +# p99 RTT per service +histogram_quantile(0.99, sum by (le, service_name) (rate(tcp_rtt_ms_bucket[5m]))) + +# TIME_WAIT pressure alert +tcp_time_wait_connections > 10000 +``` + +### 3.2 Collection + +| Method | Mechanism | Privilege | +|--------|-----------|-----------| +| OTel hostmetrics receiver | `/proc/net/tcp`, `/proc/net/snmp`, `/proc/net/sockstat` | Root or CAP_SYS_PTRACE | +| eBPF socket filter | kprobes on `tcp_retransmit_skb`, `tcp_sendmsg` | CAP_BPF (kernel ≥ 5.8) or root | +| Linux netlink SOCK_DIAG | TCP socket state enumeration | Unprivileged in most distros | + +### 3.3 Common Pitfalls + +**Connection pool exhaustion.** High `tcp_active_connections` with high application latency but no TCP errors +points to pool queue saturation, not network failure. Cross-ref: `../signals/traces.md §DB connection pool patterns`. + +**SYN cookie activation.** `tcp_syn_queue_drops_total` > 0 in normal traffic means `tcp_max_syn_backlog` +is too low. SYN cookies mask drop counters in `/proc/net/snmp`; use eBPF for reliable counts. + +**TIME_WAIT accumulation.** Short-lived HTTP/1.1 connections exhaust ephemeral ports. Remediation: +`SO_REUSEADDR`, wider `ip_local_port_range`, or HTTP/2 multiplexing. + +--- + +## 4. UDP Observability + +| Metric | Source | Measures | +|--------|--------|----------| +| `udp_datagrams_sent_total` | `/proc/net/snmp` | Baseline send throughput | +| `udp_receive_buffer_errors_total` | `/proc/net/snmp` `RcvbufErrors` | Socket buffer overflow; increase `net.core.rmem_max` | +| `udp_no_ports_total` | `/proc/net/snmp` `NoPorts` | Drop because no listener on destination port | +| `udp_checksum_errors_total` | `/proc/net/snmp` `InCsumErrors` | Corrupted datagrams; NIC or path corruption | + +OTel hostmetrics `network` scraper covers these counters automatically. +Cross-ref: `../transport/udp-statsd-mtu.md` for StatsD payload sizing constraints that affect +`udp_receive_buffer_errors_total` in high-throughput StatsD pipelines. + +--- + +## 5. eBPF-Based L4 Observability + +eBPF instruments the Linux kernel without application code changes. Three tools are in scope. + +### 5.1 Tool Comparison + +| Tool | CNCF Status | Primary signal | OTel output | Key use case | +|------|-------------|----------------|-------------|--------------| +| **Grafana Beyla** | CNCF Incubating (2024) | HTTP/gRPC traces + Go/C++ runtime | OTLP natively | Zero-code HTTP/gRPC auto-instrumentation via uprobes; also emits `tcp_rtt_us` histograms. 
Requires kernel ≥ 5.2 with BTF enabled | +| **Pixie** | CNCF Sandbox | Multi-protocol traces + L4 metrics via PxL | OTLP via `px-export` | Scriptable cluster-wide eBPF; auto-discovers pods | +| **Parca** | CNCF Sandbox | Continuous CPU + off-CPU flame graphs | pprof-compatible | Whole-cluster profiling; cross-ref `../signals/profiles.md` | + +Sources: <https://grafana.com/oss/beyla/>, <https://px.dev/>, <https://parca.dev/> + +**Out of scope: Cilium Tetragon.** Security-focused runtime policy enforcement (syscall filtering, process +exec tracing) belongs to a security skill. Falco is the comparable alternative for kernel security events. + +### 5.2 Kernel and Privilege Requirements + +| Requirement | Detail | +|-------------|--------| +| Minimum kernel | ≥ 4.14 for most eBPF features; ≥ 5.8 for CAP_BPF without root | +| Capabilities | CAP_BPF + CAP_PERFMON (kernel ≥ 5.8) or root on older kernels | +| GKE Autopilot | eBPF DaemonSets disallowed; use Standard node pools | +| EKS Fargate | No DaemonSet support; eBPF unavailable | +| AKS | Supported on standard node pools; verify with `uname -r` | + +```bash +# Preflight kernel check (run in initContainer or node admission) +uname -r # must be >= 4.14, ideally >= 5.8 +capsh --print | grep cap_bpf + +# Beyla deploy via Helm +helm repo add grafana https://grafana.github.io/helm-charts +helm install beyla grafana/beyla --namespace beyla --create-namespace +``` + +### 5.3 Pixie PxL (Conceptual) + +```python +# Conceptual PxL — TCP retransmit counts per pod +import px +df = px.DataFrame(table='tcp_retransmits', start_time='-5m') +df = df.groupby(['pod', 'namespace']).agg(retransmits=('retransmits', px.sum)) +px.display(df) +``` + +--- + +## 6. Connection Lifecycle Observability + +``` +LISTEN → SYN_RECEIVED → ESTABLISHED → (data transfer) + → FIN_WAIT_1 → FIN_WAIT_2 → TIME_WAIT → CLOSED (active close) + → CLOSE_WAIT → LAST_ACK → CLOSED (passive close) +``` + +| State | Observable signal | Tooling | +|-------|------------------|---------| +| SYN → ESTABLISHED | Setup latency (SYN-ACK RTT) | eBPF `tcp_v4_connect` kprobe | +| ESTABLISHED idle | Keepalive probe success/failure | `/proc/net/tcp` + hostmetrics | +| FIN_WAIT / TIME_WAIT | TIME_WAIT gauge; ephemeral port pressure | hostmetrics `/proc/net/sockstat` | +| CLOSE_WAIT growth | Server not releasing half-open connections | hostmetrics + eBPF alert | + +**TLS handshake boundary.** TLS sits at L4/L5. `tls.*` semconv is Development tier — do not build +production SLOs on it. Use eBPF uprobes on OpenSSL/BoringSSL or vendor tooling for handshake latency. +Cross-ref: `../signals/privacy.md §TLS context`. + +**Keepalive tuning.** TCP keepalive interval must be shorter than the upstream LB idle timeout +(AWS ALB: 60 s; AWS NLB: 350 s; GCP CLB: 600 s default). Mismatches cause silent connection resets. + +**WebSocket and gRPC streams** — full lifecycle coverage is deferred to L7 (`../standards.md §5`). +Long-lived streams mitigate TIME_WAIT accumulation but introduce reconnection-storm risk on restart. + +--- + +## 7. QUIC / HTTP/3 Transport Semantics + +QUIC (RFC 9000) is a UDP-based transport with built-in TLS 1.3, stream multiplexing, and connection +migration. HTTP/3 (RFC 9114) runs exclusively over QUIC. 
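+Before walking through the differences, a minimal sketch of recording the negotiated transport on client spans, so the quic-versus-tcp ratio used for fallback detection in §7.4 can be computed from span data (the `negotiated_transport` value is assumed to come from your HTTP client): + +```python +# Tag client spans with the negotiated transport (semconv values per §7.2). +from opentelemetry import trace + +def record_transport(span: trace.Span, negotiated_transport: str) -> None: +    # negotiated_transport: "quic" or "tcp", as reported by the HTTP client +    span.set_attribute("network.transport", negotiated_transport) +    version = "3" if negotiated_transport == "quic" else "1.1" +    span.set_attribute("network.protocol.version", version) +``` 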
+
+### 7.1 Observability Differences from TCP
+
+| TCP concept | QUIC equivalent | Observability impact |
+|-------------|----------------|----------------------|
+| 3-way SYN handshake | 1-RTT or 0-RTT | No SYN/FIN to observe; connection state is QUIC-internal |
+| 5-tuple for correlation | Connection ID (opaque bytes) | Connection ID survives NAT rebinding; log it on establishment |
+| `/proc/net/tcp` retransmit | QUIC loss recovery (packet number gaps) | Not visible in proc; requires QUIC-aware tooling |
+| TCP FIN / RST | HTTP/3 GOAWAY / QUIC CONNECTION_CLOSE frames | Surfaced in Envoy access logs |
+| HTTP/2 HOL blocking | Per-stream flow control; no HOL blocking | Stream errors do not block other streams |
+
+### 7.2 OTel Semconv for QUIC
+
+```
+network.transport: quic
+network.protocol.name: http
+network.protocol.version: "3"
+```
+
+### 7.3 0-RTT Observability
+
+0-RTT sends data before the handshake completes. Emit a span attribute or metric label
+distinguishing 0-RTT from 1-RTT connections to detect replay exposure on non-idempotent endpoints.
+
+### 7.4 Firewall and Enterprise Deployment Considerations
+
+Enterprise networks frequently block outbound UDP 443 and UDP 8443 because legacy firewall rules assume HTTP(S) is TCP-only. QUIC (and therefore HTTP/3) rides on UDP and silently falls back to HTTP/1.1 or HTTP/2 when UDP is blocked — the fallback is invisible to application telemetry unless you instrument it.
+
+Detect QUIC blocking:
+- Emit the `network.transport` attribute on client spans; compare the ratio of `quic` vs `tcp` across deployment zones
+- CDN access logs (Cloudflare, Fastly) expose QUIC negotiation outcome per edge
+- Alert when the `quic` ratio drops below baseline (for example, in an office network where UDP 443 is firewalled)
+
+Action: coordinate with network security to allowlist UDP 443/8443 for validated destinations; document the fallback behavior so regressions do not silently halve transport performance.
+
+### 7.5 Tool Support (2026)
+
+| Tool | QUIC support |
+|------|-------------|
+| Envoy 1.29+ | QUIC upstream and downstream; stats in admin API and access logs |
+| Istio 1.22+ | QUIC upstream via Envoy; mTLS over QUIC experimental — see `layers/mesh.md` |
+| Grafana Beyla | HTTP/3 uprobes in roadmap; verify release notes before QUIC-heavy deployments |
+| Cloudflare QUIC telemetry | <https://blog.cloudflare.com/tag/quic/> — public connection migration + 0-RTT metrics |
+| Google QUIC metrics | <https://research.google> — packet loss recovery timing |
+
+---
+
+## 8. OS-Level Integration
+
+| Source | Signal | Cross-ref |
+|--------|--------|-----------|
+| systemd-journal / syslog | TCP resets, netfilter drops, NIC errors | `../signals/logs.md §OS-level log sources` |
+| cgroup v2 net accounting | Per-container byte/packet counters | `../transport/collector-topology.md §kubernetes` |
+| nf_conntrack | NAT and stateful firewall connection table size | Requires `nf_conntrack` kernel module |
+
+---
+
+## 9. Matrix Cells — L4 Row
+
+From `../matrix.md §L4-transport`:
+
+| Cell | Symbol | Rationale |
+|------|--------|-----------|
+| L4 × cross-application × traces | ⚠️ | TCP is not trace-native; log socket 5-tuple for post-hoc correlation to L7 spans only |
+| L4 × * × profiles | ✅ | eBPF off-CPU profiling via Parca captures kernel socket overhead and network-wait; pprof output |
+| L4 × privacy (all boundaries) | ⚠️ | Source/destination IPs in TCP logs are PII (GDPR Art. 4(1)); apply masking at pipeline ingress |
+| L4 × SLO and L4 × release | ❌ | SLO budgets and release markers are application-layer; TCP health is infra-SLI fallback only |
+
+---
+
+## 10. Anti-Patterns — Candidates for `../anti-patterns.md §Section H`
+
+**H-L4-1: Connection pool observability absent.** Pool queue saturation is invisible in TCP metrics
+alone; neither retransmit rate nor error rate spikes until the pool times out. Instrument pool size,
+wait time, and timeout counters at the application layer. Severity: HIGH.
+
+**H-L4-2: QUIC adoption without HTTP/3 trace tooling validation.** Enabling QUIC without verifying
+that OTel SDKs emit `network.transport: quic` and `network.protocol.version: "3"` creates a
+transport-layer blind spot. Add a canary assertion in staging before production rollout. Severity: HIGH.
+
+**H-L4-3: eBPF agent deployed without kernel/capability preflight.** Beyla or Pixie DaemonSets on
+incompatible kernels fail silently with no operator-visible error. Add an `initContainer` asserting
+kernel version and CAP_BPF presence; fail non-zero if requirements are unmet. Severity: HIGH.
+
+**H-L4-4: WebSocket reconnection storm undetected.** Server restart without graceful WebSocket drain
+causes simultaneous mass-reconnect. TCP SYN counts look like normal new connections. Instrument client
+reconnection counters and alert on rate spikes; implement exponential backoff + jitter. Severity: MEDIUM.
+
+---
+
+## 11. References
+
+- OTel network attributes registry: <https://opentelemetry.io/docs/specs/semconv/attributes-registry/network/>
+- Grafana Beyla: <https://grafana.com/oss/beyla/>
+- Pixie (CNCF Sandbox): <https://px.dev/>
+- Parca (CNCF Sandbox): <https://parca.dev/>
+- QUIC RFC 9000: <https://www.rfc-editor.org/rfc/rfc9000>
+- HTTP/3 RFC 9114: <https://www.rfc-editor.org/rfc/rfc9114>
+- Cloudflare QUIC telemetry: <https://blog.cloudflare.com/tag/quic/>
+- OTel hostmetrics receiver: <https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/hostmetricsreceiver>
+- Linux `/proc/net/tcp` format: <https://www.kernel.org/doc/html/latest/networking/proc_net_tcp.html>
+- Semconv stability tiers: `../standards.md §3`
+- OSI boundary decisions: `../standards.md §5`
+- Coverage matrix (L4 row): `../matrix.md §L4-transport`
+- IP masking rules: `../signals/privacy.md §IP addresses`
+- Profiling (eBPF/Parca): `../signals/profiles.md`
+- OS-level logs: `../signals/logs.md §OS-level log sources`
+- Collector Kubernetes topology: `../transport/collector-topology.md`
+- StatsD UDP sizing: `../transport/udp-statsd-mtu.md`
+- Service mesh + QUIC: `layers/mesh.md`
diff --git a/.agents/skills/oma-observability/resources/layers/L7-application/crash-analytics.md b/.agents/skills/oma-observability/resources/layers/L7-application/crash-analytics.md
new file mode 100644
index 0000000..34a82a6
--- /dev/null
+++ b/.agents/skills/oma-observability/resources/layers/L7-application/crash-analytics.md
@@ -0,0 +1,359 @@
+---
+otel_semconv: "1.27.0 (2024-11)"
+---
+
+# Crash Analytics — L7 Application Layer
+
+## 1. Scope and Why Separate from RUM
+
+Crash analytics is not RUM. The distinction matters for pipeline design, KPI selection, and vendor choice.
+
+| Dimension | RUM | Crash Analytics |
+|-----------|-----|----------------|
+| Session type | Successful sessions (UX measured) | Abnormal terminations (failed sessions) |
+| Primary KPI | Core Web Vitals / INP / startup time | Crash-Free Rate (CFR), ANR Rate |
+| Pipeline | Beacon / SDK event stream → backend | Crash report queued on device → uploaded on restart |
+| Symbolication | N/A (JS source maps for errors only) | dSYM / ProGuard / R8 / Dart symbols required |
+| Offline behavior | Events dropped if offline | Report queued locally; uploaded on next launch |
+
+This file documents the common crash pipeline shared across mobile native, web, and backend uncaught exceptions. Platform-specific symbolication flows are cross-referenced:
+
+- Mobile native detail: `mobile-rum.md §Crash + ANR`
+- Web JS error detail: `web-rum.md §Error Boundary and window.onerror`
+
+Platforms in scope: iOS native, Android native, React Native, Flutter, web (JS), backend uncaught exceptions.
+
+---
+
+## 2. Core Metrics
+
+### Crash-Free Rate (CFR)
+
+CFR is the primary stability KPI for mobile and web applications.
+
+```
+Session CFR = (sessions without crash) / (total sessions)
+User CFR = (users not experiencing any crash) / (total active users)
+```
+
+Targets by application category:
+
+| Category | Minimum Session CFR | Notes |
+|----------|--------------------|----|
+| Consumer mobile | 99.5% | Google Play bad-behavior threshold: < 99.0% triggers review |
+| Enterprise / B2B mobile | 99.7% | Internal SLA baseline |
+| Regulated (fintech, health) | 99.9%+ | Near-zero tolerance; compliance-linked |
+| Web SPA | 99.8%+ | JS errors below `Error Boundary` counted as crashes |
+
+### ANR Rate and Hang Rate
+
+Near-crash experiences that degrade user perception without a fatal signal:
+
+- **Android ANR Rate** = ANRs / sessions. Typical OS triggers, reported via `ApplicationExitInfo`: main-thread input dispatch blocked > 5 s, or a broadcast receiver not completing within 10 s (foreground) / 60 s (background). Google Play flags apps where user-perceived ANRs affect more than 0.47% of daily users.
+- **iOS Hang Rate** = main-thread hangs > 250 ms / sessions. Visible in Xcode Organizer → Hangs and via MetricKit `MXHangDiagnostic` (iOS 14+).
+
+### Supporting Metrics
+
+| Metric | Definition |
+|--------|-----------|
+| Affected Users % | `(users who hit crash X) / (DAU)` — scope of a specific crash group |
+| Time-to-Symbolication | Duration from crash report ingestion to readable stack trace |
+| Crash Volume | Raw crash event count; use with CFR, not instead of it |
+
+---
+
+## 3. Platform-Specific Symbolication Pipelines
+
+Each platform requires a distinct artifact upload strategy. Failure to automate upload results in unreadable stack traces in production.
+
+| Platform | Symbol artifact | Upload trigger | Key complexity |
+|----------|----------------|---------------|----------------|
+| iOS | dSYM bundle (per build UUID) | Xcode archive → CI upload | Bitcode deprecated in Xcode 14; no re-symbolication |
+| Android (JVM) | `mapping.txt` (ProGuard / R8) | Gradle task post-minify | Must match exact build; obfuscation mapping is one-way |
+| Android (NDK) | `.so` files with debug info stripped separately | CI symbol upload | Requires `--build-id` linkage; `.so` from APK lacks debug info |
+| React Native | JS sourcemap + iOS dSYM + Android mapping.txt | Three uploads per release | Hermes bytecode adds an extra sourcemap layer |
+| Flutter | Dart symbol files from `--split-debug-info` | CI upload after `flutter build` | Both iOS dSYM and Android mapping still needed for platform channels |
+| Web | JS sourcemap | CI deploy pipeline | `window.onerror` + `onunhandledrejection`; React/Vue Error Boundary |
+| Backend | N/A (source available) | OTel `exception.*` attributes | Sentry Python/Node/Ruby SDK for crash-like grouping |
+
+### iOS
+
+dSYM (Debug Symbols) is the binary artifact that maps stripped binary addresses to function names and line numbers. Each Xcode build generates a UUID-stamped dSYM. The UUID must match the crash report exactly; re-using a dSYM from a different build produces wrong or partial symbolication.
+
+Bitcode was deprecated in Xcode 14. Prior to Xcode 14, Apple re-compiled bitcode for App Store distribution, requiring vendors (Firebase, Sentry, Datadog) to download the re-compiled dSYM from App Store Connect via the API. Xcode 14+ removes this complexity; the Xcode-generated dSYM is final.
+
+Apple App Store Connect crash API: Firebase Crashlytics, Sentry, and Datadog can pull crash data from App Store Connect for devices where the user has opted into analytics sharing.
+
+### Android
+
+ProGuard / R8 generates `mapping.txt`, which maps obfuscated class/method names back to source. The NDK layer produces native `.so` libraries; the release variant strips debug info into a separate symbol file. Upload both `mapping.txt` and NDK symbol `.so` files for full stack traces covering Java/Kotlin and C++ frames.
+
+Play Console crashes API: available for complementary data, not a replacement for vendor SDK.
+
+### React Native
+
+Three separate symbol artifacts per release:
+
+1. JS bundle sourcemap (Metro bundler output)
+2. iOS dSYM (for native frames)
+3. Android `mapping.txt` (for native frames)
+
+Hermes JS engine compiles JS to bytecode. The sourcemap chain is: Hermes bytecode address → JS sourcemap → original TS/JS source. Some vendors (Sentry, Datadog) have Hermes-aware symbolication; verify vendor documentation before assuming standard sourcemap upload suffices.
+
+### Flutter
+
+`flutter build --split-debug-info=<dir> --obfuscate` produces Dart symbol files. These must be uploaded to the crash vendor's symbol storage. Platform-channel crashes (e.g., method channel calls into Kotlin/Swift) still require iOS dSYM and Android `mapping.txt` in addition to Dart symbols.
+
+### Web
+
+Cross-reference `web-rum.md §Error tracking`. Key patterns:
+- `window.onerror` catches uncaught synchronous errors
+- `window.addEventListener('unhandledrejection', ...)` catches unhandled Promise rejections
+- React `ErrorBoundary`, Vue `app.config.errorHandler`, and Svelte `<svelte:boundary>` (Svelte 5+) prevent full-page crashes and capture component-tree context
+
+### Backend
+
+Backend services rarely produce binary crash reports.
OTel `exception.*` attributes (Stable semconv) capture the exception type, message, and stacktrace on spans. Sentry Python / Node.js / Ruby SDKs provide crash-like aggregation with grouping, release tracking, and breadcrumbs that mirror the mobile crash experience.
+
+---
+
+## 4. Release Tracking Integration
+
+Release tracking is the most commonly omitted step in crash pipelines. Without it, a crash spike cannot be correlated to a specific deploy.
+
+**CI must set `service.version` on every build.** The OTel semconv `service.version` attribute (Stable) is the canonical release identifier. All crash events, spans, and metrics must carry this attribute.
+
+**Symbol upload must be automated per release.** Manual upload is an anti-pattern (see Section 12). Use:
+
+- Sentry CLI: `sentry-cli releases propose-version` + `sentry-cli releases files upload-sourcemaps`
+- Fastlane plugin: `fastlane-plugin-sentry` for iOS/Android
+- Gradle plugin: `io.sentry.android.gradle` for Android automatic upload
+- Firebase: `firebase crashlytics:symbols:upload` for dSYM (Fastlane or Gradle)
+
+**Release marker events** are structured log records written at deploy time (see `../../boundaries/release.md`). Crash vendors use these to draw vertical lines on CFR trend charts, enabling before/after comparison. Flagger and Argo Rollouts canary analysis can use CFR as a promotion/rollback gate signal — cross-reference `../../boundaries/release.md §Canary analysis`.
+
+---
+
+## 5. Vendor Comparison Matrix (as of 2026-Q2)
+
+Cross-reference `../../vendor-categories.md §Category (j): Crash Analytics`.
+
+| Vendor | iOS | Android | Web JS | Backend | OSS/Free | Key differentiator |
+|--------|-----|---------|--------|---------|----------|--------------------|
+| Firebase Crashlytics | Yes | Yes | No | No | Free | Google-owned; zero cost; mobile-only; dSYM auto-upload via Xcode plugin |
+| Sentry | Yes | Yes | Yes | Yes | OSS + SaaS | Full-stack (mobile + web + backend); release tracking; OTel trace correlation |
+| Bugsnag | Yes | Yes | Yes | Yes | Commercial | Stability score; smart grouping noise reduction; breadcrumb API |
+| Embrace | Yes | Yes | No | No | Commercial | ANR/startup crash specialist; session replay; network body capture |
+| Datadog Error Tracking | Yes | Yes | Yes | Yes | Commercial | Native RUM + APM integration; unified with Datadog traces |
+
+**Key differentiators:**
+
+- **Firebase Crashlytics**: best choice when budget is zero and stack is mobile-only. No web or backend support. Google ecosystem (Analytics, BigQuery export).
+- **Sentry**: strongest cross-platform story. The only vendor in the matrix with meaningful OSS options. Release health dashboard is the most mature. OTel `trace_id` in crash context enables backend span lookup.
+- **Bugsnag**: stability metrics and smart grouping reduce alert fatigue. Breadcrumb API is the most flexible for custom event recording.
+- **Embrace**: purpose-built for mobile ANR, startup crash, and user-journey session replay. Not a general-purpose crash tool.
+- **Datadog Error Tracking**: optimal when Datadog APM is already in use. RUM + APM linkage is native; no separate SDK integration.
+
+---
+
+## 6. Crash Pipeline Flow
+
+```
+App crashes → crash report generated
+  (stack trace + breadcrumbs + device context + user context)
+      |
+      v
+Queued locally on device (if offline — see mobile-rum.md §offline queuing)
+      |
+      v
+Uploaded to vendor on next launch or when network available
+      |
+      v
+Vendor-side symbolication
+  (dSYM / ProGuard / sourcemap matched by build UUID / version)
+      |
+      v
+Deduplication by stack fingerprint
+  (vendor groups similar crashes into one issue)
+      |
+      v
+Aggregation: Affected Users %, Event Count, CFR delta
+      |
+      v
+Alert + ticket creation
+  (PagerDuty / OpsGenie for CFR breach; JIRA / Linear for issue triage)
+```
+
+---
+
+## 7. Breadcrumbs and Context
+
+Breadcrumbs are the sequence of events recorded before the crash. They are the primary debugging context beyond the stack trace.
+
+| Category | Examples |
+|----------|---------|
+| User actions | Button taps, navigation events, form submissions |
+| Network calls | HTTP request URL, status code, duration |
+| Log events | Application log statements recorded by SDK |
+| State changes | Authentication events, feature flag changes |
+
+Context attached to every crash report:
+
+| Field | Source | Notes |
+|-------|--------|-------|
+| `device.model` | OS API | Device model identifier |
+| `os.version` | OS API | OS version string |
+| `service.version` | Build config | Release version; must match symbol upload |
+| `screen_resolution` | OS API | Pixel dimensions |
+| `free_memory_mb` | OS API | Available RAM at crash time |
+| `user.id` | App session | Hash or opaque ID only — see Section 8 |
+
+Custom keys (e.g., `order.id`, `tenant.id`, `experiment.variant`) enable correlation with business context. Restrict to non-PII values or apply redaction before send.
+
+---
+
+## 8. Privacy and PII in Crash Reports
+
+Stack traces frequently contain PII because exception messages capture method arguments, query strings, and HTTP headers verbatim.
+
+**Common PII leaks in crash reports:**
+
+- SQL query in exception message containing `user.email` or SSN
+- HTTP response body logged in a network error containing `Authorization: Bearer <token>`
+- URL query parameters with `?email=user@example.com`
+- Custom key set with `user.email` directly
+
+**Vendor SDK redaction hooks:**
+
+Sentry `beforeSend` callback (JavaScript/TypeScript example):
+
+```typescript
+Sentry.init({
+  dsn: "...",
+  beforeSend(event) {
+    // Strip email from exception values
+    if (event.exception?.values) {
+      event.exception.values = event.exception.values.map((ex) => ({
+        ...ex,
+        value: ex.value?.replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, "[email]"),
+      }));
+    }
+    // Remove auth headers from request context
+    if (event.request?.headers) {
+      delete event.request.headers["Authorization"];
+      delete event.request.headers["Cookie"];
+    }
+    return event;
+  },
+});
+```
+
+- **Firebase Crashlytics**: use `setCustomKey` with an explicit allowlist; never pass `user.email` or tokens as custom keys.
+- **Datadog**: `beforeSend` callback mirrors Sentry's pattern; apply to both RUM and Error Tracking initializations.
+
+Cross-reference: `../../signals/privacy.md §Crash report redaction` and `../../anti-patterns.md §Section A Privacy`.
+
+---
+
+## 9. CI Integration for Symbol Upload
+
+Symbols must never be stored in git. Upload to the vendor on every CI build that produces a release artifact.
+
+Conceptual GitHub Actions workflow (adapt per vendor):
+
+```yaml
+# .github/workflows/release.yml (conceptual — adapt version and paths)
+- name: Build iOS release
+  run: xcodebuild archive -scheme MyApp -archivePath build/MyApp.xcarchive
+
+- name: Upload dSYM to Sentry
+  env:
+    SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
+    SENTRY_ORG: my-org
+    SENTRY_PROJECT: my-app-ios
+  run: |
+    sentry-cli releases propose-version
+    sentry-cli upload-dif build/MyApp.xcarchive/dSYMs \
+      --include-sources
+
+- name: Tag release with service.version
+  run: |
+    sentry-cli releases new "${GITHUB_REF_NAME}"
+    sentry-cli releases set-commits "${GITHUB_REF_NAME}" --auto
+    sentry-cli releases finalize "${GITHUB_REF_NAME}"
+```
+
+For Android, replace the dSYM upload step with a Gradle task invocation (`./gradlew :app:uploadSentryProguardMappings`) or `sentry-cli upload-proguard --android`.
+
+Cross-reference `../../observability-as-code.md` for observability-as-code patterns and the repository structure for symbol upload configuration.
+
+---
+
+## 10. ANR and Hang Detection
+
+ANR and hangs are near-crash events that degrade UX without generating a fatal crash report. They require distinct detection and alerting strategies.
+
+### Android ANR
+
+- Definition: main thread unresponsive for more than 5 seconds during input dispatch or 10 seconds during a foreground broadcast.
+- Detection sources:
+  - `ApplicationExitInfo` API (Android 11+): query ANR reasons from the OS after the fact.
+  - `ANRWatchDog` library: third-party watchdog thread that detects main-thread stalls in-process.
+  - Sentry, Embrace, Datadog: vendor SDK ANR detection via watchdog thread.
+- Alert threshold: Google Play considers a user-perceived ANR rate >= 0.47% of daily users as bad behavior, which can suppress the app in search results or trigger a Play Console warning.
+
+### iOS Hang
+
+- Definition: main thread blocked for more than 250 ms (Xcode Organizer threshold).
+- Detection sources:
+  - Xcode Organizer → Hangs report (aggregated from devices with diagnostic sharing enabled).
+  - Sentry App Hang Detection SDK: configurable threshold, default 2 seconds; reports hangs as separate issue type.
+  - MetricKit: `MXHangDiagnostic` (iOS 14+) for on-device hang diagnostics.
+- Alert threshold: no Google Play equivalent; apply product-specific target (e.g., < 0.1% of sessions for consumer apps).
+
+---
+
+## 11. Matrix Cross-References (L7 Row)
+
+These cells from `../../matrix.md` are directly informed by this file:
+
+| Matrix cell | Artifact | Note |
+|-------------|---------|------|
+| L7 × multi-tenant × logs | Per-tenant crash rate segmentation | `tenant.id` as custom key in crash context |
+| L7 × release × traces | Release-tagged crash correlates canary rollback | `service.version` on crash event + span `trace_id` linkage |
+| L7 × privacy × logs | Crash report redaction before send | `beforeSend` hook; allowlist custom keys |
+| L7 × cross-application × traces | Backend `trace_id` in crash context | Sentry / Datadog: attach active `trace_id` to crash report for backend span lookup |
+
+---
+
+## 12. Anti-Patterns
+
+Candidates for `../../anti-patterns.md §Section G — Crash Analytics`:
+
+| Anti-pattern | Impact | Remediation |
+|-------------|--------|------------|
+| Crash report contains `user.email`, auth tokens, or card numbers without redaction filter | GDPR / PIPA breach; PII in vendor SaaS storage | Implement `beforeSend` allowlist; cross-ref Section 8 |
+| Symbol upload not automated in CI | Stack traces unreadable in production | Add symbol upload step to release pipeline; cross-ref Section 9 |
+| No release marker → cannot correlate deploy to crash spike | Crash spike investigation requires manual git bisect | Set `service.version` on every build; emit release event at deploy |
+| ANR rate unmonitored on Android | Google Play app suppression; degraded store ranking | Add ANR rate metric; alert at 0.47% of daily users |
+| Single aggregate CFR target ignoring device-tier or OS-version distribution | p10 device users (low-end hardware) masked by p90 average | Segment CFR by `device.model`, `os.version`, and network type |
+| dSYM stored in git LFS instead of vendor symbol storage | Git LFS cost; symbol-build UUID drift | Use vendor upload; never commit dSYMs |
+
+---
+
+## References
+
+### Primary sources
+
+- Firebase Crashlytics: <https://firebase.google.com/docs/crashlytics>
+- Sentry iOS: <https://docs.sentry.io/platforms/apple/>
+- Sentry Android: <https://docs.sentry.io/platforms/android/>
+- Sentry React Native: <https://docs.sentry.io/platforms/react-native/>
+- Sentry Flutter: <https://docs.sentry.io/platforms/flutter/>
+- Xcode Organizer Hangs: <https://developer.apple.com/documentation/xcode/understanding-user-interface-responsiveness>
+- Android ANR: <https://developer.android.com/topic/performance/vitals/anr>
+- Android ApplicationExitInfo: <https://developer.android.com/reference/android/app/ApplicationExitInfo>
+- OTel `exception.*` semconv (Stable): <https://opentelemetry.io/docs/specs/semconv/exceptions/>
diff --git a/.agents/skills/oma-observability/resources/layers/L7-application/mobile-rum.md b/.agents/skills/oma-observability/resources/layers/L7-application/mobile-rum.md
new file mode 100644
index 0000000..8473f36
--- /dev/null
+++ b/.agents/skills/oma-observability/resources/layers/L7-application/mobile-rum.md
@@ -0,0 +1,283 @@
+---
+otel_spec: "1.x (stable API/SDK)"
+otel_semconv: "1.27.0 (2024-11)"
+notes:
+  - "OTel mobile semconv still experimental; vendor-native SDKs dominate in 2026"
+---
+
+# Mobile RUM — L7 Application Layer
+
+---
+
+## 1. Scope
+
+**In scope:** Native mobile app Real User Monitoring for iOS (Swift/Obj-C), Android (Kotlin/Java), React Native, and Flutter. Covers performance, errors, app lifecycle events, network egress observability, offline telemetry queuing, and battery impact.
+
+**Out of scope (see sibling files):**
+- Mobile crash analytics specifics (symbolication pipeline, Crash-Free Rate computation, dSYM/ProGuard upload) → `crash-analytics.md`. This file references symbolication patterns but does not own the full pipeline.
+- Mobile web RUM running inside an in-app WebView → `web-rum.md`. WebView telemetry is browser-runtime based and follows Web RUM patterns, not native SDK patterns.
+
+---
+
+## 2. OTel Mobile SDK Status (as of 2026-Q2)
+
+OpenTelemetry mobile SDKs are less mature than backend or web SDKs. The Android and iOS SDKs are both experimental:
+
+| Platform | OTel SDK | Status |
+|----------|----------|--------|
+| Android | `opentelemetry-android` + `opentelemetry-java-instrumentation` | Experimental |
+| iOS | `opentelemetry-swift` | Experimental |
+| React Native | No official OTel SDK | Vendor SDK only |
+| Flutter | No official OTel SDK | Vendor SDK only |
+
+In practice, mobile teams use **vendor-native SDKs** and map signals to OTel concepts where possible. This file uses vendor SDKs as the primary path and notes where OTel semconv applies.
+
+---
+
+## 3. Offline-First Telemetry Queuing
+
+Offline queuing is the defining mobile-specific concern. Mobile networks are intermittent, metered, roaming-constrained, and subject to airplane mode. A telemetry SDK that drops events on network failure is unsuitable for mobile.
+
+### Queue design requirements
+
+| Constraint | Recommended value | Rationale |
+|------------|------------------|-----------|
+| Disk storage cap | 1–5 MB | Balance completeness vs. user storage impact |
+| Event TTL | 24–72 hours | Drop stale events before they mislead dashboards |
+| Duplicate suppression | Idempotency key per event | Retry on reconnect must not produce duplicates |
+| Backoff strategy | Exponential + jitter | Avoid thundering herd on reconnect |
+| Send trigger | Network reachability change + foreground transition | Align with OS connectivity APIs |
+
+### Quality considerations
+
+- **Event ordering:** Queue flush order may differ from emit order. Attach a local device timestamp (`observed_time_unix_nano`) to every event. Server must tolerate out-of-order arrivals and sort by device timestamp for waterfall reconstruction.
+- **Clock skew:** Device clocks may drift relative to server. Cross-reference `../../meta-observability.md §Clock skew` for handling. Do not rely on server receipt time as the canonical event timestamp.
+- **Server tolerance:** OTLP backends (and vendor equivalents) are designed for out-of-order ingestion. Verify TTL settings on the collector pipeline so stale events are dropped before indexing.
+
+### Privacy: queued data on disk
+
+Events queued to disk are **PII at rest on user devices**. Requirements:
+- Encrypt the queue using platform-native key storage: iOS Keychain, Android Keystore.
+- Apply the same field-level redaction to queued events as to events sent immediately (mask email, card number, password fields before write, not before send).
+- Anti-pattern: unencrypted telemetry queue on device — see Section 12.
+
+---
+
+## 4. App Lifecycle Events
+
+App lifecycle events anchor session boundaries and correlate performance observations to foreground/background transitions. Each platform exposes different lifecycle hooks; standardize on a common semconv attribute set.
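+
+As a concrete illustration of that standardization, a minimal sketch using the React Native hook from the mapping table below; `rumSdk` is a hypothetical vendor-SDK handle, not a real API (there is no official OTel React Native SDK):
+
+```typescript
+// Normalize React Native AppState transitions onto the custom
+// app.lifecycle.event attribute set defined in this section.
+import { AppState, type AppStateStatus } from 'react-native';
+
+// Hypothetical vendor-SDK surface — substitute your RUM SDK's event API.
+declare const rumSdk: {
+  addEvent(name: string, attributes: Record<string, string>): void;
+};
+
+AppState.addEventListener('change', (state: AppStateStatus) => {
+  if (state === 'active') {
+    rumSdk.addEvent('app.lifecycle', { 'app.lifecycle.event': 'foreground' });
+  } else if (state === 'background') {
+    rumSdk.addEvent('app.lifecycle', { 'app.lifecycle.event': 'background' });
+  }
+});
+```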
+
+### Lifecycle hook mapping
+
+| Event | iOS | Android | React Native | Flutter |
+|-------|-----|---------|--------------|---------|
+| App becomes active / foreground | `applicationDidBecomeActive` | `Activity.onResume` / `ProcessLifecycleOwner.ON_START` | `AppState → 'active'` | `AppLifecycleState.resumed` |
+| App enters background | `applicationDidEnterBackground` | `Activity.onPause` / `ProcessLifecycleOwner.ON_STOP` | `AppState → 'background'` | `AppLifecycleState.paused` |
+| App will terminate | `applicationWillTerminate` | `Activity.onDestroy` | (no guaranteed callback) | `AppLifecycleState.detached` |
+| Memory pressure | `didReceiveMemoryWarning` | `onTrimMemory(level)` | N/A (JS heap managed) | N/A |
+
+### Semconv (experimental)
+
+OTel mobile semconv is in progress as of 2026-Q2. Use custom attributes pending stabilization:
+
+```
+app.lifecycle.event = "foreground" | "background" | "terminate" | "memory_warning"
+app.session.id = <UUID generated at foreground entry, reset on cold start>
+device.model.name = <product model string>
+os.name = "ios" | "android"
+os.version = <platform version string>
+```
+
+Swift one-liner (OTel span on foreground):
+```swift
+tracer.spanBuilder(spanName: "app.lifecycle").setSpanKind(spanKind: .internal)
+    .setAttribute(key: "app.lifecycle.event", value: "foreground").startSpan().end()
+```
+
+Kotlin one-liner (OTel span on resume):
+```kotlin
+tracer.spanBuilder("app.lifecycle").setSpanKind(SpanKind.INTERNAL)
+    .setAttribute("app.lifecycle.event", "foreground").startSpan().end()
+```
+
+---
+
+## 5. Battery Impact of RUM SDK
+
+RUM SDKs are frequently among the largest third-party battery consumers on a mobile device, due to background network pings, location access, and sensor polling.
+
+### Mitigation strategies
+
+| Technique | iOS mechanism | Android mechanism |
+|-----------|--------------|-------------------|
+| Batch sends | `BGAppRefreshTask` (15-min minimum OS interval) | `WorkManager` with network constraint |
+| Defer on low battery | `ProcessInfo.isLowPowerModeEnabled` | `BatteryManager.BATTERY_STATUS_DISCHARGING` + `PowerManager.isPowerSaveMode` |
+| Skip cellular sends (optional) | `NWPathMonitor` check for `.cellular` interface | `ConnectivityManager.getNetworkCapabilities` → `NET_CAPABILITY_NOT_METERED` |
+| Respect background execution limits | `BGAppRefreshTask` registration | Doze mode + App Standby bucket awareness |
+| Sampling reduction on low signal | Custom rule in SDK config | Custom rule in SDK config |
+
+### Measurement tools
+
+- iOS: Xcode Energy Organizer + Instruments Energy Log
+- Android: Battery Historian (<https://developer.android.com/topic/performance/power/battery-historian>)
+
+### Carbon operations consideration
+
+Mobile device CPU and network activity contribute to device-level energy consumption. For green observability practices, minimize telemetry flush frequency and batch size. Server-side carbon impact is tracked via Kepler (CNCF Sandbox) using CPU and network metrics on the collector nodes — mobile-side impact is the device CPU time and radio-on duration during flush.
+
+---
+
+## 6. Network Egress Observability
+
+| Concern | Guidance |
+|---------|---------|
+| Metered data awareness | Detect cellular vs. Wi-Fi; warn or throttle SDK sends on cellular by default (opt-in to always-on) |
+| Request coalescing | Combine multiple queued spans into a single OTLP export request per flush cycle |
+| Certificate pinning | Implement mTLS or certificate pinning for telemetry endpoints beyond mesh-level TLS; use platform SecTrustEvaluate / TrustManager accordingly |
+| PII leakage via URL | Strip query parameters containing tokens, emails, or IDs before recording `http.url` in spans |
+
+Network request spans follow OTel HTTP client semconv (`http.request.method`, `server.address`, `http.response.status_code`, `network.protocol.name`). These are Stable semconv as of 1.27.0.
+
+---
+
+## 7. iOS vs Android SDK Comparison (as of 2026-Q2)
+
+### iOS SDKs
+
+| SDK | Coverage | Platforms | Notes |
+|-----|----------|-----------|-------|
+| Sentry Cocoa | Errors, performance, session replay, profiling | iOS, macOS, tvOS, watchOS, visionOS | Largest platform breadth; auto-instrumentation for URLSession |
+| Datadog iOS RUM | RUM, traces, logs, error tracking | iOS, tvOS | Strong OTel bridge; custom view timing |
+| Firebase Crashlytics | Crashes, breadcrumbs | iOS, macOS | Free; Google ecosystem; no RUM performance. ANR is Android-only; iOS Hang metric is a separate concept handled by MetricKit / Embrace |
+| Embrace | Startup, Hang (iOS) / ANR (Android), network spans, session replay | iOS | Startup-time focus; per-session timeline view |
+| `opentelemetry-swift` | Traces, metrics (experimental) | iOS, macOS | OTel-native; no auto-instrumentation yet |
+
+### Android SDKs
+
+| SDK | Coverage | Notes |
+|-----|----------|-------|
+| Sentry Android | Errors, performance, ANR, profiling, session replay | Auto-instrumentation via Gradle plugin; OkHttp + Retrofit |
+| Datadog Android RUM | RUM, traces, logs, error tracking | Kotlin-first; OTel exporter available |
+| Firebase Crashlytics | Crashes, ANR, breadcrumbs | Free; no RUM performance |
+| Embrace | Startup, ANR, network, session replay | App Standby bucket reporting |
+| `opentelemetry-android` | Traces, metrics (experimental) | OTel-native; auto-instrumentation via Bytecode plugin (experimental) |
+
+Vendor category cross-reference: `../../vendor-categories.md §Crash Analytics` for full category taxonomy and how-to-choose criteria.
+
+---
+
+## 8. React Native and Flutter
+
+### React Native
+
+React Native spans both the JS bridge and native layers. A single SDK must handle JS errors, native crashes, and the bridge overhead.
+
+| SDK | Coverage |
+|-----|----------|
+| Sentry React Native | JS errors, native crashes, performance, session replay. Source map upload + native dSYM (iOS) / ProGuard (Android) combined via `sentry-cli`. |
+| Datadog RN SDK | RUM, traces, logs. OTel bridge for trace propagation. |
+
+Symbolication: JS source maps + native symbols must both be uploaded per release build. See `crash-analytics.md` for the full symbolication pipeline.
+
+### Flutter
+
+Flutter compiles to native ARM; the Dart AOT toolchain's `--split-debug-info` flag separates debug symbols for upload.
+
+| SDK | Coverage |
+|-----|----------|
+| Sentry Flutter | Dart errors, native crashes, performance. `--split-debug-info` + source map + native symbol upload via `sentry-dart-plugin`. |
+| Firebase Crashlytics Flutter plugin | Dart and native crashes. Free; no RUM performance metrics. |
+
+---
+
+## 9. W3C Trace Context Propagation: Mobile to Backend
+
+Mobile apps are the root of distributed traces that span multiple backend services. Injecting `traceparent` on outbound requests enables end-to-end correlation.
+
+**Propagation flow:**
+
+```
+Mobile App (root span, SpanKind.CLIENT)
+   |-- traceparent header injected on HTTP/gRPC call
+   v
+Backend API Gateway (continues trace, creates child spans)
+   |-- downstream services inherit trace context
+   v
+Database / Queue / External API
+```
+
+**Implementation rules:**
+
+1. The mobile SDK intercepts outbound HTTP requests and injects `traceparent` (and optionally `tracestate`) automatically. Configure the propagator in SDK init, not at call sites.
+2. The `sampled` flag in `traceparent` (trace-flags) carries the mobile sampling decision; `tracestate` can add vendor-specific sampling detail. Backend tail samplers MUST respect the `sampled` flag to avoid dropping traces that the mobile side intends to keep.
+3. The mobile span is `SpanKind.CLIENT`. The backend root span receiving the call is `SpanKind.SERVER`. This pairing is required by OTel span kind semantics.
+4. **Session span challenge:** A mobile user session may span minutes or hours with many backend requests. Each HTTP call starts a new child span under the session root span. Store the session root span context in memory (not disk) to avoid stale context on OS-kill and restart.
+5. Cross-reference: `../../boundaries/cross-application.md` for the full propagator matrix across service, domain, and vendor boundaries.
+
+W3C Trace Context reference: `../../standards.md §W3C Trace Context`.
+
+---
+
+## 10. Performance Metrics (Mobile-Native)
+
+| Metric | Definition | Target |
+|--------|-----------|--------|
+| Cold start time | Process launch to first interactive frame | < 2 s (iOS Instruments: `os_signpost`; Android: `reportFullyDrawn`) |
+| Warm start time | App resumed from background to interactive | < 1 s |
+| Hot start time | Activity recreated (Android) | < 500 ms |
+| Frame rendering | Frame drop rate (jank) | iOS ≥ 60 fps (`CADisplayLink`); Android `FrameMetricsAggregator` Jank < 0.1% |
+| ANR rate | Application Not Responding (main thread blocked > 5 s) | Android-specific; < 0.47% (Play Console threshold) |
+| iOS Hang rate | Non-responsive main thread > 250 ms | < 0.1% (MetricKit `MXHangDiagnostic`) |
+| Network latency | Per-request round-trip time | Monitor p50/p95/p99 histograms; no universal target |
+| Crash-Free Session Rate | Sessions without a crash / total sessions | Cross-reference `crash-analytics.md §CFR` |
+
+Semconv for mobile startup spans: use `app.lifecycle.event = "cold_start"` with `app.startup.duration_ms` as a custom span attribute until OTel mobile semconv stabilizes.
+
+---
+
+## 11. Matrix Cells (L7 Row, Mobile Slice)
+
+These cells supplement the L7 rows in `../../matrix.md` with mobile-specific caveats:
+
+| Cell | Symbol | Detail |
+|------|--------|--------|
+| L7 × multi-tenant × metrics | ⚠️ | `tenant.id` on mobile is typically derived from the authenticated user ID, not a direct tenant header. Map `user.id → tenant.id` server-side in the pipeline; avoid attaching `tenant.id` as a metric label on-device (cardinality risk). |
+| L7 × cross-application × traces | ✅ | Mobile → backend `traceparent` propagation; mobile span is the root CLIENT span. See Section 9. |
+| L7 × slo × metrics | ✅ | Mobile-side SLIs: cold start time p95, ANR rate, Hang rate. Define SLO targets in OpenSLO YAML referencing mobile metrics ingested via OTLP. Cross-reference `../../boundaries/slo.md`. |
+| L7 × privacy × logs | ✅ | On-device queued events contain PII at rest. Encryption via Keychain/Keystore required. Field-level masking of card numbers, emails, and passwords before write. Cross-reference `../../signals/privacy.md`. |
+
+---
+
+## 12. Anti-Patterns (Candidates for `../../anti-patterns.md §Section G`)
+
+| Anti-pattern | Risk | Fix |
+|-------------|------|-----|
+| Telemetry queue stored unencrypted on device | PII at rest exposed if device is lost or compromised | Encrypt queue using iOS Keychain / Android Keystore before writing |
+| Heavy RUM SDK flush on every event without batching | Battery drain → user complaints → app uninstall | Batch flushes; align with `BGAppRefreshTask` / `WorkManager` cadence |
+| No event TTL on queue | Stale events (hours or days old) arrive at backend; mislead dashboards | Set TTL (24–72 h); drop on exceed before send |
+| Missing `traceparent` injection on outbound HTTP | Client-server correlation broken; mobile sessions invisible in backend traces | Configure SDK HTTP interceptor at init; verify with network proxy tool |
+| Session replay without PII field masking | Card numbers, emails, passwords captured in replay | Enable SDK masking rules on input fields; test with automated replay review |
+| Using server receipt time as event timestamp | Clock skew between device and server corrupts waterfall ordering | Store `observed_time_unix_nano` (device local time) on every event at emit time |
+
+---
+
+## References
+
+- OTel Swift SDK: <https://opentelemetry.io/docs/languages/swift/>
+- OTel Android SDK: <https://github.com/open-telemetry/opentelemetry-android>
+- Sentry Cocoa: <https://docs.sentry.io/platforms/apple/>
+- Sentry Android: <https://docs.sentry.io/platforms/android/>
+- Datadog iOS RUM: <https://docs.datadoghq.com/real_user_monitoring/ios/>
+- Datadog Android RUM: <https://docs.datadoghq.com/real_user_monitoring/android/>
+- Firebase Crashlytics: <https://firebase.google.com/docs/crashlytics>
+- iOS Background Tasks: <https://developer.apple.com/documentation/backgroundtasks>
+- Android App Standby / Doze: <https://developer.android.com/topic/performance/appstandby>
+- W3C Trace Context L1: <https://www.w3.org/TR/trace-context/>
+- Battery Historian: <https://developer.android.com/topic/performance/power/battery-historian>
diff --git a/.agents/skills/oma-observability/resources/layers/L7-application/web-rum.md b/.agents/skills/oma-observability/resources/layers/L7-application/web-rum.md
new file mode 100644
index 0000000..0f358cc
--- /dev/null
+++ b/.agents/skills/oma-observability/resources/layers/L7-application/web-rum.md
@@ -0,0 +1,350 @@
+---
+otel_spec: "1.x (stable API/SDK)"
+otel_semconv: "1.27.0 (2024-11)"
+tools:
+  - "@opentelemetry/sdk-trace-web: 1.x (stable); browser instrumentations: some experimental"
+notes:
+  - "web-vitals JS library: 4.x (2024); INP replaced FID: March 2024"
+---
+
+# Web RUM — Real User Monitoring
+
+## 1. Scope
+
+This file covers browser-side Real User Monitoring (RUM) for web applications.
+
+**In scope:** performance signals (Core Web Vitals, Navigation Timing, custom marks), error tracking (`window.onerror`, unhandled promise rejections, source map correlation), user interaction (click, navigation, long tasks), 3rd-party script impact and CSP violation reporting, synthetic monitoring (scheduled browser/API probes), client-to-server trace correlation via W3C `traceparent`, session replay (vendor overview).
+
+**Out of scope:** mobile RUM (see `mobile-rum.md`); native app crash analytics (see `crash-analytics.md`).
+
+---
+
+## 2. Core Web Vitals (2024 Update)
+
+Google updated the Core Web Vitals (CWV) set in March 2024. **INP (Interaction to Next Paint) replaced FID (First Input Delay)** as the official responsiveness metric. FID is deprecated; do not report it in new dashboards or SLOs.
+
+Reference implementation: `web-vitals` JS library (`GoogleChrome/web-vitals`, v4.x).
+
+### Official CWV Targets
+
+| Metric | Full name | Dimension | Good | Needs improvement | Poor |
+|--------|-----------|-----------|------|-------------------|------|
+| **LCP** | Largest Contentful Paint | Loading performance | ≤ 2.5 s | 2.5 s – 4.0 s | > 4.0 s |
+| **INP** | Interaction to Next Paint | Responsiveness (replaces FID) | ≤ 200 ms | 200 ms – 500 ms | > 500 ms |
+| **CLS** | Cumulative Layout Shift | Visual stability | ≤ 0.1 | 0.1 – 0.25 | > 0.25 |
+
+### Additional Operational Signals (not official CWV)
+
+| Metric | Meaning | Typical target |
+|--------|---------|---------------|
+| TTFB | Time to First Byte — server + network latency | ≤ 800 ms |
+| FCP | First Contentful Paint — when first content is painted | ≤ 1.8 s |
+| TTI | Time to Interactive — main thread unblocked | ≤ 3.8 s (3G) |
+
+Use `web-vitals` to collect all signals with a uniform API:
+
+```js
+import { onLCP, onINP, onCLS, onFCP, onTTFB } from 'web-vitals';
+
+function sendToAnalytics({ name, value, rating, id }) {
+  // forward to your RUM backend or OTel OTLP endpoint
+  navigator.sendBeacon('/rum', JSON.stringify({ name, value, rating, id }));
+}
+
+onLCP(sendToAnalytics);
+onINP(sendToAnalytics);
+onCLS(sendToAnalytics);
+onFCP(sendToAnalytics);
+onTTFB(sendToAnalytics);
+```
+
+SLI recommendation: use **p75** for each CWV metric as the SLI value (aligns with Google Search Console scoring). Cross-reference: `../../boundaries/slo.md` for SLO burn-rate alert configuration.
+
+---
+
+## 3. Browser OpenTelemetry SDK
+
+The OTel JavaScript SDK provides browser instrumentation for traces and metrics. Browser log support is still in progress (no stable browser LogRecord exporter as of semconv 1.27.0).
+
+**Stability note:** `@opentelemetry/sdk-trace-web` and `@opentelemetry/sdk-metrics` are stable. Some browser instrumentation packages (e.g., Core Web Vitals plugin for OTel) remain experimental. Do not build production SLOs on experimental instrumentations without a fallback to `web-vitals` directly.
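+
+If the experimental CWV instrumentation is not an option, one fallback sketch is to bridge `web-vitals` callbacks into OTel histograms through the stable metrics API. This assumes a `MeterProvider` is registered elsewhere in the app; the metric names below are illustrative, not semconv:
+
+```typescript
+// Bridge the stable web-vitals library into OTel metrics, bypassing the
+// experimental CWV instrumentation package.
+import { metrics } from '@opentelemetry/api';
+import { onLCP, onINP, onCLS, type Metric } from 'web-vitals';
+
+const meter = metrics.getMeter('web-rum');
+const histograms = {
+  LCP: meter.createHistogram('web_vitals.lcp', { unit: 'ms' }),
+  INP: meter.createHistogram('web_vitals.inp', { unit: 'ms' }),
+  CLS: meter.createHistogram('web_vitals.cls', { unit: '1' }), // unitless score
+};
+
+function record(metric: Metric): void {
+  histograms[metric.name as keyof typeof histograms]?.record(metric.value, {
+    rating: metric.rating, // "good" | "needs-improvement" | "poor"
+  });
+}
+
+onLCP(record);
+onINP(record);
+onCLS(record);
+```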
+ +### Typical Instrumentations + +| Package | What it instruments | Status | +|---------|-------------------|--------| +| `@opentelemetry/instrumentation-document-load` | Navigation timing, resource timing | Stable | +| `@opentelemetry/instrumentation-user-interaction` | Click events → spans | Stable | +| `@opentelemetry/instrumentation-fetch` | `fetch()` calls + `traceparent` injection | Stable | +| `@opentelemetry/instrumentation-xml-http-request` | `XMLHttpRequest` + header injection | Stable | + +### Minimal SDK Initialization + +```js +import { WebTracerProvider } from '@opentelemetry/sdk-trace-web'; +import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; +import { BatchSpanProcessor } from '@opentelemetry/sdk-trace-base'; +import { registerInstrumentations } from '@opentelemetry/instrumentation'; +import { DocumentLoadInstrumentation } from '@opentelemetry/instrumentation-document-load'; +import { UserInteractionInstrumentation } from '@opentelemetry/instrumentation-user-interaction'; +import { FetchInstrumentation } from '@opentelemetry/instrumentation-fetch'; +import { W3CTraceContextPropagator } from '@opentelemetry/core'; +import { CompositePropagator } from '@opentelemetry/core'; +import { Resource } from '@opentelemetry/resources'; + +const provider = new WebTracerProvider({ + resource: new Resource({ + 'service.name': 'my-web-app', + 'service.version': import.meta.env.VITE_RELEASE_VERSION, // CI-injected + 'deployment.environment': import.meta.env.MODE, + }), +}); + +provider.addSpanProcessor( + new BatchSpanProcessor( + new OTLPTraceExporter({ url: 'https://otel-collector.example.com/v1/traces' }) + ) +); + +provider.register({ + propagator: new CompositePropagator({ + propagators: [new W3CTraceContextPropagator()], + }), +}); + +registerInstrumentations({ + instrumentations: [ + new DocumentLoadInstrumentation(), + new UserInteractionInstrumentation(), + new FetchInstrumentation({ + // Allow traceparent injection only to trusted origins (see §5) + propagateTraceHeaderCorsUrls: [/https:\/\/api\.example\.com/], + }), + ], +}); +``` + +**Session and view identifier strategy:** Generate a `session.id` UUID on tab open (stored in `sessionStorage`) and a `view.id` UUID on each route change. Attach both as span attributes. This enables session-level RUM aggregation without a vendor SDK. + +--- + +## 4. Vendor Options + +Vendors below are examples as of 2026-Q2. This is not a registry — verify currency at `https://landscape.cncf.io` and `../../vendor-categories.md §RUM category`. + +| Vendor | Bundle size | Session replay | OTel-based | Full-stack correlation | Notes | +|--------|------------|---------------|-----------|----------------------|-------| +| **Sentry Browser** | Medium | Yes (opt-in) | Partial | Strong | Error-first; release tracking; delegation: `getsentry/sentry-sdk-setup` | +| **Grafana Faro** | Lightweight | No | Yes | Grafana stack | OSS; emits OTel-compatible signals; pairs with LGTM+ | +| **Datadog RUM** | Heavy | Yes | Partial | APM link | Strongest APM-to-RUM pivot; `allowedTracingUrls` CORS config | +| **Bugsnag** | Light | No | No | Partial | Stability-focused; good grouping; SmartBear owned | + +For vendor-specific setup, delegate to the vendor skill listed or use `oma-search --docs "{vendor} browser RUM setup"`. + +--- + +## 5. Client-to-Server Error Correlation + +**Problem:** A server-side 5xx error causes the browser to retry. Without shared trace context, each retry appears as an independent request in backend traces. 
The root cause (one bad deploy) generates a cascade that looks like independent failures. + +**Solution:** emit the same `trace_id` on both sides using W3C `traceparent`. + +The `FetchInstrumentation` (OTel JS) and Datadog RUM inject `traceparent` automatically into outbound requests — but only for origins listed in the CORS allowlist configuration. + +```js +// OTel JS SDK — propagateTraceHeaderCorsUrls +new FetchInstrumentation({ + propagateTraceHeaderCorsUrls: [ + /https:\/\/api\.example\.com/, + /https:\/\/gateway\.example\.com/, + ], +}); + +// Datadog RUM — allowedTracingUrls +datadogRum.init({ + allowedTracingUrls: [ + { match: 'https://api.example.com', propagatorTypes: ['tracecontext'] }, + ], +}); +``` + +The server must emit the same `trace_id` in its own spans. When the browser console shows an error, the `trace_id` links directly to the backend trace in Sentry Performance, Datadog APM, or any OTel-compatible backend. + +**Anti-pattern to avoid:** client retry loop on 5xx without a circuit breaker. Retrying immediately amplifies server load. Implement exponential backoff + circuit breaker in the fetch layer before enabling distributed tracing — otherwise the correlation data documents a cascading failure, not a single event. Cross-reference: anti-patterns §Section G below and `../../signals/traces.md` for retry trace patterns. + +--- + +## 6. Third-Party Scripts and CSP + +3rd-party scripts are the dominant cause of LCP regression and a primary XSS / supply-chain attack vector. Observability of 3rd-party scripts requires both performance attribution and security violation reporting. + +### Content Security Policy Violation Reporting + +```http +Content-Security-Policy: default-src 'self'; + script-src 'self' https://cdn.trusted-vendor.com; + report-to csp-violations; + report-uri https://csp-reports.example.com/collect + +Reporting-Endpoints: csp-violations="https://csp-reports.example.com/collect" +``` + +> `report-uri` is the legacy fallback directive for browsers that do not yet support the Reporting API Level 1 (`report-to`). Firefox gained full `Reporting-Endpoints` support in 2023; keeping both ensures older stable channels still deliver violation reports. + +CSP violations are reported to your server endpoint as JSON. Pipe them to your log backend and alert on new `blocked-uri` origins — these indicate either a new 3rd-party load attempt or a supply-chain injection. + +### Subresource Integrity (SRI) + +Pin the exact hash of any 3rd-party CDN script to detect tampered deliveries: + +```html +<script + src="https://cdn.trusted-vendor.com/lib.min.js" + integrity="sha384-<base64-hash>" + crossorigin="anonymous" +></script> +``` + +### Performance Attribution + +Use the Resource Timing API (`PerformanceObserver` type `resource`, `buffered: true`) to attribute LCP and INP delays to specific 3rd-party origins by comparing `new URL(entry.name).origin` against `window.location.origin`. Report per-origin `entry.duration` to your RUM backend. + +Cross-reference: `../../signals/privacy.md` for 3rd-party cookie and tracking signal rules; `../../anti-patterns.md §Section G` for the "unmonitored 3rd-party script" anti-pattern. + +--- + +## 7. Synthetic Monitoring + +Synthetic monitoring provides an "outside-in" view that complements RUM. RUM shows what real users experience; synthetic shows what a deterministic probe experiences from a specific region, at any time — including before any real user visits. 
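+
+To make the probe side concrete before the tool comparison below, a minimal Playwright check of the kind Checkly schedules from multiple regions; the URL and selectors are hypothetical placeholders:
+
+```typescript
+// A synthetic user-journey probe: load a critical page and assert
+// that its primary interactive element actually renders.
+import { test, expect } from '@playwright/test';
+
+test('checkout page loads and is interactive', async ({ page }) => {
+  const response = await page.goto('https://shop.example.com/checkout');
+  expect(response?.ok()).toBeTruthy();
+
+  // The critical element rendering is a proxy for LCP on this journey.
+  await expect(page.getByRole('button', { name: 'Place order' })).toBeVisible();
+});
+```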
+
+| Tool | Type | Best for |
+|------|------|---------|
+| **Checkly** | Playwright-based browser checks | Full user-journey verification; integrates with CI/CD |
+| **Grafana k6** | Script-based load + synthetic | Load testing + baseline synthetic probes in one tool |
+| **Prometheus Blackbox Exporter** | HTTP/TCP/ICMP probe | Lightweight availability checks; PromQL-native alerting |
+| **Datadog Synthetics** | Browser + API | Unified with Datadog RUM; managed SaaS |
+| **Pingdom** | Managed HTTP probe | Simple uptime monitoring; low operational overhead |
+
+### When to Use RUM vs Synthetic
+
+| Scenario | Use |
+|----------|-----|
+| Understand real user performance distribution | RUM |
+| Detect issues before users notice | Synthetic |
+| Low-traffic page that may have issues | Synthetic (RUM has too few samples) |
+| Geo-distribution of performance | Both (synthetic per region + RUM aggregated) |
+| SLO availability measurement | Synthetic (deterministic; not subject to sampling) |
+| Investigating a real user complaint | RUM session + trace correlation |
+
+Recommendation: run synthetic probes from at least 3 regions. Alert on synthetic probe failures with a tighter SLA than RUM-based alerts (synthetic = canary; RUM = ground truth).
+
+---
+
+## 8. Session Replay
+
+Session replay records DOM mutations to reconstruct a visual playback. Vendors include Sentry Session Replay, Datadog Session Replay (both mask all inputs by default), FullStory, Hotjar, and LogRocket.
+
+**Privacy requirements:**
+- Email addresses, credit card fields, and `input[type=password]` MUST be masked client-side before the payload is sent. Do not rely on server-side masking alone.
+- Session replay consent must be tied to the cookie consent flow (GDPR Article 6(1)(a)).
+- Cross-reference: `../../signals/privacy.md` for replay sanitization rules and PII classification.
+
+---
+
+## 9. Error Tracking
+
+```js
+// Global error handler
+window.addEventListener('error', (event) => {
+  myRumSdk.captureException(event.error, {
+    'span.id': currentSpan?.spanContext().spanId,
+    'trace.id': currentSpan?.spanContext().traceId,
+  });
+});
+
+// Unhandled promise rejections
+window.addEventListener('unhandledrejection', (event) => {
+  myRumSdk.captureException(event.reason);
+});
+```
+
+**Source map uploads:** Minified production bundles produce unreadable stack traces. Upload source maps to your error tracking vendor on every release.
+
+```bash
+# Example: Sentry CLI source map upload (run in CI after build)
+sentry-cli sourcemaps inject ./dist
+sentry-cli sourcemaps upload --org my-org --project my-web ./dist
+```
+
+Cross-reference: `../../boundaries/release.md` for CI integration patterns — source map upload should be gated on the same pipeline step as container image push.
+
+---
+
+## 10. Browser-Specific Observability APIs
+
+| API | Purpose |
+|-----|---------|
+| `performance.mark()` / `performance.measure()` | Custom timing for business-critical flows (e.g., checkout duration as CWV supplement) |
+| Long Task API (`PerformanceObserver` type `longtask`) | Detect main-thread blocking events > 50 ms; correlate with INP failures |
+| Navigation Timing API (`performance.getEntriesByType('navigation')`) | Full page load breakdown: DNS, TCP, TLS, TTFB, DOMContentLoaded, load |
+
+Use `performance.mark('flow-start')` / `performance.measure('flow-duration', 'flow-start', 'flow-end')` to emit named timing spans that complement automated CWV collection.
+
+---
+
+## 11. Matrix Coverage (L7 Row, Web Slice)
+
+These cells from `../../matrix.md` are the primary drivers for this file:
+
+| Matrix cell | Symbol | Artifact |
+|------------|--------|---------|
+| L7 × multi-tenant × metrics | ✅ | Per-tenant CWV distribution (LCP p75, INP p75, CLS p75 segmented by `tenant.id`) |
+| L7 × cross-application × traces | ✅ | Primary: `traceparent` propagated via `propagateTraceHeaderCorsUrls` / `allowedTracingUrls` |
+| L7 × SLO × metrics | ✅ | CWV as SLI — LCP p75 ≤ 2.5s, INP p75 ≤ 200ms, CLS p75 ≤ 0.1 (see `../../boundaries/slo.md`) |
+| L7 × release × traces | ✅ | `service.version` on spans carries frontend build SHA + backend version for correlated release analysis |
+| L7 × privacy × logs | ✅ | PII masking in error stack traces and session replay; no `user.email` in metric labels |
+
+---
+
+## 12. Anti-Patterns (Section G — Frontend/Mobile Candidates)
+
+These are candidates for `../../anti-patterns.md §Section G Frontend/Mobile`:
+
+| # | Anti-pattern | Impact | Remedy |
+|---|-------------|--------|--------|
+| G1 | 3rd-party script loaded without CSP monitoring | Silent XSS or supply-chain injection; no alert | Add `Content-Security-Policy` with `report-to`; pipe violations to log backend |
+| G2 | Source maps not uploaded to error vendor | Stack traces are unreadable minified symbols in production | Upload source maps on every CI release pipeline step |
+| G3 | Client retry loop on 5xx without circuit breaker | Backend saturation cascade from amplified retry storm | Implement exponential backoff + client-side circuit breaker before enabling trace correlation |
+| G4 | `user.email` as metric label | Cardinality explosion + PII violation (GDPR Article 5(1)(c)) | Use opaque `user.id` (stable hash) or remove user dimension from metrics |
+| G5 | FID still reported in dashboards after March 2024 | Stale metric; no longer part of CWV; misleads SLO reviews | Replace FID with INP in all dashboards and SLO definitions |
+| G6 | `propagateTraceHeaderCorsUrls` / `allowedTracingUrls` not configured | Browser CORS preflight rejects `traceparent` injection; client-server correlation silently broken | Add API origins to the SDK CORS allowlist configuration |
+| G7 | Session replay without client-side PII masking | PII (email, card numbers) captured in replay payload before masking | Enable input masking in SDK config; do not rely on server-side redaction alone |
+
+---
+
+## 13. References
+
+Internal cross-references:
+- `../../standards.md` — normative semconv stability tiers and W3C Trace Context requirements
+- `../../matrix.md` — full 112-cell coverage map (L7 row)
+- `../../vendor-categories.md` — RUM vendor category taxonomy and delegation targets
+- `../../anti-patterns.md` — full anti-pattern registry (Section G: Frontend/Mobile)
+- `../../boundaries/slo.md` — OpenSLO definitions for CWV SLIs and burn-rate alerts
+- `../../boundaries/release.md` — source map upload CI integration and `service.version` tagging
+- `../../boundaries/cross-application.md` — full propagator compatibility matrix (for `traceparent` CORS allowlist guidance)
+- `../../signals/privacy.md` — PII masking rules, 3rd-party tracking signals, replay sanitization
+- `../../signals/traces.md` — server-side trace patterns for backend correlation
+- `../mesh.md` — mesh propagator headers (cross-reference for `traceparent` propagation chain)
+- `mobile-rum.md` — mobile RUM (offline-first queuing, battery, app lifecycle)
+- `crash-analytics.md` — native crash symbolication and CFR tracking
+
+### Primary sources
+
+- Core Web Vitals (2024): <https://web.dev/articles/vitals>
+- INP replaces FID: <https://web.dev/blog/inp-cwv>
+- `web-vitals` JS library: <https://github.com/GoogleChrome/web-vitals>
+- OTel JS browser instrumentation: <https://opentelemetry.io/docs/languages/js/getting-started/browser/>
+- OTel frontend instrumentation guide: <https://elastic.co/observability-labs/blog/web-frontend-instrumentation-with-opentelemetry>
+- Datadog RUM browser SDK: <https://docs.datadoghq.com/real_user_monitoring/browser/>
+- Sentry Browser SDK: <https://docs.sentry.io/platforms/javascript/>
+- Grafana Faro: <https://grafana.com/oss/faro/>
diff --git a/.agents/skills/oma-observability/resources/layers/mesh.md b/.agents/skills/oma-observability/resources/layers/mesh.md
new file mode 100644
index 0000000..4edc5b4
--- /dev/null
+++ b/.agents/skills/oma-observability/resources/layers/mesh.md
@@ -0,0 +1,366 @@
+---
+otel_spec: "1.x (stable API/SDK)"
+otel_semconv: "1.27.0 (2024-11)"
+tools:
+  - "Istio: 1.22+; Envoy: 1.29+; OTel Operator: v0.100+"
+---
+
+# Service Mesh Observability
+
+## 1. Why the Service Mesh Has Its Own Observability File
+
+The service mesh is classified as an **L4-L7 hybrid layer** in the taxonomy defined in `resources/standards.md §OSI Boundary Decision`. It differs from the adjacent L4-transport layer in a fundamental way: the mesh proxy (sidecar or ambient agent) operates at the HTTP/gRPC framing level, not at raw TCP socket level.
+
+Three properties make mesh observability distinct enough to warrant a dedicated file:
+
+1. **Zero-code auto-instrumentation.** Sidecar injection (or ambient mode) intercepts all inbound and outbound traffic without requiring any change to application code. Metrics, access logs, and distributed trace spans are produced automatically from day one of mesh deployment. This contrasts with `layers/L4-transport.md`, where eBPF probes attach to kernel socket events and produce transport-centric (TCP RTT, retransmit) artifacts.
+
+2. **Proxy-centric, not transport-centric.** Mesh observability is Envoy-proxy-centric or Linkerd-proxy-centric. The primary artifacts are Envoy listeners/clusters, access log records, and HTTP/gRPC span metadata — not TCP segments or IP packets. `layers/L4-transport.md` covers kernel-level transport; this file covers proxy-level L7 framing.
+
+3.
**mTLS identity as a first-class signal.** The mesh terminates mTLS at the sidecar, producing certificate identity (SPIFFE SVID), cipher suite, and expiry as observable attributes alongside each request. No other layer produces this security context natively. + +--- + +## 2. Service Mesh Options (as of 2026-Q2) + +| Mesh | CNCF Status | Proxy engine | Deployment modes | Primary differentiator | +|------|------------|-------------|-----------------|----------------------| +| **Istio** | Graduated | Envoy | Sidecar, Ambient (ztunnel + waypoint) | Richest Envoy telemetry; Telemetry API v2; OTLP direct export since 1.22 | +| **Linkerd** | Graduated | linkerd2-proxy (Rust) | Sidecar only | Lightweight; minimal CPU overhead; built-in mTLS; own propagator headers | +| **Envoy** | Graduated | Self | Standalone gateway or sidecar | Proxy engine underlying Istio; also deployed as standalone API gateway | +| **Consul Connect** | Not CNCF | Envoy (via xDS) | Sidecar | HashiCorp ecosystem; strong multi-datacenter support | +| **Kuma** | CNCF Sandbox | Envoy (via xDS) | Sidecar, Universal (VM) | Multi-cloud and multi-zone; Kong-backed | + +CNCF status source: <https://landscape.cncf.io> — verify quarterly. + +--- + +## 3. Native Telemetry from the Mesh + +### 3.1 Metrics: Envoy Golden Signals + +Envoy exposes golden signals per **listener** (inbound) and **cluster** (outbound). Istio enables a controlled subset by default to avoid cardinality explosion caused by per-endpoint label permutations. + +| Signal | Metric names (Istio/Envoy) | Label cardinality note | +|--------|---------------------------|----------------------| +| Request rate (throughput) | `istio_requests_total` | High: source × destination × method × status code | +| Error rate | `istio_requests_total{response_code=~"5.."}` | Subset of throughput labels | +| Latency p50/p95/p99 | `istio_request_duration_milliseconds` | Histogram; disable per-endpoint if >1000 pods | +| Saturation (pending) | `envoy_cluster_upstream_rq_pending_total` | Per cluster | +| mTLS handshake errors | `istio_tcp_connections_closed_total` | Proxy-level | + +Istio 1.22+ Telemetry API disables `destination_service_name` by default at high pod counts to cap cardinality. Override only after confirming cardinality budget in `resources/meta-observability.md`. + +### 3.2 Distributed Traces + +Envoy automatically creates **ingress and egress spans** for every HTTP/gRPC call that traverses the proxy. Each span captures: + +- `http.method`, `http.status_code`, `http.url` (Stable semconv) +- `peer.service` derived from Envoy cluster name +- mTLS peer identity as `net.peer.name` (when available) +- `x-request-id` correlation ID (Istio-specific; carries across internal calls) + +Spans are created without SDK changes. The caveat is that without application-level header forwarding, each service starts a **new root span** rather than contributing to the existing trace. This is the context propagation problem addressed in Section 5. + +### 3.3 Access Logs + +Envoy access logs provide per-request metadata: source workload identity, destination service, HTTP method and status code, request duration, bytes sent and received, and upstream cluster name. Access logs complement traces — they are always available (100% sampling) even when traces are sampled at 1%. 
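+
+Access logging is off by default in Istio. A minimal sketch enabling it mesh-wide through the Telemetry API's built-in `envoy` stdout provider (the resource name is illustrative):
+
+```yaml
+# Sketch: turn on Envoy access logs for every workload in the mesh
+apiVersion: telemetry.istio.io/v1
+kind: Telemetry
+metadata:
+  name: mesh-access-logs
+  namespace: istio-system        # root namespace = mesh-wide scope
+spec:
+  accessLogging:
+    - providers:
+        - name: envoy            # Istio's built-in stdout access log provider
+```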
+ +Structured access log format (JSON) is required for trace correlation: + +```json +{ + "trace_id": "%TRACE_ID%", + "span_id": "%SPAN_ID%", + "upstream_cluster": "%UPSTREAM_CLUSTER%", + "response_code": "%RESPONSE_CODE%", + "duration": "%DURATION%", + "source_principal": "%DOWNSTREAM_PEER_SUBJECT%" +} +``` + +> Envoy command operators `%TRACE_ID%` and `%SPAN_ID%` (available since Envoy 1.25) extract the W3C `traceparent` context Envoy itself emitted. Using `%REQ(X-B3-TRACEID)%` is only correct if the upstream explicitly sent the B3 header. Since this skill standardizes on W3C (see §6 Propagator Headers), prefer the command operators. + +### 3.4 mTLS Certificate Observability + +Istio exposes certificate expiry and rotation events as Prometheus metrics: + +- `citadel_server_csr_count` — certificate signing request volume +- `pilot_xds_pushes{type="sds"}` — SDS secret delivery (certificate rotation) +- Cert expiry: scraped from `istio-proxy` via `/pki` endpoint, alertable as a PrometheusRule + +Cross-reference: `signals/privacy.md §Security Context (TLS attrs Development)` for attribute stability notes. + +--- + +## 4. OpenTelemetry Direct Export (Envoy 1.29+ / Istio 1.22+) + +### 4.1 Envoy OTLP Tracer + +Since Envoy 1.29, the built-in OTLP tracer exports spans directly over OTLP/HTTP to any OTel-compatible backend or OTel Collector, replacing the older Zipkin and Jaeger tracers. + +Istio 1.22 surfaces this via the **Telemetry API** `opentelemetry` provider type, which avoids editing raw Envoy bootstrap config. + +### 4.2 Telemetry CR (Istio 1.22+) + +```yaml +# Configure Istio tracing to export OTLP/HTTP to an OTel Collector +apiVersion: telemetry.istio.io/v1 +kind: Telemetry +metadata: + name: otel-tracing + namespace: istio-system +spec: + tracing: + - providers: + - name: otel-tracing-provider + randomSamplingPercentage: 10.0 +--- +# Register the OTel tracing provider in MeshConfig +# (set via IstioOperator or helm values: meshConfig.extensionProviders) +# meshConfig: +# extensionProviders: +# - name: otel-tracing-provider +# opentelemetry: +# service: otel-collector.observability.svc.cluster.local +# port: 4318 +# resourceDetectors: +# environment: {} +``` + +The `resourceDetectors.environment` block activates the **Environment Resource Detector**, which reads `OTEL_RESOURCE_ATTRIBUTES` from the proxy environment and enriches every span with `service.name`, `service.namespace`, `k8s.pod.name`, and `k8s.node.name` (Stable semconv) automatically. + +### 4.3 Custom Samplers (Envoy 1.29+) + +Envoy 1.29 introduced the OTel Sampler interface, allowing parent-based or trace-ID-ratio samplers to be configured without patching application code. Tail-based sampling decisions are still made at the Collector tier — see `resources/transport/sampling-recipes.md` for tail-sampler configuration. + +--- + +## 5. Zero-Code Auto-Instrumentation Pitfall: Broken Context Propagation + +### 5.1 The Problem + +The mesh sidecar creates spans but cannot inject `traceparent` into the application's outbound HTTP calls by itself. If the application does not read the incoming `traceparent` header and forward it on outbound calls, Envoy on the egress side starts a **new trace**, breaking the trace chain. The result is a trace forest of disconnected single-hop spans rather than a unified distributed trace. + +This is anti-pattern #1 for mesh observability: mesh-only tracing without application header forwarding. 
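+
+The symptom is cheap to confirm at the Collector before picking a fix: tee a small sample of spans to the `debug` exporter and inspect the parent span ID. In a broken-propagation mesh, spans from mid-chain services arrive with an empty parent span ID — they are new roots. A sketch, assuming an OTLP receiver is already defined; the pipeline name is illustrative:
+
+```yaml
+processors:
+  probabilistic_sampler:
+    sampling_percentage: 1       # keep the inspection sample small
+
+exporters:
+  debug:
+    verbosity: detailed          # prints full span fields, including the parent span ID
+
+service:
+  pipelines:
+    traces/inspect:
+      receivers: [otlp]
+      processors: [probabilistic_sampler]
+      exporters: [debug]
+```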
+ +### 5.2 Solution A: Application-Level Propagator Library + +Add the W3C Trace Context propagator to the application SDK. The SDK reads `traceparent` from incoming requests and automatically injects it into all outbound HTTP/gRPC calls. + +```python +# Python example (opentelemetry-sdk) +from opentelemetry.propagate import set_global_textmap +from opentelemetry.propagators.composite import CompositePropagator +from opentelemetry.propagators.b3 import B3MultiFormat +from opentelemetry import propagate + +set_global_textmap(CompositePropagator([ + propagate.get_global_textmap(), # W3C TraceContext + Baggage + B3MultiFormat(), # B3 for Zipkin-legacy services +])) +``` + +### 5.3 Solution B: OTel Operator Instrumentation CR + +The **OpenTelemetry Operator** (`github.com/open-telemetry/opentelemetry-operator`) injects the OTel SDK init container and environment variables into Pods automatically via a mutating webhook, without any application code change. + +Supported language runtimes: Java, NodeJS, Python, .NET, Go, Apache HTTPD, Nginx. + +**Step 1 — Deploy the Instrumentation CR:** + +```yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: default-instrumentation + namespace: my-app +spec: + exporter: + endpoint: http://otel-collector.observability.svc.cluster.local:4318 + propagators: + - tracecontext # W3C traceparent / tracestate + - baggage # W3C baggage + - b3 # B3 single-header (Zipkin legacy compatibility) + sampler: + type: parentbased_traceidratio + argument: "0.1" + java: + image: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-java:latest + nodejs: + image: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-nodejs:latest + python: + image: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python:latest + dotnet: + image: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-dotnet:latest + go: + image: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-go:latest +``` + +**Step 2 — Activate per Pod via annotation:** + +```yaml +metadata: + annotations: + instrumentation.opentelemetry.io/inject-java: "true" + # or: inject-nodejs, inject-python, inject-dotnet, inject-go +``` + +The Operator injects the SDK agent, sets `OTEL_EXPORTER_OTLP_ENDPOINT`, and configures the propagator list. The sidecar then sees correctly formed `traceparent` headers on all outbound calls, connecting mesh spans to application spans in a single trace. + +--- + +## 6. Propagator Headers per Mesh + +Cross-reference: `resources/boundaries/cross-application.md` for full propagator compatibility matrix. 
+ +| Header | Standard / Origin | Carried by | Notes | +|--------|------------------|-----------|-------| +| `traceparent` | W3C Trace Context L1 | W3C-compliant SDKs, Envoy OTLP, Istio | Recommended default for all new deployments | +| `tracestate` | W3C Trace Context L1 | Same as above | Vendor-specific trace flags; opaque to intermediaries | +| `baggage` | W3C Baggage | OTel SDK, Istio | Key-value propagation; strip PII at ingress gateway | +| `x-b3-traceid` | B3 (Zipkin) | Envoy B3 propagator, Linkerd (legacy compat) | 128-bit; use B3 multi-header format | +| `x-b3-spanid` | B3 (Zipkin) | Same | 64-bit | +| `x-b3-parentspanid` | B3 (Zipkin) | Same | 64-bit; absent on root spans | +| `x-b3-sampled` | B3 (Zipkin) | Same | `1` = keep, `0` = drop; overrides downstream sampler | +| `x-request-id` | Envoy / Istio internal | Istio sidecar injected | Stable correlation ID across retries; not a trace ID | +| `x-ot-span-context` | OpenTracing (legacy) | Envoy OpenTracing tracer | Deprecated; replaced by OTLP tracer in Envoy 1.29+ | +| `l5d-ctx-trace` | Linkerd | linkerd2-proxy | Linkerd trace context; not W3C-compatible | +| `l5d-ctx-span` | Linkerd | linkerd2-proxy | Current span identifier in Linkerd format | +| `l5d-ctx-parent` | Linkerd | linkerd2-proxy | Parent span reference | +| `l5d-ctx-deadline` | Linkerd | linkerd2-proxy | Deadline propagation (timeout budget) | + +**Cross-mesh compatibility rule:** when Istio and Linkerd coexist (or when a service exits the mesh boundary to an external system), standardize on W3C `traceparent` / `tracestate` as the translation surface. Configure each mesh's propagator list to include W3C as the first entry. Translate Linkerd `l5d-ctx-*` headers to W3C at the mesh boundary gateway to avoid split-brain traces. + +--- + +## 7. Combining Mesh and Application Telemetry + +The mesh alone covers **network boundary spans**: the ingress proxy creates a server-side span, the egress proxy creates a client-side span. What it cannot see is what the application does between receiving a request and making the next outbound call — database queries, cache reads, external API calls, or business-logic operations. + +| Source | Spans covered | Business context | +|--------|--------------|-----------------| +| Mesh (Envoy/Linkerd) | Ingress and egress HTTP/gRPC per service hop | None — proxy has no access to app state | +| Application SDK | Internal operations: DB, cache, queue, business logic | Full — SDK operates inside the application process | +| Combined via OTel Collector | All of the above, correlated by `trace_id` | Complete — best result | + +Recommended pipeline architecture: + +``` +Envoy sidecar + └─ OTLP/HTTP → OTel Collector (DaemonSet or sidecar) + └─ traces → observability backend + +Application SDK (via OTel Operator Instrumentation CR) + └─ OTLP/gRPC → OTel Collector (same or separate instance) + └─ traces → observability backend (same trace_id) +``` + +Both pipelines feed the same backend. The backend joins spans by `trace_id`. The result is a waterfall chart with proxy-level boundary spans and application-level internal spans in the same view. + +For Collector topology options see `resources/transport/collector-topology.md`. + +--- + +## 8. mTLS Observability (Security Context) + +When Istio PeerAuthentication is set to `STRICT` mode, all pod-to-pod traffic is encrypted with mTLS. 
The mesh then exposes security context as observable attributes:
+
+- **TLS version**: `tls.protocol.version` (Development tier per semconv 1.27.0 — see `resources/standards.md §Semconv Stability Tiers`)
+- **Cipher suite**: `tls.cipher` (Development tier)
+- **Certificate expiry**: alertable via PrometheusRule on `citadel_server_root_cert_expiry_timestamp`
+- **SPIFFE peer identity**: available in Envoy access log via `%DOWNSTREAM_PEER_SUBJECT%` — identifies source workload for zero-trust audit
+
+```yaml
+# PrometheusRule: alert when any Istio cert expires within 7 days
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: istio-cert-expiry
+  namespace: istio-system
+spec:
+  groups:
+    - name: istio.certificates
+      rules:
+        - alert: IstioCertExpiryWarning
+          expr: |
+            (citadel_server_root_cert_expiry_timestamp - time()) / 86400 < 7
+          for: 1h
+          labels:
+            severity: warning
+          annotations:
+            summary: "Istio root certificate expires in fewer than 7 days"
+```
+
+Cross-reference: `resources/signals/privacy.md §Security Context` for TLS attribute stability notes.
+
+---
+
+## 9. Sampling Considerations
+
+Istio's default sampling rate is **1%** (`randomSamplingPercentage: 1.0`). At 1%, the first trace becomes visible in the backend only after roughly 100 requests on average — insufficient for debugging low-traffic endpoints or canary deployments.
+
+| Scenario | Recommended sampling rate | Configuration location |
+|----------|--------------------------|----------------------|
+| Development / staging | 100% | Telemetry CR `randomSamplingPercentage: 100` |
+| Production (baseline) | 10% | Telemetry CR `randomSamplingPercentage: 10` |
+| High-traffic SLO critical path | Tail-based at Collector | Collector `tail_sampling` processor |
+| Canary release tracing | 100% on canary subset | Telemetry CR scoped to canary namespace or label (sketch below) |
+
+Tune via the Telemetry API `randomSamplingPercentage` field (Istio 1.22+) or by configuring a custom OTel Sampler via Envoy 1.29+. For tail-based sampling (retain error traces, drop successful traces after SLO window), configure at the gateway Collector tier as documented in `resources/transport/sampling-recipes.md`.
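+
+For the canary row above, scope a Telemetry resource to the canary's namespace; a namespace-scoped CR overrides the mesh-wide default for that namespace only. A minimal sketch, assuming the canary workloads run in a dedicated `canary` namespace and the `otel-tracing-provider` registered in §4.2:
+
+```yaml
+# Sketch: 100% trace sampling for canary workloads only
+apiVersion: telemetry.istio.io/v1
+kind: Telemetry
+metadata:
+  name: canary-full-sampling
+  namespace: canary              # assumed canary namespace
+spec:
+  tracing:
+    - providers:
+        - name: otel-tracing-provider
+      randomSamplingPercentage: 100.0
+```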
+
+---
+
+## 10. Matrix Coverage Reference (mesh row)
+
+These cells from `resources/matrix.md` are the primary coverage drivers for this file:
+
+| matrix cell | symbol | artifact |
+|------------|--------|---------|
+| mesh × cross-application × traces | ✅ | Zero-code L7 trace continuity — primary use case of the mesh layer |
+| mesh × multi-tenant × traces | ✅ | `tenant.id` via W3C Baggage; proxy can enforce baggage scrubbing at gateway |
+| mesh × release × traces | ⚠️ | Canary routing rules are observable; `service.version` on spans; trace continuity requires OTel Operator CR |
+| mesh × privacy × * | ✅ | mTLS config observability; baggage scrubbing; SPIFFE identity in access log |
+| mesh × slo × metrics | ✅ | Envoy request rate + error rate are primary SLI sources |
+
+---
+
+## 11. Anti-Patterns
+
+The following are anti-patterns for this layer. They are candidates for inclusion in `resources/anti-patterns.md`.
+
+| # | Anti-pattern | Impact | Remedy |
+|---|-------------|--------|--------|
+| AP-M1 | Mesh-only tracing without application SDK | Spans cover network hops only; DB queries, business logic, and external API calls invisible | Deploy OTel Operator Instrumentation CR to inject SDK into all application pods |
+| AP-M2 | W3C Trace Context not standardized across meshes | Linkerd and Istio produce disconnected trace forests when a request crosses mesh boundaries | Configure W3C as first propagator in both meshes; translate at boundary gateway |
+| AP-M3 | Default 1% Istio sampling rate in production without awareness | First trace visible only after ~100 requests on average; low-traffic endpoints never appear in trace backend | Raise to 10% baseline; use tail-based sampling at Collector for cost control |
+| AP-M4 | OTel Operator Instrumentation CR not deployed | `traceparent` not forwarded by application code; mesh spans are disconnected root spans | Deploy `Instrumentation` CR and annotate all application Deployments |
+| AP-M5 | Cardinality explosion from per-endpoint Envoy labels | `istio_requests_total` with high-cardinality `destination_service_name` × `source_workload` × URL path exceeds Prometheus cardinality budget | Use Telemetry API to suppress high-cardinality labels; aggregate at Collector metric transform processor |
+
+---
+
+## 12. Cross-References
+
+Internal cross-references:
+- `resources/standards.md` — normative semconv stability tiers and W3C Trace Context requirements
+- `resources/matrix.md` — full 112-cell coverage map (mesh row)
+- `resources/transport/sampling-recipes.md` — tail-based sampling at Collector tier
+- `resources/transport/collector-topology.md` — DaemonSet vs sidecar Collector topology
+- `resources/meta-observability.md` — cardinality guardrails and pipeline self-health
+- `resources/layers/L4-transport.md` — eBPF socket-level profiles for mesh sidecar overhead
+- `resources/boundaries/cross-application.md` — full propagator compatibility matrix
+- `resources/boundaries/multi-tenant.md` — baggage-based tenant attribution
+- `resources/boundaries/release.md` — canary trace routing with `service.version`
+- `resources/signals/privacy.md` — TLS attribute stability and mTLS security context
+- `resources/signals/traces.md` — OTel SDK trace patterns for application layer
+
+## References
+
+- Istio observability concepts: <https://istio.io/latest/docs/concepts/observability/>
+- Istio OpenTelemetry tracing task: <https://istio.io/latest/docs/tasks/observability/distributed-tracing/opentelemetry/>
+- Envoy + Istio OTel features (2024): <https://opentelemetry.io/blog/2024/new-otel-features-envoy-istio/>
+- OTel Operator GitHub: <https://github.com/open-telemetry/opentelemetry-operator>
+- Linkerd observability and propagator docs: <https://linkerd.io/2.15/features/distributed-tracing/>
+- W3C Trace Context L1: <https://www.w3.org/TR/trace-context/>
+- OTel semconv `tls.*`: <https://opentelemetry.io/docs/specs/semconv/attributes-registry/tls/>
diff --git a/.agents/skills/oma-observability/resources/matrix.md b/.agents/skills/oma-observability/resources/matrix.md
new file mode 100644
index 0000000..ed26963
--- /dev/null
+++ b/.agents/skills/oma-observability/resources/matrix.md
@@ -0,0 +1,179 @@
+---
+otel_spec: "1.x (stable API/SDK)"
+otel_semconv: "1.27.0 (2024-11)"
+---
+
+# Observability Coverage Matrix
+
+## 1. Purpose
+
+This file is the coverage map for the `oma-observability` skill. 
It answers one question per cell: + +> "What observability artifact belongs at the intersection of this layer, this boundary, and this signal?" + +The matrix has three goals: + +1. **Prevent hand-wave instrumentation.** "Just add metrics" is not a plan. Every cell forces an explicit decision: what artifact, from which source, tagged how, stored where. +2. **Surface N/A early.** Some combinations are physically or semantically meaningless (e.g., L3-level CPU profiling). Declaring them N/A is a correct engineering answer, not a gap. +3. **Provide navigation.** Each covered cell links to the authoritative layer/boundary/signal file within this skill so implementers can jump directly to the detailed guidance. + +**Normative base:** all taxonomy, semconv stability tiers, and OSI boundary decisions are declared in `resources/standards.md`. Read that file before using this matrix. + +--- + +## 2. Taxonomy + +### Layers (4) + +| Label | Meaning | +|-------|---------| +| L3-network | IP routing, VPC flow logs, BGP/BMP, ICMP, PMTUD | +| L4-transport | TCP retransmits/RTT, QUIC/HTTP3 transport, eBPF (Beyla/Pixie) | +| mesh | Istio/Linkerd/Envoy: zero-code instrumentation, mTLS metadata as security context | +| L7-application | HTTP/gRPC/WebSocket, RUM (web + mobile), crash analytics, messaging | + +### Boundaries (4) + +| Label | Meaning | +|-------|---------| +| multi-tenant | Signals must be isolated, attributed, and possibly residency-constrained per tenant | +| cross-application | Propagation across service, domain, or vendor boundaries | +| slo | Signals that feed SLI computation, burn-rate alerting, or error-budget accounting | +| release | Signals that are correlated to a specific deployment, canary, or feature flag state | + +### Signals (7) + +| Label | Meaning | +|-------|---------| +| metrics | Numeric time-series (counters, gauges, histograms) | +| logs | Structured log records and events | +| traces | Distributed spans and context propagation | +| profiles | Continuous CPU/memory/off-CPU flame graphs (OTEP 0239 alpha) | +| cost | FinOps unit-economics and resource attribution | +| audit | Immutable, tamper-evident event records for compliance | +| privacy | PII detection, redaction, anonymization, pseudonymization controls | + +--- + +## 3. How to Read the Matrix + +- **Row** = one boundary within a given layer. +- **Column** = one signal. +- **Cell content** = 1-2 word artifact description + file reference. + +### Cell legend + +| Symbol | Meaning | +|--------|---------| +| ✅ | Covered: artifact is well-defined; see referenced file and section | +| ⚠️ | Partially covered or requires caveat; see referenced file and "Caveats" section below | +| ❌ | N/A — combination is not meaningful at this layer/boundary or produces no actionable artifact | + +Every ✅ and ⚠️ cell includes at least one file reference. File references point to files within this skill tree. Files not yet written (Phase 1b/1c) are forward references and are explicitly allowed. + +--- + +## 4. The Matrix + +### Layer: L3-network + +L3 is the IP routing layer: VPC flow logs, BGP/BMP, ICMP unreachables, and PMTUD probing. Observability at this layer is network-flow-centric. Traces are not native to L3 packets; profiles are not applicable. SLO and release boundaries are defined at the application layer and cannot be computed from IP flow data alone. 
+ +| Boundary \ Signal | metrics | logs | traces | profiles | cost | audit | privacy | +|---|---|---|---|---|---|---|---| +| multi-tenant | ✅ per-tenant VPC flow byte/packet counters → `layers/L3-network.md`, `boundaries/multi-tenant.md §Metric attribution` | ✅ VPC flow log stream tagged by tenant network CIDR → `layers/L3-network.md`, `signals/logs.md` | ⚠️ trace-ID egress tagging at L3 boundary only; trace context not carried in IP headers → `layers/L3-network.md §BGP advanced`, `signals/traces.md` | ❌ N/A — CPU/memory profiling has no L3 artifact | ⚠️ egress byte attribution by tenant VPC → `signals/cost.md §egress`, `boundaries/multi-tenant.md` | ✅ VPC flow audit trail tagged by tenant → `signals/audit.md`, `layers/L3-network.md` | ⚠️ IP addresses are PII in GDPR/PIPA; mask or hash before retention → `signals/privacy.md §IP addresses` | +| cross-application | ✅ inter-VPC/peering flow metrics → `layers/L3-network.md`, `signals/metrics.md` | ✅ VPC flow logs across peering or transit gateway → `layers/L3-network.md`, `signals/logs.md` | ⚠️ L3 packets carry no trace context natively; use trace-ID in DNS or SNI side-channel only → `layers/L3-network.md`, `boundaries/cross-application.md` | ❌ N/A | ⚠️ cross-VPC egress cost attribution; unreliable without flow tagging → `signals/cost.md` | ✅ inter-VPC flow audit for SOC2 network controls → `signals/audit.md`, `layers/L3-network.md` | ⚠️ source/destination IPs crossing application boundary are PII candidates → `signals/privacy.md §IP addresses` | +| slo | ❌ N/A — SLO error budgets are defined at L7; L3 availability feeds infra health at most | ❌ N/A — VPC flow logs are operational, not SLO inputs | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | +| release | ❌ N/A — release events carry no L3 signal | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | + +--- + +### Layer: L4-transport + +L4 is the TCP/UDP/QUIC transport layer. eBPF (Beyla, Pixie) is the primary observability mechanism, producing socket-level metrics and off-CPU flame graphs without code changes. TCP retransmits, RTT, and connection states are rich signals. Trace context is not native to TCP; it lives at the application framing layer. SLO and release boundaries are application concerns. 
+ +| Boundary \ Signal | metrics | logs | traces | profiles | cost | audit | privacy | +|---|---|---|---|---|---|---|---| +| multi-tenant | ✅ per-tenant TCP retransmit rate, RTT histograms via eBPF socket filter → `layers/L4-transport.md §eBPF`, `boundaries/multi-tenant.md` | ✅ TCP connection lifecycle events per tenant socket namespace → `layers/L4-transport.md`, `signals/logs.md` | ⚠️ TCP is not trace-native; mesh or L7 must carry trace context; L4 can log socket tuples for correlation → `layers/L4-transport.md`, `signals/traces.md` | ✅ eBPF CPU/off-CPU profiling at socket-level covers L4 overhead; pprof-compatible output → `signals/profiles.md`, `layers/L4-transport.md §eBPF` | ⚠️ L4 byte volume per tenant as cost proxy; not a direct cost signal → `signals/cost.md` | ⚠️ connection-level audit (who connected to what port, when) — useful for SOC2 network access controls → `signals/audit.md`, `layers/L4-transport.md` | ⚠️ source IPs in TCP metadata are PII; apply same IP-masking rules as L3 → `signals/privacy.md §IP addresses` | +| cross-application | ✅ cross-service TCP RTT and retransmit metrics via eBPF → `layers/L4-transport.md §eBPF`, `signals/metrics.md` | ✅ TCP connection events across service socket pairs → `layers/L4-transport.md`, `signals/logs.md` | ⚠️ trace context does not exist at L4; identify cross-app flows by socket 5-tuple and correlate to L7 spans → `layers/L4-transport.md`, `boundaries/cross-application.md` | ✅ eBPF off-CPU profiles covering network-wait time across application boundaries → `signals/profiles.md`, `layers/L4-transport.md §eBPF` | ⚠️ cross-application byte volume as FinOps signal; correlate with L7 cost attribution for accuracy → `signals/cost.md` | ✅ TCP connection audit across trust boundaries → `signals/audit.md`, `layers/L4-transport.md` | ⚠️ connection metadata contains IPs; apply masking at pipeline ingress → `signals/privacy.md` | +| slo | ❌ N/A — SLO windows are application-defined; L4 connection success rate may inform infra SLI but is not a canonical SLO boundary | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | +| release | ❌ N/A — release events are application-layer; L4 has no deployment semantic | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | ❌ N/A | + +--- + +### Layer: mesh + +The service mesh layer (Istio, Linkerd, Envoy) operates as a transparent L4-L7 proxy sidecar or ambient mode agent. It provides zero-code instrumentation, mTLS metadata as security context, and W3C Trace Context propagation without application code changes. This layer has the strongest native multi-signal coverage of any layer in this skill. 
+ +| Boundary \ Signal | metrics | logs | traces | profiles | cost | audit | privacy | +|---|---|---|---|---|---|---|---| +| multi-tenant | ✅ per-tenant RED (Rate/Error/Duration) metrics from Envoy telemetry; tagged via `tenant.id` baggage → `layers/mesh.md`, `boundaries/multi-tenant.md §Metric attribution`, `signals/metrics.md` | ✅ Envoy access logs per tenant with baggage-derived tenant tag → `layers/mesh.md`, `signals/logs.md` | ✅ Envoy zero-code span injection + W3C Baggage `tenant.id` propagation → `layers/mesh.md`, `boundaries/multi-tenant.md`, `signals/traces.md` | ⚠️ mesh proxies add latency overhead visible in eBPF profiles; not a mesh-native profiling source → `signals/profiles.md`, `layers/mesh.md` | ⚠️ request-level cost attribution by tenant via mesh telemetry; feed into OpenCost → `signals/cost.md`, `layers/mesh.md` | ✅ mTLS identity + access log provide SOC2 accountability trail per tenant → `signals/audit.md`, `layers/mesh.md` | ✅ mTLS config enforces transport encryption; baggage scrubbing at ingress gateway removes PII before propagation → `signals/privacy.md`, `layers/mesh.md`, `resources/standards.md §W3C Baggage` | +| cross-application | ✅ cross-service RED metrics at mesh proxy; primary use case for service topology mapping → `layers/mesh.md`, `boundaries/cross-application.md`, `signals/metrics.md` | ✅ Envoy access logs across service-to-service calls; correlation by `trace_id` → `layers/mesh.md`, `signals/logs.md` | ✅ primary trace origin for cross-application spans; Envoy injects spans without code changes; W3C Trace Context propagation → `layers/mesh.md`, `boundaries/cross-application.md`, `signals/traces.md` | ⚠️ mesh overhead visible via eBPF off-CPU profiles on sidecar process; not mesh-native → `signals/profiles.md`, `layers/L4-transport.md §eBPF` | ✅ request cost attribution across services; mesh provides per-service byte/request counts → `signals/cost.md`, `layers/mesh.md` | ✅ mTLS peer identity in access log provides cross-application accountability → `signals/audit.md`, `layers/mesh.md` | ✅ baggage trust boundary enforced at mesh ingress gateway; strip or validate external baggage → `signals/privacy.md`, `boundaries/cross-application.md`, `resources/standards.md §W3C Baggage` | +| slo | ✅ mesh request rate and error rate are primary SLI sources for latency and availability SLOs → `layers/mesh.md`, `boundaries/slo.md`, `signals/metrics.md §SLI` | ⚠️ Envoy access logs as burn-rate event source; log-based SLI possible but metric-based preferred → `signals/logs.md`, `boundaries/slo.md` | ⚠️ trace sampling must be configured to retain error traces for SLO error-budget forensics; not a direct SLO input → `signals/traces.md`, `boundaries/slo.md` | ❌ N/A — profiles do not feed SLO computation | ❌ N/A — cost is a separate FinOps concern, not an SLO boundary input | ⚠️ SLO audit trail: record burn-rate threshold breach events as audit log entries → `signals/audit.md`, `boundaries/slo.md` | ❌ N/A | +| release | ✅ mesh request split metrics for canary traffic (Flagger/Argo Rollouts proxy rules) → `layers/mesh.md`, `boundaries/release.md`, `signals/metrics.md` | ✅ deployment event injected as Envoy log entry with `service.version` tag → `layers/mesh.md`, `signals/logs.md`, `boundaries/release.md` | ⚠️ canary proxy rules at mesh layer route a percentage of traces to new version; trace header carries `service.version` → `layers/mesh.md`, `boundaries/release.md`, `signals/traces.md` | ❌ N/A | ⚠️ canary cost delta visible via per-version request counts in mesh telemetry → 
`signals/cost.md`, `boundaries/release.md` | ✅ release event audit: mesh records version-tagged traffic split events → `signals/audit.md`, `boundaries/release.md` | ❌ N/A | + +--- + +### Layer: L7-application + +L7 is the application layer: HTTP/gRPC, RUM (web + mobile), crash analytics, and messaging. This is the primary signal domain for this skill. All seven signals are meaningful here. Multi-tenant instrumentation is richest at L7 because application code has full access to user context, tenant identifiers, and business semantics. + +| Boundary \ Signal | metrics | logs | traces | profiles | cost | audit | privacy | +|---|---|---|---|---|---|---|---| +| multi-tenant | ✅ per-tenant RED + custom business metrics tagged with `tenant.id`; histogram by tenant tier → `signals/metrics.md §tenant`, `boundaries/multi-tenant.md`, `layers/L7-application/web-rum.md` | ✅ structured log stream with `tenant.id` on every record; tenant log routing via OTel Collector `routing_connector` → `signals/logs.md`, `boundaries/multi-tenant.md` | ✅ W3C Baggage `tenant.id` propagated on every span; trace exported per-tenant collector pipeline if residency required → `signals/traces.md`, `boundaries/multi-tenant.md`, `resources/standards.md §W3C Baggage` | ✅ per-tenant continuous profiling with Parca/Pyroscope; label by `tenant.id` for flame graph isolation → `signals/profiles.md`, `boundaries/multi-tenant.md` | ✅ request-level cost attribution by tenant; feed OpenCost unit economics model → `signals/cost.md §unit economics`, `boundaries/multi-tenant.md` | ✅ per-tenant audit event stream; WORM storage per tenant for SOC2 evidence → `signals/audit.md`, `boundaries/multi-tenant.md` | ✅ tenant-scoped PII redaction; per-tenant privacy config for GDPR right-to-erasure → `signals/privacy.md`, `boundaries/multi-tenant.md` | +| cross-application | ✅ inter-service request rate and latency histograms; use `service.name` + `peer.service` labels → `signals/metrics.md`, `boundaries/cross-application.md` | ✅ correlation log: `trace_id` + `span_id` on every log record enables log-trace join across applications → `signals/logs.md`, `boundaries/cross-application.md` | ✅ primary trace origin; W3C Trace Context `traceparent` on all outbound calls; DDD namespace baggage for bounded-context attribution → `signals/traces.md`, `boundaries/cross-application.md`, `resources/standards.md §W3C Trace Context` | ✅ application-level profiling showing cross-service call overhead; correlate with traces via `trace_id` label → `signals/profiles.md`, `boundaries/cross-application.md` | ✅ request cost attribution across services; per-service unit cost model → `signals/cost.md`, `boundaries/cross-application.md` | ✅ cross-application audit events carry caller identity and `trace_id` for accountability chain → `signals/audit.md`, `boundaries/cross-application.md` | ✅ baggage trust boundary at API gateway; validate or strip PII-bearing baggage from external callers → `signals/privacy.md`, `boundaries/cross-application.md`, `resources/standards.md §W3C Baggage` | +| slo | ✅ SLI metric computation (availability, latency p99, error rate); SLO targets defined in OpenSLO YAML → `signals/metrics.md §SLI`, `boundaries/slo.md`, `resources/observability-as-code.md` | ⚠️ log-based SLI possible (error log count / total); valid for non-metrics-instrumented services but metric-based SLI preferred → `signals/logs.md`, `boundaries/slo.md` | ⚠️ critical path trace sampling for SLO forensics; tail-sampler keeps error traces within error budget window → 
`signals/traces.md`, `boundaries/slo.md`, `transport/sampling-recipes.md` | ❌ N/A — profiling does not feed SLO computation directly | ❌ N/A — cost SLO is a FinOps budget concern, not an error-budget SLO | ⚠️ SLO breach audit record: persist burn-rate threshold crossing as immutable audit event → `signals/audit.md`, `boundaries/slo.md` | ❌ N/A | +| release | ✅ release marker metric event; `service.version` label on all metrics for before/after comparison → `signals/metrics.md`, `boundaries/release.md`, `resources/observability-as-code.md` | ✅ deployment event as structured log record; deployment SHA, version, and rollout strategy logged → `signals/logs.md`, `boundaries/release.md` | ✅ `service.version` attribute on all spans; canary trace routing by version tag for A/B error comparison → `signals/traces.md`, `boundaries/release.md` | ⚠️ pre/post-release profile comparison for performance regression detection; Parca/Pyroscope diff view → `signals/profiles.md`, `boundaries/release.md` | ⚠️ release cost delta: compare per-request cost across canary vs stable version → `signals/cost.md`, `boundaries/release.md` | ✅ release audit event: immutable record of who deployed what version when → `signals/audit.md`, `boundaries/release.md` | ❌ N/A | + +--- + +## 5. Caveats + +Rationale for the rarer ❌ / ⚠️ cells. Each entry justifies the marker and points to the authoritative file. + +- **C1. L3 × {SLO, release}** — N/A. SLO and release are L7 constructs (OpenSLO YAML, canary markers); L3 IP flow feeds infra health, not application error budgets. An L3 event (BGP leak, PMTUD black hole) that causes an SLO burn surfaces as an L7 error spike first — investigate via `resources/incident-forensics.md` 6-dim localization, not via L3 SLO policy. See `boundaries/slo.md`, `layers/L3-network.md`. +- **C2. L4 × {SLO, release}** — same rationale as C1. TCP connection success rate can serve as a fallback SLI when L7 instrumentation is absent, but it is not recommended. See `boundaries/slo.md §fallback SLI sources`. +- **C3. {L3, L4} × profiles** — profiling (Parca, Pyroscope, OTEP 0239) is process-level; L3 has no equivalent artifact. L4 × profiles ✅ reflects eBPF kernel-socket and off-CPU wait measurements — the closest meaningful artifact — attributed to `layers/L4-transport.md §eBPF`. +- **C4. {L3, L4} × traces (⚠️)** — W3C Trace Context lives in HTTP/gRPC headers, not IP/TCP. The ⚠️ reflects a correlation technique (log socket 5-tuple alongside trace ID) rather than native propagation. Native propagation begins at mesh or L7. See `resources/standards.md §W3C Trace Context`. +- **C5. mesh × profiles (⚠️)** — sidecar proxies (Envoy, Linkerd-proxy) are separate processes; their CPU overhead is visible via eBPF on the sidecar, but that is an L4 artifact, not a mesh-native profiling signal. Mesh exposes no profiling API. See `signals/profiles.md`, `layers/L4-transport.md §eBPF`. +- **C6. {L3, L4} × cost (⚠️)** — byte volume is a cost proxy sufficient for cloud egress billing but insufficient for unit economics, which requires L7 per-request attribution. See `signals/cost.md §egress attribution`. +- **C7. {L3, L4} × privacy (⚠️)** — IP addresses are personal data (GDPR Art. 4(1), PIPA equivalent). VPC flow and TCP connection logs are operationally required but must be masked, hashed, or pseudonymized before long-term retention. Masking technique (prefix truncation, HMAC+salt, pseudonymization) is in `signals/privacy.md §IP addresses`. +- **C8. 
{L3, L4} × {SLO, release} × audit** — audit requires an identity-bearing, time-bound subject; IP/TCP-layer events carry no such subject tied to an SLO policy or deployment action. Audit at SLO/release boundaries is always L7-originated. See `signals/audit.md`. + +--- + +## 6. Cell Count Verification + +| Layer | Boundaries | Signals | Cells | +|-------|-----------|---------|-------| +| L3-network | 4 | 7 | 28 | +| L4-transport | 4 | 7 | 28 | +| mesh | 4 | 7 | 28 | +| L7-application | 4 | 7 | 28 | +| **Total** | | | **112** | + +All 112 cells are populated. No cell is blank. + +--- + +## 7. Cross-references + +All files referenced in this matrix belong to the `oma-observability` skill tree. Files not yet written (Phase 1b/1c per the design document rollout plan) are forward references. + +| Category | Files | +|----------|-------| +| Layers | `layers/L3-network.md`, `layers/L4-transport.md`, `layers/mesh.md`, `layers/L7-application/web-rum.md`, `layers/L7-application/mobile-rum.md`, `layers/L7-application/crash-analytics.md` | +| Boundaries | `boundaries/multi-tenant.md`, `boundaries/cross-application.md`, `boundaries/slo.md`, `boundaries/release.md` | +| Signals | `signals/metrics.md`, `signals/logs.md`, `signals/traces.md`, `signals/profiles.md`, `signals/cost.md`, `signals/audit.md`, `signals/privacy.md` | +| Transport | `transport/sampling-recipes.md`, `transport/collector-topology.md` | +| Resources | `resources/standards.md`, `resources/incident-forensics.md`, `resources/observability-as-code.md`, `resources/meta-observability.md` | + +--- + +## 8. Review and Maintenance + +- **Review cadence**: quarterly, aligned with `resources/standards.md` version update cadence. +- **On semconv promotion** (Development → RC → Stable): re-evaluate ⚠️ cells that cite stability as a caveat; promote to ✅ if the semconv group is now stable. +- **On new layer or boundary addition**: this file requires a new table section and all cross-references updated. Signal columns do not change without a taxonomy revision in the design document. +- **On N/A re-evaluation**: add a Caveat entry (Section 5) explaining why a previously N/A combination is now meaningful before changing the cell marker. +- **Owner**: CTO direct review required for any change to this file (see design document Ownership & Quality Gates table). diff --git a/.agents/skills/oma-observability/resources/meta-observability.md b/.agents/skills/oma-observability/resources/meta-observability.md new file mode 100644 index 0000000..13274cb --- /dev/null +++ b/.agents/skills/oma-observability/resources/meta-observability.md @@ -0,0 +1,606 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +--- + +# Meta-Observability + +## Why Meta-Observability + +"Observing the observer" — if the OTel Collector is silently dropping 10% of traces, every SLO +dashboard, every alerting rule, and every incident forensics query is built on incomplete data. +You will not know the pipeline is degraded unless you instrument the pipeline itself. + +Other files in this skill assume a reliable telemetry pipeline. This file forces you to verify +that assumption. 
Four disciplines close the gap: + +| Discipline | Risk if ignored | Section | +|---|---|---| +| Pipeline self-health | Silent data loss, metric gaps, memory crashes | A | +| Clock skew / NTP | Waterfall charts lie; phantom race conditions chased | B | +| Cardinality guardrails | TSDB storage explosion, query latency, vendor bill spikes | C | +| Retention matrix | Compliance violations, over-spend on raw data, data unavailable at audit | D | + +Cross-cutting failure modes and recovery paths are in Section E. Alert/dashboard scaffolding +that feeds `resources/observability-as-code.md` is in Section F. + +--- + +## Section A: Pipeline Self-Health + +### A1. OTel Collector Self-Metrics + +The Collector exposes its own telemetry via a Prometheus scrape endpoint (default `:8888/metrics`). +Enable it explicitly: + +```yaml +# otelcol-config.yaml — telemetry block +service: + telemetry: + metrics: + level: detailed # normal | detailed | none + address: 0.0.0.0:8888 + logs: + level: info +``` + +Scrape this endpoint from a separate Prometheus instance (or a second Collector) so that +Collector failures do not destroy their own observability. + +### A2. Key otelcol_* Metrics + +| Metric | What it measures | Alert threshold | +|---|---|---| +| `otelcol_receiver_accepted_spans` | Spans accepted from upstream | Baseline drop > 10% | +| `otelcol_receiver_refused_spans` | Spans refused (parse failure, queue full) | > 0 sustained for 2m | +| `otelcol_exporter_sent_spans` | Spans successfully shipped to backend | Baseline drop > 10% | +| `otelcol_exporter_send_failed_spans` | Export failures (network, auth, rate-limit) | > 1% of sent for 5m | +| `otelcol_processor_queued_retry_send_queue_length` | Backpressure queue depth | > 80% of queue capacity | +| `otelcol_process_runtime_heap_alloc_bytes` | Collector heap usage | > 75% of container limit | +| `otelcol_processor_memory_limiter_refused_spans` | Forced drops due to memory limit | > 0 | +| `otelcol_processor_memory_limiter_refused_metric_points` | Same, for metrics | > 0 | +| `otelcol_processor_memory_limiter_refused_log_records` | Same, for logs | > 0 | + +The same `*_accepted_*`, `*_refused_*`, `*_sent_*`, and `*_send_failed_*` counter families exist +for metric points (`metric_points`) and log records (`log_records`). Apply identical thresholds. + +### A3. Memory Limiter Processor (Required) + +Add `memory_limiter` as the first processor in every pipeline to prevent OOM crashes: + +```yaml +processors: + memory_limiter: + check_interval: 1s + limit_percentage: 75 # hard limit: refuse new data above 75% heap + spike_limit_percentage: 20 # headroom for burst above limit_percentage + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch, ...] + exporters: [otlp] + metrics: + receivers: [otlp, prometheus] + processors: [memory_limiter, batch, ...] + exporters: [prometheusremotewrite] + logs: + receivers: [otlp, fluentforward] + processors: [memory_limiter, batch, ...] + exporters: [loki] +``` + +### A4. Self-Scrape Configuration + +Scrape Collector self-metrics into the same metrics pipeline so they land in the same TSDB +alongside application metrics: + +```yaml +receivers: + prometheus: + config: + scrape_configs: + - job_name: otelcol-self + scrape_interval: 15s + static_configs: + - targets: ["localhost:8888"] + metric_relabel_configs: + - source_labels: [__name__] + regex: otelcol_.* + action: keep +``` + +### A5. 
Agent Self-Resource Consumption + +| Agent | Typical RAM | Typical CPU | Source | +|---|---|---|---| +| Fluent Bit | 5–15 MB | < 1% (idle), < 5% (burst) | Fluent Bit 2.x benchmarks | +| OTel Collector (DaemonSet) | 30–100 MB | < 5% (moderate load) | VictoriaMetrics 2026 benchmark | +| OTel Collector (gateway) | 100–500 MB | scales with throughput | VictoriaMetrics 2026 benchmark | + +Collect host-level CPU and memory for agent processes via `hostmetrics` receiver: + +```yaml +receivers: + hostmetrics: + collection_interval: 30s + scrapers: + process: + include: + names: ["otelcol", "fluent-bit"] + match_type: regexp + mute_process_name_error: true +``` + +Fluent Bit exposes its own Prometheus metrics at `:2020/api/v1/metrics/prometheus`: + +- `fluentbit_input_records_total` — records ingested per input plugin +- `fluentbit_output_proc_records_total` — records successfully processed per output plugin +- `fluentbit_output_errors_total` — output failures + +### A6. Golden Signal: End-to-End Delivery Ratio + +The single most important pipeline health metric: + +```promql +# Delivery ratio for traces (1.0 = 100% delivered) +sum(rate(otelcol_exporter_sent_spans[5m])) + / +sum(rate(otelcol_receiver_accepted_spans[5m])) +``` + +Alert when this ratio drops below 0.99 for 5 minutes (see Section F, Alert 1). + +--- + +## Section B: Clock Skew & NTP Discipline + +### B1. Why Trace Waterfalls Lie + +Distributed traces use wall-clock timestamps from the node where each span is recorded. +If two nodes have diverging clocks, the waterfall view in any tracing backend will show: + +- A child span appearing to start before its parent span started. +- A child span ending after its parent span ended (`child.end_time > parent.end_time`). +- Negative durations on synthetic computed spans. + +These are not code bugs. They are clock-drift artifacts. Engineers chase phantom race conditions +or assume broken instrumentation, losing hours of incident investigation time. + +Typical NTP accuracy on well-connected cloud VMs: < 50 ms. On baremetal with good NTP: < 10 ms. +PTP (IEEE 1588) achieves sub-millisecond accuracy for financial and telco workloads. + +Reference: `resources/standards.md §Clock Discipline` for the span timestamp validation rule. + +### B2. NTP Requirements + +All host VMs, container hosts, and Kubernetes nodes MUST run a time synchronization daemon: + +- **Linux (systemd)**: `systemd-timesyncd` (lightweight) or `chrony` (recommended for accuracy) +- **Kubernetes**: Node time sync is the Linux host's responsibility. The kubelet and containers + inherit the host clock. The path is: hypervisor NTP → node OS → container runtime → container. + Confirm this chain with your cloud provider's documentation. + +Cloud-provider time sources: + +| Cloud | NTP endpoint | Notes | +|---|---|---| +| AWS | `169.254.169.123` (Amazon Time Sync Service) | PTP-backed, link-local; add to `chrony.conf`; verify with `chronyc sources -v` | +| GCP | `metadata.google.internal` | Internal hypervisor sync. `timedatectl show` only shows daemon state — use `chronyc sources -v` for actual offset | +| Azure | Hyper-V IC timesync primary; `time.windows.com` external NTP is fallback only | Azure Linux VMs with IC tools use host time. External NTP as a parallel peer can conflict; configure as fallback. Docs: <https://learn.microsoft.com/azure/virtual-machines/linux/time-sync> | + +For sub-millisecond requirements (financial trading, high-frequency event processing): +use **PTP (IEEE 1588)** with hardware timestamping. 
Cloud support varies — AWS supports PTP on +Nitro instances via the Amazon Time Sync Service; verify availability before committing. + +### B3. Span-Level Drift Detection + +Flag spans where the child-ends-after-parent invariant is violated: + +```promql +# PromQL: no native span-level metric; implement as a Collector transform rule +# In otelcol, use the transform processor to emit a counter on violation: +# counter: otelcol_span_clock_violation_total{service="...", direction="child_outlives_parent"} +``` + +OTel Collector `transform` processor example (append to spans pipeline): + +```yaml +processors: + transform/clock_check: + error_mode: ignore + trace_statements: + - context: span + statements: + - set(attributes["clock.violation"], true) + where IsRootSpan() == false + and end_time > parent_end_time # pseudo-field; requires OTTL extension +``` + +Until native OTTL parent-time access is stable, implement this check in your tracing backend's +query layer (e.g., Tempo TraceQL, Jaeger query API) as a periodic job that emits a metric. + +### B4. chrony Offset Metric + +Emit `node.clock.drift_ms` from each host using a cron job or node exporter textfile collector: + +```bash +#!/bin/sh +# /etc/cron.d/chrony-offset — runs every minute +OFFSET=$(chronyc tracking | awk '/System time/ {gsub(/[^0-9.-]/, "", $4); print $4 * 1000}') +echo "node_clock_drift_ms $OFFSET" > /var/lib/node_exporter/textfile/chrony_offset.prom +``` + +Alert rule (see Section F, Alert 3): + +```promql +node_clock_drift_ms > 100 +``` + +--- + +## Section C: Cardinality Guardrails + +### C1. Why Cardinality Matters + +Every unique label combination in a TSDB (Prometheus, VictoriaMetrics, Thanos, Mimir) creates a +separate time series. A metric with 3 labels each having 100 values creates 1,000,000 series. + +Consequences of label explosion: +- TSDB storage grows quadratically with cardinality. +- Query latency increases — each query fans out across more series. +- Vendor bill spikes — SaaS TSDBs charge per active series. +- Ingestion falls behind; scrape intervals are missed. + +### C2. Hard Rules + +| Label | Rule | Reason | +|---|---|---| +| `user.id` | **NEVER** as metric label | Unbounded; one series per user | +| `request.id` | **NEVER** as metric label | One series per request — instant explosion | +| `trace.id` | **NEVER** as metric label | One series per trace | +| `user.email` | **NEVER** as metric label | PII + cardinality double violation | +| `tenant.id` | Cap at top-N (e.g., top-1000); overflow → label value `"other"` | Bounded set of known tenants | +| `endpoint` / `route` | Normalize high-cardinality routes: `/users/42` → `/users/_` | URL parameters are unbounded | +| `http.url` | **NEVER** raw; use `http.route` (normalized) | Query strings are unbounded | +| `error.message` | **NEVER** as label; use error type/code only | Free-text strings are unbounded | + +Implement route normalization in the Collector `transform` processor: + +```yaml +processors: + transform/normalize_routes: + error_mode: ignore + metric_statements: + - context: datapoint + statements: + - replace_pattern(attributes["http.route"], "/[0-9]+", "/_") + - replace_pattern(attributes["http.route"], "/[0-9a-f-]{36}", "/_") # UUID +``` + +### C3. 
Cardinality Measurement + +Measure active series per metric name in Prometheus-compatible TSDBs: + +```promql +# Count distinct series for a specific metric +count(http_request_duration_seconds_bucket) by (job) + +# Top 10 highest-cardinality metrics in the entire TSDB +topk(10, count({__name__!=""}) by (__name__)) +``` + +VictoriaMetrics exposes `/api/v1/status/tsdb` for cardinality breakdown by metric name +and label value — use this for bulk auditing. + +OTel metric SDK cardinality limit (Development feature as of SDK 1.x): + +```python +# Python SDK example +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.view import View + +view = View( + instrument_name="http_request_duration", + attribute_keys={"http.method", "http.status_code", "http.route"}, # explicit allow-list +) +provider = MeterProvider(views=[view]) +``` + +Explicitly allow-listing attributes is the most effective cardinality control. + +### C4. Series Budget + +Set a per-service series budget. The 5,000-series figure below is an illustrative starting baseline — high-throughput services legitimately operate at 20k-100k series. Calibrate to your service's traffic shape and TSDB ingestion cost. + +```promql +# Alert: service approaching cardinality budget (example baseline: 5000 series/service) +count({job="my-service"}) > 4000 +``` + +Cardinality explosion is an anti-pattern. See `resources/anti-patterns.md §Section B` for +the full cardinality anti-pattern list. + +--- + +## Section D: Retention Matrix + +Retention policy governs how long raw and aggregated data is kept, at what resolution, +and under what storage class. Failing to set explicit retention leads to either: +- Over-spend: keeping full-resolution data for years. +- Compliance violation: flushing audit logs before mandatory retention periods expire. + +### D1. Unified Per-Signal Retention Policy + +| Signal | Raw resolution | Aggregated / downsampled | Archive | +|---|---|---|---| +| Metrics | 15d full-res | 90d @ 5m resolution | 2y @ 1h resolution | +| Logs (operational) | 7d | 30d | 90d | +| Logs (audit — SOC2/ISO 27001) | 90d | 1y | **7y WORM** | +| Traces (sampled, tail-based) | 30d | — | — | +| Traces (full 100% sample) | 3d | — | — | +| Profiles | 14d | — | — | +| Events | 30d | 90d | — | + +### D2. Rationale by Signal + +**Metrics (15d / 90d / 2y)**: Full-resolution metrics are needed for hourly/daily incident +investigation (15d covers most post-incident reviews). Downsampled 5m aggregates cover quarterly +business reviews and SLO trend reporting (90d). 1h aggregates for 2y support year-over-year +capacity planning without raw-data storage cost. + +**Logs (operational, 7d / 30d / 90d)**: Operational logs (app errors, access logs) are used +within days of generation. 30d aggregated (e.g., error-count rollups) covers sprint-level +incident retrospectives. 90d archive is a common compliance floor for operational data. + +**Logs (audit, 90d / 1y / 7y WORM)**: SOC2 Type II requires audit evidence covering the audit +period (typically 1y). ISO 27001 Annex A.12.4 requires log retention that satisfies legal and +regulatory requirements. GDPR Article 17 right-to-erasure does not apply to audit logs where +retention is required by another legal obligation (recital 65). 7-year WORM aligns with +financial audit requirements (e.g., SOX) and is the safe upper bound. +Cross-ref: `signals/audit.md` for WORM immutability requirements and hash-chain tamper evidence. 
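+
+As one concrete shape of the WORM requirement — a sketch, assuming the audit stream lands in S3
+(GCS Object Hold and Azure Immutable Blob are the equivalents; see `signals/audit.md`); the
+bucket name is illustrative:
+
+```yaml
+# CloudFormation sketch: S3 bucket with Object Lock enforcing 7-year WORM retention
+Resources:
+  AuditLogBucket:
+    Type: AWS::S3::Bucket
+    Properties:
+      BucketName: audit-logs-worm          # illustrative name
+      ObjectLockEnabled: true              # must be set at bucket creation time
+      VersioningConfiguration:
+        Status: Enabled                    # Object Lock requires versioning
+      ObjectLockConfiguration:
+        ObjectLockEnabled: Enabled
+        Rule:
+          DefaultRetention:
+            Mode: COMPLIANCE               # immutable until expiry, even for root
+            Years: 7
+```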
+ +**Traces (sampled, 30d)**: Sampled traces (tail-based, typically 1-10% of production traffic) +are the primary debugging artifact. 30d covers cross-sprint incident investigations. No +aggregation applies — traces are point-in-time artifacts, not time-series. + +**Traces (full 100% sample, 3d)**: 100% sampling is expensive. Keep for 3 days only — long +enough to cover a release rollout + initial stability period. Use head-based or tail-based +sampling to reduce to 1-10% thereafter. + +**Profiles (14d)**: Continuous profiling data (Parca, Pyroscope) is used for performance +regression detection immediately after a deploy and for on-call debugging. 14d covers +two sprint cycles. No aggregation standard exists for profiling data yet. + +**Events (30d / 90d)**: Structured events (deployment markers, feature flag changes, SLO +breaches) are retained for 30d raw (covers post-release review). Aggregated event counts +(deploys per day, flag toggles per week) are retained for 90d to support quarterly reviews. + +### D3. Storage Class Configuration Example (Loki) + +```yaml +# loki-config.yaml — example retention per stream selector +# NOTE: table_manager was removed in Loki 3.x. For Loki 3+, configure retention +# via compactor only. The block below applies to Loki 2.8-2.9 legacy. + +# Legacy (Loki 2.x): table_manager +table_manager: + retention_deletes_enabled: true + retention_period: 720h # default 30d + +# Per-stream retention via compactor (Loki 2.8+ and required on Loki 3.x) +compactor: + retention_enabled: true + retention_delete_delay: 2h + +# Stream-level policy in ruler or via label matchers: +# {log_type="audit"} → 61320h (7y) +# {log_type="operational"} → 2160h (90d) +# {log_type="debug"} → 168h (7d) +``` + +Cross-ref: `signals/audit.md` for WORM immutable storage configuration (S3 Object Lock, +GCS Object Hold, Azure Immutable Blob Storage). Cross-ref: `signals/privacy.md` for +GDPR Article 5(1)(e) storage limitation (data not kept longer than necessary for the purpose). + +--- + +## Section E: Pipeline Failure Modes + +| Failure mode | Symptom | Remediation | +|---|---|---| +| Upstream receiver overflow | `otelcol_receiver_refused_*` rising; `send_queue_length` climbing | Add `memory_limiter` processor; scale gateway Collector replicas horizontally | +| Exporter backpressure | `otelcol_processor_queued_retry_send_queue_length` rising; `send_failed_*` incrementing | Check vendor rate limits; tune exporter `retry_on_failure` with exponential backoff; reduce batch size | +| Clock drift spike | Waterfall inversion; `node_clock_drift_ms > 100` alert fires | Run `chronyc makestep` to force immediate re-sync; for persistent drift, check NTP source reachability; escalate to PTP for sub-ms requirements | +| Cardinality bomb | TSDB ingestion lag; query timeouts; vendor bill alert | Identify offending metric with `topk(10, count({__name__!=""}) by (__name__))`, add `filter` processor to drop or `transform` to remove offending label | +| Memory OOM crash | Collector process exits; gap in all signals | `memory_limiter` prevents this if configured; if reached anyway, increase container memory limit or reduce `limit_percentage` | +| Audit log retention violation | Audit records purged before WORM period | Verify S3/GCS Object Lock policy; confirm `retention_period` in log backend matches 7y; cross-ref `signals/audit.md §WORM` | + +### E1. 
Exporter Retry Configuration + +```yaml +exporters: + otlp: + endpoint: backend:4317 + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s # give up after 5 minutes; prevents infinite queue growth + sending_queue: + enabled: true + num_consumers: 10 + queue_size: 1000 +``` + +### E2. Filter Processor for Cardinality Drop + +```yaml +processors: + filter/drop_high_cardinality: + error_mode: ignore + metrics: + datapoint: + - 'attributes["user.id"] != nil' # drop any datapoint carrying user.id + - 'attributes["request.id"] != nil' # drop request.id label +``` + +Cross-ref: `resources/checklist.md §7 Recovery` for the full recovery procedure checklist. +Cross-ref: `resources/anti-patterns.md §Section C Pipeline` for pipeline anti-patterns. + +--- + +## Section F: Alerts & Dashboards + +### F1. Five Golden Meta-Observability Alerts + +These five alerts MUST be in place before any other observability alerting is considered +reliable. Without them, you cannot trust your alerts. + +**Alert 1 — Pipeline delivery ratio below threshold** + +```promql +# otelcol_pipeline_delivery_ratio — traces +( + sum(rate(otelcol_exporter_sent_spans[5m])) + / + sum(rate(otelcol_receiver_accepted_spans[5m])) +) < 0.99 +``` + +```yaml +# PrometheusRule CRD +- alert: OtelcolDeliveryRatioBelowThreshold + expr: | + ( + sum(rate(otelcol_exporter_sent_spans[5m])) + / + (sum(rate(otelcol_receiver_accepted_spans[5m])) > 0) + ) < 0.99 + for: 5m + labels: + severity: critical + annotations: + summary: "OTel Collector trace delivery ratio below 99%" + description: "{{ $value | humanizePercentage }} of accepted spans are being delivered. Check exporter errors and backpressure queue." +``` + +**Alert 2 — Exporter send failures above 1%** + +```promql +# Ratio of failed exports to sent (any signal) +( + sum(rate(otelcol_exporter_send_failed_spans[5m])) + / + (sum(rate(otelcol_exporter_sent_spans[5m])) > 0) +) > 0.01 +``` + +```yaml +- alert: OtelcolExporterSendFailed + expr: | + ( + sum(rate(otelcol_exporter_send_failed_spans[5m])) + / + (sum(rate(otelcol_exporter_sent_spans[5m])) > 0) + ) > 0.01 + for: 5m + labels: + severity: warning + annotations: + summary: "OTel Collector exporter failures above 1%" + description: "Check vendor rate limits, network connectivity, and exporter authentication." +``` + +**Alert 3 — Node clock drift above 100 ms** + +```yaml +- alert: NodeClockDriftHigh + expr: node_clock_drift_ms > 100 + for: 5m + labels: + severity: warning + annotations: + summary: "Node clock drift exceeds 100ms on {{ $labels.instance }}" + description: "Trace waterfall ordering may be unreliable. Run chronyc makestep or check NTP source." +``` + +**Alert 4 — Service approaching cardinality budget** + +```yaml +- alert: MetricCardinalityBudgetExceeded + expr: count({job=~".+"}) by (job) > 4000 + for: 10m + labels: + severity: warning + annotations: + summary: "{{ $labels.job }} exceeds 80% of 5000-series cardinality budget" + description: "Identify high-cardinality labels with topk(10, count({job='{{ $labels.job }}'}) by (__name__)). Apply attribute filter." 
+``` + +**Alert 5 — Audit log retention policy violation** + +This alert cannot be expressed in PromQL alone; implement it as a scheduled policy check in +your compliance tooling or as a metric emitted by a retention audit job: + +```yaml +- alert: AuditLogRetentionViolation + expr: audit_log_retention_days{log_type="audit"} < 2555 # 7 years = 2555 days + for: 1h + labels: + severity: critical + annotations: + summary: "Audit log retention is below the 7-year WORM requirement" + description: "Verify S3 Object Lock / GCS Object Hold policy on audit log bucket. Cross-ref signals/audit.md." +``` + +### F2. Grafana Dashboard Blueprint + +A meta-observability Grafana dashboard MUST include the following panels: + +| Panel | Query summary | Visualization | +|---|---|---| +| Pipeline delivery ratio (traces) | `sent / accepted` rate | Stat (green > 99%, red < 99%) | +| Pipeline delivery ratio (metrics) | same for metric_points | Stat | +| Pipeline delivery ratio (logs) | same for log_records | Stat | +| Receiver accepted vs refused | `rate(accepted)` and `rate(refused)` | Time series | +| Exporter sent vs failed | `rate(sent)` and `rate(failed)` | Time series | +| Queue depth | `send_queue_length` per exporter | Gauge | +| Collector heap usage | `heap_alloc_bytes` vs limit | Gauge | +| Node clock drift (all hosts) | `node_clock_drift_ms` | Heatmap by instance | +| Top cardinality metrics | `topk(10, count(...) by (__name__))` | Table (auto-refresh 5m) | +| Fluent Bit input vs output records | `fluentbit_input_records_total` vs `fluentbit_output_proc_records_total` | Time series | + +The full Jsonnet/YAML implementation of this dashboard belongs in `resources/observability-as-code.md` +(Grafana-as-code section). This section provides the blueprint; that file provides the code. + +--- + +## Cross-References + +| Topic | File | +|---|---| +| Clock discipline normative requirements | `resources/standards.md §Clock Discipline` | +| WORM immutable storage for audit logs | `signals/audit.md` | +| GDPR storage limitation (Art. 5(1)(e)) | `signals/privacy.md` | +| Cardinality anti-patterns (Section B) | `resources/anti-patterns.md` | +| Pipeline anti-patterns (Section C) | `resources/anti-patterns.md` | +| Recovery checklist (§7) | `resources/checklist.md` | +| Dashboard/alert as code | `resources/observability-as-code.md` | +| Two-tier Collector topology | `transport/collector-topology.md` | +| Tail-based sampling configuration | `transport/sampling-recipes.md` | +| SLO burn-rate alerts | `boundaries/slo.md` | +| Incident forensics (6-dimension localization) | `resources/incident-forensics.md` | + +--- + +## Review and Maintenance + +- **Review cadence**: quarterly, aligned with `resources/standards.md`. +- On OTel Collector minor version bump: verify `otelcol_*` metric names have not changed. + Metric names are stable within a major version but have changed between 0.x and 1.x. +- On cloud provider NTP endpoint changes: update Section B2 table. +- On TSDB migration (e.g., Prometheus → VictoriaMetrics): verify cardinality query syntax + in Section C3; VictoriaMetrics uses `/api/v1/status/tsdb` instead of PromQL cardinality queries. +- Owner: SysE primary (see design document Ownership & Quality Gates table). 
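+
+A quick way to run the `otelcol_*` rename check from the list above — a minimal
+sketch, assuming the Collector's internal telemetry stays on its default `:8888`
+Prometheus endpoint and `before.txt` was captured on the previous version:
+
+```bash
+# Snapshot the otelcol_* metric names exposed by the running Collector,
+# then diff against the snapshot taken before the version bump.
+curl -s http://localhost:8888/metrics \
+  | grep -oE '^otelcol_[a-z_]+' | sort -u > after.txt
+diff before.txt after.txt && echo "no metric renames"
+```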
diff --git a/.agents/skills/oma-observability/resources/observability-as-code.md b/.agents/skills/oma-observability/resources/observability-as-code.md
new file mode 100644
index 0000000..3dbfe12
--- /dev/null
+++ b/.agents/skills/oma-observability/resources/observability-as-code.md
@@ -0,0 +1,651 @@
+---
+cross_ref: "meta-observability.md §Section F for pipeline alerts; boundaries/slo.md §5,7 for SLO math"
+notes:
+  - "Versions: Grafonnet 11.x; OTel Operator v1beta1; Terraform Grafana provider ~> 3.0"
+---
+
+# Observability-as-Code
+
+## 1. Why Observability-as-Code
+
+UI-edited dashboards and alerts accumulate silent debt:
+
+- **No DR / no rollback.** A dashboard overwritten in the Grafana UI cannot be reverted without a backup.
+- **No peer review.** Alert thresholds edited live bypass the same change-management gates required for code.
+- **Scale problem.** 100+ dashboards across N environments managed by hand drift within weeks. Each environment develops its own undocumented fork.
+- **Compliance.** SOC 2 change-management controls require an audit trail for every change to detection and response configuration. Git history provides that trail; Grafana's internal change log does not. Cross-ref `signals/audit.md` for WORM immutability requirements.
+
+The non-negotiable principle: version every observability artifact — dashboards, alert rules, SLO definitions, collector config — in git, applied via CI/CD. This is design decision D9 in the design document.
+
+---
+
+## 2. Dashboards-as-Code
+
+### 2.1 Grafana Jsonnet / Grafonnet
+
+Source: <https://grafana.github.io/grafonnet/> (Grafonnet 11.x, replacing legacy grafonnet-lib)
+
+Grafonnet is a Jsonnet library that generates Grafana's JSON dashboard model declaratively. Dashboards are composed programmatically — panels, data sources, variables, and annotations are all typed functions.
+
+```jsonnet
+// dashboards/collector-health.jsonnet
+local grafonnet = import 'grafonnet/main.libsonnet';
+local dashboard = grafonnet.dashboard;
+local panel = grafonnet.panel;
+local prometheus = grafonnet.query.prometheus;
+
+dashboard.new('OTel Collector Health')
++ dashboard.withUid('otelcol-health-v1')
++ dashboard.withRefresh('30s')
++ dashboard.withPanels([
+  panel.timeSeries.new('Delivery Ratio — Traces')
+  + panel.timeSeries.queryOptions.withTargets([
+    prometheus.new(
+      '$datasource',
+      |||
+        sum(rate(otelcol_exporter_sent_spans[5m]))
+        / (sum(rate(otelcol_receiver_accepted_spans[5m])) > 0)
+      |||
+    ),
+  ]),
+])
+```
+
+Render to JSON and apply:
+
+```bash
+jsonnet -J vendor dashboards/collector-health.jsonnet > dist/collector-health.json
+# CI step: POST dist/collector-health.json to the Grafana HTTP API (/api/dashboards/db)
+```
+
+Lint Jsonnet in CI:
+
+```bash
+jsonnetfmt --test dashboards/*.jsonnet
+```
+
+### 2.2 Perses (CNCF Sandbox)
+
+Source: <https://perses.dev> — CNCF Sandbox (accepted 2023). YAML-first, vendor-neutral dashboard definition targeting a future CNCF standard for dashboards-as-code. Stricter schema than Grafana JSON; import from Grafana JSON is under development.
+ +```yaml +# dashboards/collector-health.yaml (Perses) +kind: Dashboard +metadata: + name: otelcol-health + project: observability +spec: + duration: 30m + panels: + delivery_ratio: + kind: TimeSeriesChart + spec: + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + query: > + sum(rate(otelcol_exporter_sent_spans[5m])) + / (sum(rate(otelcol_receiver_accepted_spans[5m])) > 0) +``` + +Use Perses when vendor-neutral YAML is a hard requirement (e.g., multi-backend environments). See Section 12 for current maturity notes. + +### 2.3 Datadog Terraform Provider + +```hcl +resource "datadog_dashboard_json" "collector_health" { + dashboard = file("${path.module}/dashboards/collector-health.json") +} +``` + +### 2.4 Honeycomb Terraform Provider + +```hcl +resource "honeycombio_board" "slo_burn_rate" { + name = "SLO Burn Rate" + description = "Multi-window burn-rate overview" +} +``` + +### 2.5 GitOps Workflow for Dashboards + +``` +git PR (dashboard Jsonnet / YAML) + → CI: jsonnet lint + render to JSON + → peer review + → merge to main + → CD (Argo CD / Flux) applies to Grafana via API or ConfigMap +``` + +--- + +## 3. Alerts-as-Code + +### 3.1 PrometheusRule CRD (Prometheus Operator) + +The `PrometheusRule` custom resource (group `monitoring.coreos.com/v1`) is the standard Kubernetes-native format for alert rules when running Prometheus Operator or kube-prometheus-stack. + +Full burn-rate example in Section 6. + +### 3.2 Alertmanager YAML + +Notification routing is versioned alongside alert rules: + +```yaml +# alertmanager-config.yaml +route: + group_by: [alertname, cluster, service] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + receiver: slack-critical + routes: + - matchers: + - severity="warning" + receiver: slack-warning + +receivers: + - name: slack-critical + slack_configs: + - api_url: "${SLACK_WEBHOOK_URL}" # injected from Secret; never hardcoded + channel: "#incidents" + title: "{{ .GroupLabels.alertname }}" + text: "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}" +``` + +### 3.3 Terraform Grafana Provider (Grafana Cloud Alerts) + +Source: <https://github.com/grafana/terraform-provider-grafana> + +```hcl +resource "grafana_rule_group" "slo_burn_rate" { + name = "slo-burn-rate" + folder_uid = grafana_folder.observability.uid + interval_seconds = 60 + + rule { + name = "SloBurnRateFast" + condition = "A" + + data { + ref_id = "A" + datasource_uid = var.prometheus_datasource_uid + model = jsonencode({ + expr = "(sum(rate(http_requests_total{status=~\"5..\"}[1h])) / sum(rate(http_requests_total[1h]))) / (1 - 0.999) > 14.4" + }) + } + + annotations = { summary = "Fast burn: SLO budget burning at 14.4x rate" } + labels = { severity = "critical" } + } +} +``` + +### 3.4 GitHub Actions Alert Pipeline + +```yaml +# .github/workflows/alerts.yaml +on: + push: + paths: ["alerts/**"] + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Check PrometheusRule YAML + run: promtool check rules alerts/*.yaml + - name: Lint Alertmanager config + run: amtool check-config alertmanager-config.yaml + apply: + needs: validate + if: github.ref == 'refs/heads/main' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Apply via kubectl + run: kubectl apply -f alerts/ +``` + +--- + +## 4. SLO-as-Code + +### 4.1 OpenSLO YAML + +Source: <https://openslo.com> — community-driven (not CNCF). Adopted by Sloth, Pyrra, and Nobl9. 
Cross-ref `boundaries/slo.md §5` for the full OpenSLO spec example. + +```yaml +apiVersion: openslo.com/v1 +kind: SLO +metadata: + name: checkout-availability +spec: + service: checkout + sloType: Request-Based + objectives: + - target: 0.999 + window: 28d + indicator: + spec: + ratioMetric: + good: + metricSource: + type: Prometheus + spec: + query: sum(rate(http_requests_total{service="checkout",status=~"2..|3.."}[{{.Window}}])) + total: + metricSource: + type: Prometheus + spec: + query: sum(rate(http_requests_total{service="checkout"}[{{.Window}}])) +``` + +### 4.2 Sloth: YAML to PrometheusRule + +Source: <https://sloth.dev> — generates multi-window burn-rate `PrometheusRule` CRDs from a concise YAML definition. + +```yaml +# sloth/checkout-slo.yaml +version: prometheus/v1 +service: checkout +slos: + - name: availability + objective: 99.9 + description: "Checkout HTTP availability" + sli: + events: + error_query: sum(rate(http_requests_total{service="checkout",status=~"5.."}[{{.window}}])) + total_query: sum(rate(http_requests_total{service="checkout"}[{{.window}}])) + alerting: + page_alert: + labels: { severity: critical } + ticket_alert: + labels: { severity: warning } +``` + +```bash +sloth generate -i sloth/checkout-slo.yaml | kubectl apply -f - +``` + +### 4.3 Pyrra: Kubernetes CRD Operator + +Source: <https://github.com/pyrra-dev/pyrra> — Kubernetes operator that reconciles `ServiceLevelObjective` CRDs directly into `PrometheusRule` + recording rules. + +```yaml +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + name: checkout-availability + namespace: monitoring +spec: + target: "99.9" + window: 28d + serviceMonitorSelector: {} + indicator: + http: + selector: + matchLabels: { job: checkout } + errorsSelector: + matchExpressions: + - { key: code, operator: In, values: ["5xx"] } +``` + +Pyrra reconciles this into recording rules and PrometheusRule alerts automatically — no separate generation step. + +--- + +## 5. OTel Collector-as-Code + +### 5.1 OTel Operator: OpenTelemetryCollector CRD + +Source: <https://github.com/open-telemetry/opentelemetry-operator> + +The `OpenTelemetryCollector` CRD (`v1beta1`, beta) is the Kubernetes-native way to manage collector config as code. Four `spec.mode` values map to the four deployment strategies — cross-ref `transport/collector-topology.md §1`: + +```yaml +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: otelcol-daemonset + namespace: observability +spec: + mode: daemonset # deployment | daemonset | statefulset | sidecar + image: otel/opentelemetry-collector-contrib:0.122.1 # pin to current contrib release; verify tag exists via docker pull before apply + config: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + hostmetrics: + collection_interval: 30s + scrapers: + cpu: {} + memory: {} + processors: + memory_limiter: + check_interval: 1s + limit_percentage: 75 + spike_limit_percentage: 20 + batch: {} + exporters: + otlp: + endpoint: otelcol-gateway:4317 + tls: + insecure: false + service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp] + metrics: + receivers: [otlp, hostmetrics] + processors: [memory_limiter, batch] + exporters: [otlp] +``` + +The `Instrumentation` CR enables auto-injection into application pods. Cross-ref `layers/mesh.md §OTel Operator Instrumentation CR`. 
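+
+A minimal `Instrumentation` CR sketch — the resource name, namespace, endpoint, and
+sampler ratio are illustrative assumptions; field names follow the operator's
+`v1alpha1` schema:
+
+```yaml
+apiVersion: opentelemetry.io/v1alpha1
+kind: Instrumentation
+metadata:
+  name: default-instrumentation
+  namespace: observability
+spec:
+  exporter:
+    endpoint: http://otelcol-daemonset-collector:4318  # assumed operator-generated Service name
+  propagators: [tracecontext, baggage]
+  sampler:
+    type: parentbased_traceidratio
+    argument: "0.25"
+```
+
+Workloads opt in per language with a pod annotation such as
+`instrumentation.opentelemetry.io/inject-java: "true"`.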
+ +### 5.2 Terraform: Helm Release or kubernetes_manifest + +```hcl +resource "helm_release" "otel_operator" { + name = "opentelemetry-operator" + repository = "https://open-telemetry.github.io/opentelemetry-helm-charts" + chart = "opentelemetry-operator" + namespace = "observability" + version = "0.78.0" # pin to current stable; verify via `helm search repo opentelemetry/opentelemetry-operator --versions` + + set { + name = "manager.collectorImage.repository" + value = "otel/opentelemetry-collector-contrib" + } +} + +resource "kubernetes_manifest" "otelcol_daemonset" { + manifest = yamldecode(file("${path.module}/manifests/otelcol-daemonset.yaml")) + depends_on = [helm_release.otel_operator] +} +``` + +### 5.3 CI Validation for Collector Config + +```bash +# otelcol validate — verifies config syntax and component availability +otelcol validate --config otelcol-config.yaml +``` + +Add to CI before any collector config change merges. + +--- + +## 6. Burn-Rate Multi-Window Alert (Concrete PromQL) + +Multi-window burn-rate alerting from `boundaries/slo.md §7`. Two alert pairs protect against fast budget exhaustion and slow invisible drain. + +- **Fast burn**: 2% budget consumed in 1h (14.4x rate), gated by a 5m short window to suppress transient spikes. +- **Slow burn**: 5% budget consumed in 6h (6x rate), gated by a 30m short window. + +Error budget denominator for 99.9% SLO: `1 - 0.999 = 0.001`. + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: checkout-slo-burn-rate + namespace: monitoring + labels: + prometheus: kube-prometheus + role: alert-rules +spec: + groups: + - name: checkout.slo.burn_rate + interval: 30s + rules: + # Fast burn — 1h / 5m windows + - alert: CheckoutSLOBurnRateFast + expr: | + ( + sum(rate(http_requests_total{service="checkout",status=~"5.."}[1h])) + / sum(rate(http_requests_total{service="checkout"}[1h])) + ) / (1 - 0.999) > 14.4 + and + ( + sum(rate(http_requests_total{service="checkout",status=~"5.."}[5m])) + / sum(rate(http_requests_total{service="checkout"}[5m])) + ) / (1 - 0.999) > 14.4 + for: 2m + labels: + severity: critical + slo: checkout-availability + annotations: + summary: "Checkout SLO fast burn: budget exhausting in < 1h" + description: > + Error rate is {{ $value | humanizePercentage }} of the error budget rate. + At this pace the 28-day budget is exhausted in under 1 hour. + runbook: "https://wiki.example.com/runbooks/checkout-slo-fast-burn" + + # Slow burn — 6h / 30m windows + - alert: CheckoutSLOBurnRateSlow + expr: | + ( + sum(rate(http_requests_total{service="checkout",status=~"5.."}[6h])) + / sum(rate(http_requests_total{service="checkout"}[6h])) + ) / (1 - 0.999) > 6 + and + ( + sum(rate(http_requests_total{service="checkout",status=~"5.."}[30m])) + / sum(rate(http_requests_total{service="checkout"}[30m])) + ) / (1 - 0.999) > 6 + for: 15m + labels: + severity: warning + slo: checkout-availability + annotations: + summary: "Checkout SLO slow burn: budget exhausting in < 5 days" + description: > + Error rate is {{ $value | humanizePercentage }} of the error budget rate. + At this pace the 28-day budget is exhausted in under 5 days. + runbook: "https://wiki.example.com/runbooks/checkout-slo-slow-burn" +``` + +--- + +## 7. 
GitOps Integration + +### 7.1 Argo CD + +Source: <https://argoproj.github.io/cd/> + +Use the app-of-apps pattern to manage the observability stack as a first-class application: + +``` +observability-root (App of Apps) +├── prometheus-stack (App) +├── otel-operator (App) +├── otelcol-daemonset (App) ← OpenTelemetryCollector CRD +├── otelcol-gateway (App) +├── dashboards (App) ← Grafonnet rendered JSON in ConfigMap +└── alert-rules (App) ← PrometheusRule CRDs +``` + +Argo CD reconciles git state to the cluster on every commit. Drift is detected and surfaced in the Argo CD UI. + +### 7.2 Flux (CNCF Graduated) + +Source: <https://fluxcd.io> + +Flux `Kustomization` resources watch git paths and apply them to the cluster. Flagger (a Flux ecosystem project) handles progressive delivery gates — cross-ref `boundaries/release.md` for canary/blue-green integration. + +```yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: observability-alerts + namespace: flux-system +spec: + interval: 5m + path: ./alerts + prune: true + sourceRef: + kind: GitRepository + name: observability-repo +``` + +### 7.3 Self-Observability of the GitOps Stack + +The observability stack is itself observed via meta-observability. Cross-ref `meta-observability.md §Section A` for the pipeline self-health metrics that must remain green to trust any downstream alert or dashboard. Argo CD and Flux both expose Prometheus metrics; scrape them with the same DaemonSet collector. + +--- + +## 8. Immutable Review Workflow + +All observability changes flow through the same review gate as application code: + +| Step | Tool | What is checked | +|------|------|----------------| +| Open PR | GitHub / GitLab | Diff shows intent | +| CI: PromQL / rule lint | `promtool check rules` | PrometheusRule validity | +| CI: Collector config lint | `otelcol validate` | Collector YAML syntax and component graph | +| CI: Jsonnet lint | `jsonnetfmt --test` | Grafonnet formatting | +| CI: Dashboard lint | Grafana dashboard linter | Panel best practices (no orphan panels, datasource vars set) | +| Peer review | GitHub CODEOWNERS | At least one SRE approval | +| Merge → main | CI gate | All checks must pass | +| CD apply | Argo CD / Flux | Reconciles to cluster; Argo CD marks Synced | + +Audit trail: git history is the authoritative change log. Cross-ref `signals/audit.md` for the WORM immutability requirement on audit logs generated from this pipeline. + +--- + +## 9. Secrets Management for Observability Stack + +Backend API keys, Grafana service account tokens, and OAuth client secrets MUST NOT be committed to git — even in encrypted form inside dashboard YAML. + +| Pattern | Tool | Notes | +|---------|------|-------| +| Kubernetes secrets from Vault | External Secrets Operator | Syncs Vault path → Kubernetes Secret | +| Encrypted secrets in git | Sealed Secrets (Bitnami) | `SealedSecret` CRD; encrypted with cluster public key | +| Cloud-native secret store | AWS Secrets Manager, GCP Secret Manager | Pull via ESO provider | + +Reference secrets in `OpenTelemetryCollector` CRD and Terraform via `secretKeyRef`, never inline: + +```yaml +env: + - name: GRAFANA_API_TOKEN + valueFrom: + secretKeyRef: + name: grafana-credentials + key: api-token +``` + +--- + +## 10. 
Environment Separation + +| Concern | Approach | +|---------|----------| +| Separate namespaces | `observability-dev`, `observability-staging`, `observability-prod` | +| Dashboard parameterization | Grafana template variable `$env` filters all queries; built into Grafonnet at render time | +| Kustomize overlays | Base config + `overlays/dev`, `overlays/prod` for resource limits, replica counts, retention | +| Helm values | `values-dev.yaml`, `values-prod.yaml` per environment | + +Never duplicate dashboard JSON per environment. Parameterize via variables; let the environment label filter do the work. + +--- + +## 11. Testing Observability-as-Code + +| Test type | Method | Tooling | +|-----------|--------|---------| +| Dashboard preview | Import rendered JSON into ephemeral Grafana via API; visual check | `grafana-cli`, Grafana HTTP API | +| Alert firing validation | Inject synthetic metric spike (set error counter to trigger threshold); confirm alert fires and routes | `promtool test rules` with `rule_test` fixtures | +| Collector config unit test | Feed sample OTLP telemetry fixtures through `otelcol` in test mode | `otelcol_test` package; `file` exporter to assert output | + +```yaml +# promtool rule test — alerts/checkout_slo_test.yaml +rule_files: + - checkout-slo-burn-rate.yaml + +tests: + - interval: 1m + input_series: + - series: 'http_requests_total{service="checkout",status="500"}' + values: "0+100x10" # 100 errors/min for 10 minutes + - series: 'http_requests_total{service="checkout",status="200"}' + values: "0+900x10" # 900 success/min + alert_rule_test: + - eval_time: 10m + alertname: CheckoutSLOBurnRateFast + exp_alerts: + - exp_labels: + severity: critical + slo: checkout-availability +``` + +--- + +## 12. Perses CNCF Sandbox Specifics + +Source: <https://perses.dev> | CNCF Sandbox status as of 2026-Q2. + +- YAML schema is stricter and more opinionated than Grafana JSON; dashboards are more portable but migration from Grafana JSON requires manual mapping (import tooling is under active development). +- Perses CLI (`percli`) provides `apply`, `get`, `delete`, and `lint` subcommands mirroring `kubectl`. +- Use Perses when: the organization requires a vendor-neutral CNCF-blessed format, or is evaluating a Grafana alternative. +- Do not use Perses as the sole dashboard-as-code solution in production yet unless the team accepts the current schema instability risk. Track promotion to CNCF Incubating as the stability signal. + +--- + +## 13. Matrix Coverage Note + +Observability-as-code is cross-cutting: it covers every row in `matrix.md` (all layers, all boundaries, all signals). It has no dedicated matrix row. Think of it as the delivery mechanism for everything else in the skill — the how, not the what. + +--- + +## 14. 
Anti-Patterns + +Append to `anti-patterns.md §Section I: As-Code and GitOps`: + +| # | Anti-pattern | Consequence | Fix | +|---|-------------|-------------|-----| +| I-1 | Dashboards edited directly in Grafana UI | No rollback; peer review bypassed; drift from git within hours | All edits via PR; Grafana provisioning blocks UI edits (`allowUiUpdates: false`) | +| I-2 | Alert rules without automated tests | False alarms or silent failures reach production undetected | Add `promtool test rules` fixtures to CI for every new alert | +| I-3 | Secrets (API tokens, DSNs) committed in dashboard YAML or `values.yaml` | Credentials exposed in git history permanently | External Secrets Operator or Sealed Secrets; never inline | +| I-4 | No CI validation for PrometheusRule or Collector config | Invalid YAML / broken PromQL deployed to production; alerts silently stop firing | `promtool check rules` + `otelcol validate` in CI gate | +| I-5 | Environment-specific dashboards duplicated instead of parameterized | N copies of the same dashboard diverge; maintenance multiplied by N | Template variable `$env`; single parameterized source of truth | + +--- + +## Cross-References + +| Topic | File | +|-------|------| +| SLO math and error budget formula | `boundaries/slo.md §4` | +| OpenSLO YAML full example | `boundaries/slo.md §5` | +| Burn-rate multi-window alert math | `boundaries/slo.md §7` | +| OTel Operator four deployment modes | `transport/collector-topology.md §1` | +| Instrumentation CR for auto-injection | `layers/mesh.md §OTel Operator Instrumentation CR` | +| Pipeline self-health alerts (meta) | `meta-observability.md §Section F` | +| WORM immutability for audit trail | `signals/audit.md` | +| Progressive delivery (Flagger) | `boundaries/release.md` | +| Anti-patterns full list | `anti-patterns.md` | + +--- + +## Primary Sources + +- Grafonnet 11.x: <https://grafana.github.io/grafonnet/> +- Perses: <https://perses.dev> +- PrometheusRule CRD: <https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/> +- Terraform Grafana provider: <https://github.com/grafana/terraform-provider-grafana> +- OTel Operator: <https://github.com/open-telemetry/opentelemetry-operator> +- Argo CD: <https://argoproj.github.io/cd/> +- Flux: <https://fluxcd.io> +- OpenSLO: <https://openslo.com> +- Sloth: <https://sloth.dev> +- Pyrra: <https://github.com/pyrra-dev/pyrra> diff --git a/.agents/skills/oma-observability/resources/signals/audit.md b/.agents/skills/oma-observability/resources/signals/audit.md new file mode 100644 index 0000000..cfacf4b --- /dev/null +++ b/.agents/skills/oma-observability/resources/signals/audit.md @@ -0,0 +1,296 @@ +--- +otel_semconv: "1.27.0 (2024-11); security.* namespace: Development/proposed" +tools: + - "Falco: v0.38+ (CNCF Graduated); rekor: v1.x (sigstore)" +--- + +# Audit Signal Reference + +## 1. Scope & Distinction + +Audit trails answer **who did what when** — immutable evidence for compliance, legal hold, and forensic investigation. They are a distinct signal from both operational logs (`logs.md`) and privacy records (`privacy.md`). + +### Mutability model: the deliberate opposite of privacy + +| Dimension | Audit | Privacy | +|------------------|----------------------------------------|----------------------------------------| +| Retention goal | Append-only, immutable, 7y+ minimum | Collect less, delete on request (GDPR Art. 
17) | +| Storage model | WORM — cannot be deleted or modified | Erasable — must honour right to erasure | +| Consumers | Auditors, legal, compliance officers | Data subjects, DPO, engineering | +| Default posture | Keep everything, forever | Keep nothing, unless justified | + +This is why audit and privacy are separate files in this skill (design decision D5). Merging them would create contradictory retention requirements in a single data store. + +### Distinction from operational logs (`logs.md`) + +| Dimension | Operational logs | Audit events | +|-----------------|-------------------------------------|----------------------------------------| +| Retention | 7–90 days hot | 7 years, tiered (Section 7) | +| Consumers | On-call engineers, SRE | Auditors, legal, compliance | +| Mutability | May be rotated and purged | WORM — immutable after write | +| Storage | Loki / Elasticsearch / ClickHouse | WORM object store + compliance appliance | +| Primary tools | Fluent Bit, OTel Collector | Falco, auditd, pgaudit, audit pipeline | + +Cross-ref `../meta-observability.md §Retention Matrix` for unified policy across all signals. + +--- + +## 2. Regulatory Drivers + +| Framework | Relevant controls | Audit requirement | +|-----------|-------------------|-------------------| +| **SOC 2 (Type I/II)** | CC7.2 monitoring activities; CC7.3 incident response | Immutable audit trail; tamper evidence; access review | +| **ISO/IEC 27001:2022** | A.8.15 logging; A.8.16 monitoring; A.5.25 audit logging | Log protection, log administrator access control | +| **ISO/IEC 27002:2022** | 8.15 logging controls | Protect logs from tampering and unauthorized access | +| **HIPAA Security Rule** | §164.312(b) audit controls | Audit logs retained ≥ 6 years | +| **PCI DSS v4.0** | Requirement 10 — track and monitor all access to cardholder data | 1 year online + offline retention; tamper detection | +| **GDPR Art. 30** | Records of processing activities | Audit of data processing operations | + +Cross-ref `../standards.md §ISO/IEC 27001/27002` for the normative standards baseline. + +Sources: [iso.org/standard/27001](https://www.iso.org/standard/27001) · [aicpa.org SOC 2](https://www.aicpa.org/soc2) · [hhs.gov HIPAA](https://www.hhs.gov/hipaa/for-professionals/security/index.html) · [pcisecuritystandards.org PCI DSS v4.0](https://www.pcisecuritystandards.org) + +--- + +## 3. Audit Event Categories + +| Category | Examples | Regulatory driver | +|----------|----------|-------------------| +| **Authentication** | Login success/failure, MFA challenge, password change, session create/destroy | SOC 2 CC7.2, ISO A.8.15 | +| **Authorization** | Permission grants, role changes, scope escalation, RBAC mutations | SOC 2 CC7.2, PCI DSS 10.2 | +| **Data access** | Read/create/update/delete of regulated data (PII, CHD, PHI) | HIPAA §164.312(b), PCI DSS 10.2 | +| **Administrative** | Config changes, user provisioning, API key lifecycle, certificate rotation | ISO A.5.25, SOC 2 CC7.3 | +| **Security events** | IDS/IPS alerts, anomalous kernel syscalls (Falco), policy violations | SOC 2 CC7.3, ISO A.8.16 | +| **System events** | Kernel audit (Linux auditd), container lifecycle events | ISO A.8.15; cross-ref `logs.md §OS-level log sources` | + +--- + +## 4. Required Attributes per Audit Event + +Every audit event MUST carry these fields. Map to OTel `security.*` semconv namespace (Development/proposed). 
+ +| Attribute | Type | Description | +|-----------|------|-------------| +| `user.id` | string | Authenticated identity — pseudonymized or hashed; never plain email (PII) | +| `actor.type` | enum | `user` \| `service_account` \| `system` | +| `action` | string | Verb: `read`, `write`, `delete`, `approve`, `login`, `logout` | +| `resource.type` | string | What was acted on: `order`, `user_record`, `api_key`, `rbac_role` | +| `resource.id` | string | Identifier of the affected resource | +| `event.outcome` | enum | `success` \| `failure` \| `denied` | +| `ip.address` | string | Source IP — may be PII; cross-ref `privacy.md §IP addresses` | +| `timestamp` | ISO 8601 UTC | Event time; required for chain ordering (Section 6) | +| `trace_id` | 32-char hex | Correlation with operational trace; cross-ref `../incident-forensics.md` | + +```json +{ + "timestamp": "2026-04-21T09:15:32.847Z", + "actor.type": "user", + "user.id": "sha256:a3f8c2...", + "action": "delete", + "resource.type": "user_record", + "resource.id": "rec_9f2k1", + "event.outcome": "success", + "ip.address": "203.0.113.42", + "trace_id": "4bf92f3577b34da6a3ce929d0e0e4736", + "service.name": "user-management-api", + "deployment.environment": "production" +} +``` + +--- + +## 5. Immutable WORM Storage (Write-Once-Read-Many) + +WORM storage is required for SOC 2 CC7.2, HIPAA §164.312(b), and PCI DSS Requirement 10. + +| Platform | Mechanism | Key constraint | +|----------|-----------|----------------| +| **AWS S3** | Object Lock — Compliance mode | Cannot delete even by root or AWS support | +| **GCS** | Retention Policy with locked bucket | Lock is irreversible once applied | +| **Azure Blob** | Immutability policy (time-based retention) | Legal hold override available | +| **On-premises** | WORM tape, compliance appliances | Hardware write-protect; chain of custody required | + +S3 Object Lock Compliance mode policy example: + +```json +{ + "ObjectLockConfiguration": { + "ObjectLockEnabled": "Enabled", + "Rule": { + "DefaultRetention": { + "Mode": "COMPLIANCE", + "Years": 7 + } + } + } +} +``` + +`COMPLIANCE` mode prevents deletion by any principal, including the AWS account root. Do not use `GOVERNANCE` mode for regulatory audit logs — it allows privileged override. + +--- + +## 6. Tamper Evidence (Cryptographic Integrity) + +WORM prevents deletion; tamper evidence proves no records were silently modified or omitted. + +**Hash chain:** each event record includes `prev_hash` — the SHA-256 of the previous event. Any insertion, deletion, or modification breaks the chain. + +``` +event_N.hash = SHA256(event_N.payload + event_{N-1}.hash) +``` + +**Merkle root anchoring:** daily, compute the Merkle root of all events in the period. Anchor this root to an external transparency log — detached from the audit system — so the anchoring timestamp is independently verifiable. + +| Tool | Type | Notes | +|------|------|-------| +| **rekor** (sigstore) | OSS transparency log | CNCF project; append-only; publicly verifiable | +| **AWS QLDB** | Managed ledger DB | Cryptographic journal; AWS-proprietary | +| **IBM LinuxONE** | Hardware-backed ledger | Enterprise on-prem | + +Source: [github.com/sigstore/rekor](https://github.com/sigstore/rekor) + +**Verification schedule:** run chain integrity check weekly (automated), and before any compliance audit. Verification failure is a CRITICAL security event — trigger incident response. + +--- + +## 7. 
7-Year Retention Policy + +Automated tiering via object lifecycle policy satisfies all regulatory minimums (HIPAA 6y, PCI 1y, SOC 2 varies) with a single 7-year baseline. + +| Tier | Duration | Storage | Cost tier | +|------|----------|---------|-----------| +| Hot (recent, searchable) | 90 days | Elasticsearch / Loki | $$$ | +| Warm (searchable, compressed) | 1 year | ClickHouse / archive index | $$ | +| Cold (WORM, immutable) | 7 years | S3 Glacier Deep Archive (Object Lock Compliance) | $ | + +**Legal hold:** when an active investigation or litigation hold is active, suppress automated expiry for all affected records regardless of tier. Implement via S3 Object Lock legal hold or equivalent. + +**Lifecycle policy:** apply S3 Lifecycle rules to transition objects automatically. Compliance mode Object Lock must be set at write time — it cannot be applied retroactively. + +--- + +## 8. Falco Integration (CNCF Graduated) + +Falco provides runtime security detection: anomalous kernel syscalls, container escapes, file access violations, and Kubernetes API abuse. + +Source: [falco.org](https://falco.org) + +**Deployment:** Falco DaemonSet on every node. Output JSON events → audit pipeline. + +Custom Falco rule example (detect secret access in production namespace): + +```yaml +- rule: K8s Secret Access in Production + desc: Detect any read of a Kubernetes secret in the production namespace + condition: > + ka.verb=get and ka.target.resource=secrets + and ka.target.namespace=production + and not ka.user.name startswith "system:" + output: > + Secret accessed (user=%ka.user.name secret=%ka.target.name + ns=%ka.target.namespace ua=%ka.user-agent) + priority: WARNING + source: k8saudit + tags: [audit, pci_dss, hipaa] +``` + +**Pipeline integration:** Falco JSON output → `filelogreceiver` (OTel Collector) → audit logs pipeline → WORM cold tier. Apply the same hash chain enrichment as application audit events before writing. + +--- + +## 9. Kubernetes Audit Logs + +Kubernetes API server audit logs record every API call — essential for cluster-admin accountability, secret access auditing, and RBAC change tracking. + +| Audit level | Records | Use case | +|-------------|---------|----------| +| `None` | Nothing | Exclude noisy, low-value paths | +| `Metadata` | Method, URL, user, timestamp | Default for most resources | +| `Request` | + request body | Sensitive mutations (RBAC, secrets) | +| `RequestResponse` | + response body | High-value targets (cluster-admin actions) | + +Feed Kubernetes audit logs into the audit pipeline separately from operational logs. Tag with `source: k8s_apiserver` for routing. + +--- + +## 10. Database Audit Logs + +| Database | Extension / feature | Notes | +|----------|---------------------|-------| +| PostgreSQL | `pgaudit` extension | Log SELECT, DDL, DML per role | +| MySQL | Enterprise Audit plugin | JSON output; filter by user/schema | +| MongoDB | Audit system (Enterprise) | Filter by action type and collection | +| AWS RDS | Built-in audit logging | Enable via parameter group | +| GCP Cloud SQL | `cloudsql.enable_pgaudit` | Same pgaudit under the hood | +| Azure SQL | Unified Audit Log | Native, sends to Storage/Log Analytics | + +Database audit events feed the same audit pipeline as application events. Normalize to the attribute schema in Section 4 before storage. + +--- + +## 11. Access Control on Audit Data + +Audit data must be readable by auditors and legal — and not writable by anyone after initial ingest. 
+ +- Separate RBAC role: `audit-reader` is distinct from `operations-engineer` and `developer` +- Auditors have read-only access, scoped to approved time ranges +- No single engineer can both produce audit events and modify audit storage +- Cross-ref `privacy.md §Backend RBAC` for principle of least privilege patterns + +--- + +## 12. Cross-Signal Correlation + +Every audit event carries `trace_id`. During an incident: + +1. Start from the audit event (who did what) +2. Pivot via `trace_id` to the operational trace (what the system did) +3. Join with logs (`logs.md §Trace ID Injection Rules`) for execution detail +4. Reference `../incident-forensics.md` for the full MRA playbook + +Audit events are the authoritative source of truth for incident timelines — operational traces provide the execution context. + +--- + +## 13. Matrix Coverage: Audit Column + +Cells from `../matrix.md` owned by this file: + +| Layer | Boundary | Status | Notes | +|-------|----------|--------|-------| +| L3-network | any | ✅ | VPC flow audit trail — who connected to what | +| L4-transport | any | ⚠️ | Limited: connection-level only, no payload | +| L7-application | multi-tenant | ✅ | Per-tenant audit trail; supports GDPR data subject right of access | +| release | any | ✅ | Deployment audit: who deployed what when (actor, SHA, timestamp) | +| privacy | audit | overlap | See `privacy.md`; audit records data processing; privacy governs erasure | + +--- + +## 14. Anti-patterns + +Append candidates for `../anti-patterns.md §Section F Security & Compliance`: + +| ID | Anti-pattern | Fix | +|----|--------------|-----| +| A-AU1 | Mutable audit storage (not WORM) | Apply S3 Object Lock Compliance mode or equivalent at bucket creation | +| A-AU2 | No tamper evidence (no hash chain or notary) | Implement hash chain per Section 6; anchor Merkle root to rekor | +| A-AU3 | PII in audit events without redaction | Hash or pseudonymize `user.id`; evaluate `ip.address` per `privacy.md` | +| A-AU4 | Shared RBAC for operations and audit readers | Create separate `audit-reader` role; enforce in IaC | +| A-AU5 | Retention below regulatory minimum | Use 7-year baseline; apply lifecycle policy automatically | +| A-AU6 | Kubernetes audit logs routed to operational log store | Separate pipeline: K8s audit → WORM cold tier | + +--- + +## 15. References + +1. ISO/IEC 27001:2022 — <https://www.iso.org/standard/27001> +2. ISO/IEC 27002:2022 — <https://www.iso.org/standard/27002> +3. AICPA SOC 2 Common Criteria — <https://www.aicpa.org/soc2> +4. HIPAA Security Rule §164.312(b) — <https://www.hhs.gov/hipaa/for-professionals/security/index.html> +5. PCI DSS v4.0 Requirement 10 — <https://www.pcisecuritystandards.org> +6. GDPR Art. 30 — <https://gdpr-info.eu/art-30-gdpr/> +7. Falco — <https://falco.org> +8. sigstore rekor — <https://github.com/sigstore/rekor> +9. OTel security semconv (proposed) — <https://opentelemetry.io/docs/specs/semconv/attributes-registry/security/> +10. 
`../standards.md` · `../matrix.md` · `../meta-observability.md` · `../incident-forensics.md` · `privacy.md` · `logs.md` diff --git a/.agents/skills/oma-observability/resources/signals/cost.md b/.agents/skills/oma-observability/resources/signals/cost.md new file mode 100644 index 0000000..2e30373 --- /dev/null +++ b/.agents/skills/oma-observability/resources/signals/cost.md @@ -0,0 +1,313 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +specs: + - "FOCUS Spec: 1.0 (FinOps Open Cost and Usage Specification)" +notes: + - "OpenCost: CNCF Incubating (advanced 2024-10-31)" +--- + +# Cost Signal + +## 1. Scope & Boundary + +Cost is the 6th signal in the `oma-observability` skill (design decision D4: cost as 1st-class signal, standalone). It sits alongside metrics, logs, traces, profiles, audit, and privacy in the 4 × 4 × 7 coverage matrix. + +**In scope:** +- Cost as a telemetry signal: metric surface (OpenCost Prometheus exposition), attribution rules, and unit economics +- Kubernetes-native cost allocation via OpenCost (CNCF Incubating) +- Per-tenant cost attribution (B2B SaaS) and per-feature cost deltas +- LLM/AI token-cost observability at the span attribute level +- Cross-cloud egress cost attribution +- Retention policy and access-control rules for cost data +- Cardinality rules specific to cost labels + +**Out of scope:** +- Full FinOps strategy: procurement, Reserved Instances / Savings Plans, commitment analysis, contract negotiation — see [FinOps Foundation framework](https://www.finops.org/framework/) +- Full LLM observability (prompt versioning, evals, model routing economics) — see Langfuse, Arize Phoenix, LangSmith +- Cluster cost optimization (rightsizing, spot orchestration) — see Kubecost commercial tier or CloudZero + +**Distinct boundary with `../boundaries/multi-tenant.md §Cost Attribution`:** that file uses cost as a _routing signal_ (tenant chargeback, showback, residency). This file defines cost as a _telemetry signal_ — what to collect, how to attribute it, and how to store it. Cross-reference is mandatory. + +--- + +## 2. Why Cost Is a 1st-Class Signal + +Cost differs from metrics, logs, and traces in three structural ways: + +| Dimension | Metrics / Logs / Traces | Cost | +|-----------|------------------------|------| +| Data source | Instrumented applications + infra agents | Cloud billing APIs + Kubernetes allocation engine | +| Primary consumer | Engineering, SRE | Finance, Platform Engineering | +| Retention requirement | 30–90 days typical | 2 years (financial audit compliance) | +| Granularity | Per-request, sub-second | Per-hour or per-day billing aggregates | + +Unit economics is the motivation: understanding `$ per request`, `$ per tenant`, and `$ per feature` requires correlating cloud billing data with application-level telemetry. Neither data source alone is sufficient. + +FinOps Foundation identifies cost visibility as a foundational practice across three phases: Inform → Optimize → Operate. The Inform phase is entirely an observability problem — cost data must be collected, attributed, and surfaced before optimization can occur. + +--- + +## 3. OpenCost (CNCF Incubating) + +Sources: +- <https://www.opencost.io/> +- <https://www.cncf.io/blog/2024/10/31/opencost-advances-to-cncf-incubator/> +- <https://github.com/opencost/opencost> + +OpenCost is the open-source Kubernetes cost monitoring standard. It advanced to CNCF Incubating status on 2024-10-31. The OSS core is maintained by IBM Kubecost (post-acquisition) and the community. 
Kubecost's commercial tier adds UI, recommendations, and multi-cluster federation on top of the OpenCost core. + +### 3.1 Architecture + +``` +Cloud Billing API (AWS CUR / GCP Billing Export / Azure Cost Management) + | + v ++----------------+ /metrics (Prometheus format) +| OpenCost Pod | -----> Prometheus scrape -----> OTel Collector prometheusreceiver +| (namespace: | | +| opencost) | v ++----------------+ Your TSDB backend + | (Thanos / Mimir / VictoriaMetrics) + v +Kubernetes API (node prices, pod labels, PVC claims) +``` + +The OpenCost pod reads node price data from a cloud-provider price configmap (or the cloud billing API directly), reads Kubernetes resource allocations from the kubelet and API server, and exposes Prometheus metrics at port 9003. + +### 3.2 Key Metrics Exposed + +| Metric | Type | Unit | Description | +|--------|------|------|-------------| +| `opencost_namespace_cost_total` | Counter | USD | Cumulative cost per Kubernetes namespace | +| `opencost_workload_cost_total` | Counter | USD | Cumulative cost per Deployment / StatefulSet | +| `opencost_cpu_cost` | Gauge | USD/hr | CPU allocation cost, current window | +| `opencost_ram_cost` | Gauge | USD/hr | Memory allocation cost, current window | +| `opencost_network_cost` | Gauge | USD/hr | Network egress cost attribution | +| `opencost_storage_cost` | Gauge | USD/hr | PVC / persistent storage cost | +| `opencost_load_balancer_cost` | Gauge | USD/hr | Cloud load balancer cost | +| `opencost_cluster_management_cost` | Gauge | USD/hr | Managed control plane fee (GKE, EKS, AKS) | + +### 3.3 Scrape Configuration (OTel Collector) + +```yaml +receivers: + prometheus: + config: + scrape_configs: + - job_name: opencost + scrape_interval: 60s + static_configs: + - targets: ["opencost.opencost.svc.cluster.local:9003"] +``` + +### 3.4 Allocation Model + +Three allocation categories: compute (CPU + RAM), storage (PVC), and network (egress bytes). Each workload receives a share of node cost proportional to resource requests. Idle cost distributes across active workloads or a dedicated `__idle__` namespace via configurable rules. Cross-cloud pricing is supplied via a price configmap; on-prem environments set custom `$/CPU-hr` and `$/GB-RAM-hr` values. + +--- + +## 4. Cost Attribution by Dimension + +| Dimension | Kubernetes mechanism | Label / attribute | +|-----------|---------------------|-------------------| +| Namespace / team | Namespace label → team ownership mapping | `namespace`, `team` | +| Workload | Deployment / StatefulSet / DaemonSet name | `workload`, `controller_kind` | +| Tenant (B2B SaaS) | `tenant.id` label on pods + W3C Baggage propagation | `tenant_id` | +| Feature | Custom pod label (`feature.name`) | `feature_name` | +| Per-request (LLM) | `gen_ai.cost.total_usd` span attribute | trace attribute, not metric label | + +**Tenant attribution** is the primary B2B SaaS use case: tag every pod with `tenant.id` at deploy time and OpenCost aggregates cost by that label automatically. Cross-ref `../boundaries/multi-tenant.md §Cost Attribution` for chargeback and showback patterns. **Feature attribution** uses a custom pod label (`feature.name`); cross-ref `../boundaries/release.md` for A/B cost comparison. **Per-request attribution** for LLM workloads uses span attributes only — writing cost to a metric label at request granularity causes a cardinality explosion (see Section 10). + +--- + +## 5. 
Unit Economics
+
+Three primary PromQL formulas for cost-to-business-metric conversion:
+
+**Per-request cost (rolling 1-hour window):**
+
+```promql
+sum(increase(opencost_workload_cost_total{workload="checkout-api"}[1h]))
+ /
+sum(increase(http_requests_total{job="checkout-api"}[1h]))
+```
+
+Both terms use `increase()` over the same window, so the division yields USD per request
+directly; mixing `rate()` with a seconds-to-hours factor inflates the result by the
+window length.
+
+**Per-tenant cost (daily, aggregated):**
+
+```promql
+sum by (tenant_id) (
+  increase(opencost_workload_cost_total[24h])
+  * on(namespace) group_left(tenant_id)
+  kube_namespace_labels
+)
+```
+
+> **Dependency**: the query joins against `kube_namespace_labels` from kube-state-metrics. `kube-state-metrics` by default does NOT expose custom namespace labels (only a small allowlist). To surface `tenant_id`, run kube-state-metrics with `--metric-labels-allowlist=namespaces=[tenant_id,tenant_tier]` (or via Helm `metricLabelsAllowlist`). Note that kube-state-metrics prefixes exposed labels (`label_tenant_id`); the query above assumes a relabel back to `tenant_id`. Without the allowlist flag, the query silently returns empty — a common production pitfall.
+
+**Per-namespace cost (current rate, USD/hr):**
+
+```promql
+sum by (namespace) (
+  opencost_cpu_cost + opencost_ram_cost + opencost_storage_cost + opencost_network_cost
+)
+```
+
+**SLO + cost trade-off (gold tier vs silver tier delta):**
+
+```promql
+# Gold tier: high-replica, low-latency p99
+sum(opencost_workload_cost_total{workload=~".*-gold"}) by (tenant_id)
+ /
+sum(opencost_workload_cost_total{workload=~".*-silver"}) by (tenant_id)
+```
+
+This ratio surfaces the cost multiplier of tiered SLO guarantees and feeds tier-pricing decisions.
+
+---
+
+## 6. LLM / AI Cost Observability
+
+Full LLM observability is out of scope for this skill. The tools for that domain are Langfuse, Arize Phoenix, LangSmith, and Braintrust (see `SKILL.md §When NOT to use`). This section covers only the intersection where LLM cost surfaces as a span attribute in the OTel pipeline.
+
+**Token-based pricing attributes** (OTel `gen_ai.*` semconv, currently Experimental tier per `../standards.md §Semconv Stability`):
+
+| Span attribute | Description |
+|----------------|-------------|
+| `gen_ai.usage.input_tokens` | Prompt tokens consumed |
+| `gen_ai.usage.output_tokens` | Completion tokens generated |
+| `gen_ai.cost.total_usd` | Computed cost in USD for the span (vendor-specific extension) |
+
+**Threshold-based sampling rule:** if `gen_ai.cost.total_usd > 0.50`, always retain the span regardless of sampling rate. This prevents silent budget overruns from being dropped before they reach the backend. Cross-ref `../transport/sampling-recipes.md §Cost-aware sampling` for the tail-sampler rule configuration.
+
+Note: `gen_ai.*` semconv is Experimental as of semconv 1.27.0. Do not use these attributes as inputs to production SLOs until the group reaches Stable or RC.
+
+---
+
+## 7. Cross-Cloud Cost
+
+Each cloud exposes billing data through a distinct API and export format. FOCUS (FinOps Open Cost and Usage Specification) is the data-format unification effort.
+ +| Cloud | Billing data source | Export format | Ingestion path | +|-------|--------------------|----|------| +| AWS | Cost & Usage Report (CUR) | Parquet / CSV → S3 | Athena query → OpenCost price configmap | +| GCP | Billing Export | BigQuery table | BigQuery export job → OpenCost | +| Azure | Cost Management Export | CSV → Blob Storage | Storage export → OpenCost | +| On-prem | Manual price configmap | Custom `$/CPU-hr`, `$/GB-hr` | OpenCost configmap | + +**FOCUS Spec** (<https://github.com/FinOps-Open-Cost-and-Usage-Spec/FOCUS_Spec>) provides a vendor-neutral column schema (`BilledCost`, `ResourceId`, `ServiceName`) for cross-cloud cost joins. When FOCUS exports are available from a cloud provider, prefer them over native formats. For the OSS multi-cluster path, run one OpenCost instance per cluster and aggregate in Thanos or Mimir by namespace and workload labels. + +--- + +## 8. Retention Policy + +| Data type | Retention | Rationale | +|-----------|-----------|-----------| +| Raw cloud billing data (CUR / GCP export) | 2 years | Financial audit compliance (SOX, ISO 27001) | +| OpenCost Prometheus metrics (hourly resolution) | 2 years | Chargeback evidence per tenant | +| Aggregated per-tenant / per-feature cost | Long-term (cold storage) | Historical unit economics trending | +| Per-request span cost attributes (`gen_ai.cost.total_usd`) | 30–90 days | Operational debugging window | + +Cross-ref `../meta-observability.md §Retention Matrix` for the full retention policy table covering all seven signals. + +--- + +## 9. Privacy & Access Control + +Cost per tenant reveals revenue tier, contract value, and resource consumption patterns. Restrict access by role: + +**RBAC separation:** + +| Role | Dashboard access | Data access | +|------|-----------------|-------------| +| Finance | Full cost by tenant, contract | Billing export (read-only) | +| Platform Engineering | Cost by namespace, workload, cluster | OpenCost metrics (read-only) | +| Application Engineering | Cost by their own service / namespace | Filtered by namespace label | +| Tenant admin (self-serve) | Their own tenant cost only | Filtered by `tenant_id` claim | + +Cross-ref `privacy.md §Backend RBAC` for the Grafana RBAC configuration pattern and OPA policy rules for dashboard-level tenant isolation. + +--- + +## 10. Anti-Cardinality Rules for Cost Labels + +Follow the same principles as metric labels (cross-ref `../meta-observability.md §Cardinality Guardrails`). Cost workloads add two additional risk surfaces: per-request LLM spans and dynamic tenant growth. + +| Rule | Applies to | Rationale | +|------|-----------|-----------| +| `tenant.id` allowed as metric label with top-N cap (≤ 1000) | OpenCost metrics | Bounded tenant count; use `"other"` bucket for overflow | +| `namespace` and `workload` always allowed | OpenCost metrics | Bounded by cluster size | +| `gen_ai.cost.total_usd` as span attribute only — never as metric label | LLM spans | One series per request = cardinality explosion | +| `feature.name` allowed as pod label; allowed as metric label with cap | OpenCost workload attribution | Feature set is bounded; gate new features through label allowlist | +| `request.id` never as metric label for cost | Any | Unbounded cardinality; use trace attribute only | + +Per-request cost labels are safe only as trace span attributes, where cardinality is handled by the trace backend (not a TSDB). Aggregate by `tenant.id`, `namespace`, and `workload` for metric surfaces. + +--- + +## 11. Vendors + +As of 2026-Q2. 
Verify current status at <https://landscape.cncf.io>. + +Cross-ref `../vendor-categories.md §FinOps / Cost` for full selection guidance. + +| Vendor | Type | Notes | +|--------|------|-------| +| OpenCost | OSS, CNCF Incubating | Kubernetes-native; Prometheus exposition; community-maintained | +| Kubecost | Commercial (IBM) | OpenCost OSS core + commercial UI, multi-cluster, recommendations | +| CloudZero | Commercial SaaS | Engineering-focused cost attribution; no Kubernetes agent required | +| Vantage | Commercial SaaS | Cross-cloud cost reporting; FOCUS spec early adopter | + +--- + +## 12. Matrix Cells — Cost Column + +Quick navigation for cost-column cells in `../matrix.md`: + +| Layer | Boundary | Status | Detail | +|-------|----------|--------|--------| +| L3-network | multi-tenant | ⚠️ | Egress byte attribution by tenant VPC; cost proxy, not unit economics | +| L3-network | cross-application | ⚠️ | Cross-VPC egress cost attribution; unreliable without flow tagging | +| L4-transport | multi-tenant | ⚠️ | L4 byte volume per tenant as cost proxy; rolls up into compute | +| L4-transport | cross-application | ⚠️ | Cross-application byte volume; correlate with L7 for accuracy | +| mesh | multi-tenant | ⚠️ | Request-level cost attribution by tenant via mesh telemetry; feeds OpenCost | +| mesh | cross-application | ✅ | Per-service byte/request counts from Envoy; unit cost cross-service | +| mesh | release | ⚠️ | Canary cost delta via per-version request counts in mesh telemetry | +| L7-application | multi-tenant | ✅ | Primary use case: per-tenant OpenCost unit economics with `tenant.id` pod label | +| L7-application | cross-application | ✅ | Per-service unit cost model; `gen_ai.cost.total_usd` span attribute for LLM | +| L7-application | slo | ✅ | Gold-tier vs silver-tier cost delta; cost trade-off for SLO tier selection | +| L7-application | release | ✅ | Canary cost delta: compare per-request cost across canary vs stable version | + +L3 and L4 cost cells are ⚠️ because they produce a cost proxy (egress bytes) that informs FinOps egress billing but is insufficient for full unit-economics modeling. See `../matrix.md §C6` for the detailed caveat. + +--- + +## 13. 
Anti-Patterns + +Candidates for `../anti-patterns.md §Section B Cardinality & Cost`: + +| Anti-pattern | Impact | Correction | +|-------------|--------|------------| +| Cost label at per-request metric granularity | Cardinality explosion in TSDB; OOM on ingestor | Use `gen_ai.cost.total_usd` as span attribute; aggregate cost metrics by tenant/namespace/workload | +| No `tenant.id` pod label | Cannot attribute cost to tenants; chargeback impossible | Apply `tenant.id` at deploy time via Helm values or admission webhook | +| Cost dashboard with public access | Reveals tenant revenue tier and contract value | Apply Grafana RBAC; finance and engineering views are separate; no public embedding | +| Ignoring egress cost in FinOps | Often the largest surprise cost in multi-cloud or CDN-heavy architectures | Include `opencost_network_cost` in all cost dashboards; set egress budget alerts | +| LLM spans not tail-sampled on cost threshold | Silent budget blowup: expensive spans are dropped before alerting | Configure tail-sampler rule: if `gen_ai.cost.total_usd > 0.50`, always retain | +| One metric name per tenant for cost | Bypasses TSDB cardinality controls; cannot be aggregated | Use `opencost_workload_cost_total{tenant_id="acme"}` with top-N cap | + +--- + +## Cross-References + +| Topic | File | +|-------|------| +| Tenant cost chargeback and routing | `../boundaries/multi-tenant.md §Cost Attribution` | +| Canary cost delta and A/B comparison | `../boundaries/release.md` | +| Tail-sampler cost-aware rules | `../transport/sampling-recipes.md §Cost-aware sampling` | +| Cardinality guardrails and top-N cap | `../meta-observability.md §Cardinality Guardrails` | +| Retention matrix (all signals) | `../meta-observability.md §Retention Matrix` | +| Cost dashboard RBAC | `signals/privacy.md §Backend RBAC` | +| Vendor selection (FinOps category) | `../vendor-categories.md §FinOps / Cost` | +| Full 112-cell matrix with cost column | `../matrix.md` | +| OpenCost metric surface (metrics signal) | `signals/metrics.md §8 OpenCost Metric Surface` | +| LLM / gen_ai semconv stability | `../standards.md §3 OTel Semconv Stability Tiers` | diff --git a/.agents/skills/oma-observability/resources/signals/logs.md b/.agents/skills/oma-observability/resources/signals/logs.md new file mode 100644 index 0000000..4294d77 --- /dev/null +++ b/.agents/skills/oma-observability/resources/signals/logs.md @@ -0,0 +1,263 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +notes: + - "Fluent Bit: v3.x (CNCF Graduated); OTel Collector: v0.100+" +--- + +# Logs Signal Reference + +## 1. Scope + +Logs are the "L" in MELT+P. Covers: OTel LogRecord data model, structured logging, events-as-logs, systemd journal, and pipeline options. + +**Normative base:** `../standards.md`. **Deprecated:** Fluentd (CNCF 2025-10) — use Fluent Bit or OTel Collector. + +--- + +## 2. OTel LogRecord Data Model + +Source: <https://opentelemetry.io/docs/specs/otel/logs/data-model/> + +| Field | Type | Description | +|---|---|---| +| `Timestamp` | uint64 (ns) | When the event occurred; 0 if unknown | +| `ObservedTimestamp` | uint64 (ns) | When the collector observed the record; always set | +| `SeverityText` | string | e.g. 
`"ERROR"` — original string from source | +| `SeverityNumber` | int (1–24) | Normalized OTel severity | +| `Body` | any | String, map, or primitive | +| `Attributes` | map[string]any | Semconv-stable keys preferred | +| `TraceId` | 16-byte hex | W3C trace ID; empty if not in a trace | +| `SpanId` | 8-byte hex | W3C span ID; empty if not in a span | +| `Resource` | map[string]any | `service.name`, `host.name`, etc. | + +**Severity scale:** TRACE 1–4 · DEBUG 5–8 · INFO 9–12 · WARN 13–16 · ERROR 17–20 · FATAL 21–24 + +**Export:** OTLP gRPC `:4317` or HTTP `:4318/v1/logs`. Configure the SDK's `LogRecordExporter` to point at the OTel Collector. + +--- + +## 3. Structured Logging + +JSON over text: queryable, typed, machine-readable. No grok patterns. + +```json +{ + "timestamp": "2026-04-21T09:15:32.847Z", + "level": "ERROR", + "message": "payment gateway timeout after 3 retries", + "trace_id": "4bf92f3577b34da6a3ce929d0e0e4736", + "span_id": "00f067aa0ba902b7", + "service.name": "checkout-api", + "service.version": "1.4.2", + "deployment.environment": "production", + "error.type": "GatewayTimeoutError", + "retry.count": 3 +} +``` + +**Required fields on every log record:** + +| Field | Format | Why | +|---|---|---| +| `timestamp` | ISO 8601 UTC (`Z`) | Clock-ordered queries; cross-ref `../standards.md §Clock Discipline` | +| `level` | OTel SeverityText | Filtering and sampling | +| `message` | human-readable string | On-call readability | +| `trace_id` + `span_id` | 32/16-char hex | Join with traces — critical for incident forensics | +| `service.name`, `service.version` | semconv stable | Release comparison | +| `deployment.environment` | `production` / `staging` | Noise isolation | + +**Metric correlation:** use `service.name`, `cloud.region`, `k8s.namespace.name` as shared labels across logs and metrics. Cross-ref `../incident-forensics.md §MRA` for the full mandatory resource attribute list. + +--- + +## 4. Events as LogRecords + +OTel spec v1+ folded events into LogRecord: set `event.name` + `event.*` attributes. Source: <https://opentelemetry.io/docs/specs/otel/logs/event-api/> + +```json +{ + "timestamp": "2026-04-21T09:15:33.001Z", + "level": "INFO", + "message": "order placed", + "event.name": "order.placed", + "order.id": "ord_8f3k2", + "trace_id": "4bf92f3577b34da6a3ce929d0e0e4736", + "span_id": "00f067aa0ba902b7", + "service.name": "order-service", + "deployment.environment": "production" +} +``` + +Use cases: `feature_flag.evaluated`, `deployment.completed`, `order.placed`, `user.signup`. + +**Distinction from metrics:** events are discrete occurrences; metrics are aggregated rates. +**Distinction from logs:** events have a declared `event.name` and attribute contract; logs are often free-form. + +--- + +## 5. Systemd Journal Integration + +On most Linux cloud VMs, systemd-journald is the primary log sink. Map via `journaldreceiver` (stable). 
+ +```yaml +# otelcol-config.yaml +receivers: + journald: + directory: /var/log/journal + units: [kubelet.service, containerd.service] + priority: info +processors: + resource: + attributes: + - { key: host.name, from_attribute: _HOSTNAME, action: upsert } + - { key: service.name, from_attribute: _SYSTEMD_UNIT, action: upsert } +exporters: + otlp: + endpoint: "otel-gateway.observability.svc:4317" +service: + pipelines: + logs: + receivers: [journald] + processors: [resource] + exporters: [otlp] +``` + +Source: <https://github.com/open-telemetry/opentelemetry-collector-contrib/receiver/journaldreceiver> + +**journald field mapping:** `MESSAGE` → `Body` · `PRIORITY` → `SeverityNumber` · `_HOSTNAME` → `Resource["host.name"]` · `_SYSTEMD_UNIT` → `Resource["service.name"]` · `_PID` → `Attributes["process.pid"]` + +**Kubernetes log sources:** +- kubelet → container stdout/stderr → runtime log files → `filelogreceiver` (`/var/log/pods/*/*/*.log`) +- Kubernetes events → `k8seventsreceiver` (OTel Collector contrib) + +--- + +## 6. Collector and Agent Options + +Cross-ref `../vendor-categories.md §Log Pipeline Collection` (category h). + +| Tool | Runtime | RAM | CNCF | Notes | +|---|---|---|---|---| +| **Fluent Bit** | C/Rust | 5–15 MB | Graduated | Edge DaemonSet; native OTLP output | +| **OTel Collector** | Go | 30–100 MB | Incubating | Unified MELT; gateway aggregation | +| **Vector** | Rust | 20–60 MB | Datadog OSS | Log + metric pipeline with transforms | +| **Cribl Stream** | — | 100+ MB | Commercial | Advanced routing | +| ~~Fluentd~~ | Ruby | 100+ MB | **Deprecated** | CNCF 2025-10 | + +**2026 best practice:** Fluent Bit DaemonSet (edge) → OTLP → OTel Collector gateway (enrichment, PII redaction, routing) → backends. + +--- + +## 7. Trace ID Injection Rules + +**EVERY log line from an instrumented service MUST carry `trace_id` + `span_id`.** + +Without them, log-trace join fails during incident forensics, adding 15–30 min to MTTR. + +**Injection mechanisms:** + +| Language | Library | Method | +|---|---|---| +| Python | `structlog` | Context processor extracting OTel span context | +| Java | Logback / Log4j2 | `OpenTelemetryAppender` MDC integration | +| Node.js | `pino` | `pino-opentelemetry-transport` | +| Go | `zap` / `slog` | Manual `trace.SpanFromContext` extraction | + +**Verification:** assert in integration tests that the `trace_id` in a log record matches bytes 1–16 of the inbound `traceparent` header. Cross-ref `../standards.md §W3C Trace Context`. + +--- + +## 8. Log Severity and Sampling + +| Severity | Production policy | +|---|---| +| TRACE / DEBUG | 1% sample | +| INFO | 10–20% for high-volume services | +| WARN / ERROR / FATAL | 100% retained | + +Apply sampling at the collection tier (Fluent Bit throttle filter or OTel Collector `probabilistic_sampler`), not at the application tier. + +**Tail sampling for logs:** retain 100% of records whose `trace_id` is associated with an error trace. Cross-ref `../transport/sampling-recipes.md` for the OTel Collector tail-sampling processor config. + +--- + +## 9. Log Retention and Compliance + +| Category | Retention | Storage | +|---|---|---| +| Operational (INFO/DEBUG) | 7–30 days | Hot | +| WARN / ERROR | 90 days | Hot or warm | +| Audit logs | 7 years | WORM object storage | + +**PII redaction** MUST occur at ingestion (Fluent Bit or OTel Collector `redaction` processor), not at storage time. Cross-ref `privacy.md §PII redaction pipeline`. 
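+
+A minimal placement sketch for the gateway Collector (the processor name and the email pattern are illustrative, and this assumes a Collector version whose `redaction` processor supports log pipelines; the full layered pattern lives in `privacy.md §6`):
+
+```yaml
+processors:
+  redaction/logs:
+    allow_all_keys: true   # keys pass through; values are screened
+    blocked_values:
+      - "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b"  # email
+service:
+  pipelines:
+    logs:
+      receivers: [otlp]
+      processors: [redaction/logs, batch]
+      exporters: [otlp/backend]
+```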
+ +Cross-ref `../meta-observability.md §Retention Matrix` for unified policy across all signals. + +--- + +## 10. OS-Level Log Sources + +| Source | Collection | Notes | +|---|---|---| +| systemd-journald | `journaldreceiver` | Primary on Linux; all systemd units | +| syslog (legacy) | `syslogreceiver` | Facilities: kern, daemon, auth | +| kernel dmesg | journald (forwarded automatically) | OOM kills, NIC errors | +| Linux auditd | `filelogreceiver` on `/var/log/audit/audit.log` | Cross-ref `audit.md §auditd` | + +--- + +## 11. Backends + +Cross-ref `../vendor-categories.md` for category (e) SIEM / Enterprise Logs and (a) OSS Full-Stack. + +| Backend | Query model | Notes | +|---|---|---| +| **Loki** (Grafana Labs) | LogQL; label-indexed | Chunks in object storage; cloud-native OSS | +| **Elasticsearch** | Lucene / EQL | Full-text search; large existing footprints | +| **OpenSearch** (AWS fork) | Lucene / SQL | No Elastic licensing | +| **ClickHouse** | SQL; columnar | Rising default for high-volume structured log analytics; used by SigNoz | + +--- + +## 12. Matrix Coverage: Logs Column + +Cells from `../matrix.md` owned by this file: + +| Layer | Boundary | Status | Artifact | +|---|---|---|---| +| L3-network | multi-tenant | ✅ | VPC flow log stream tagged by tenant CIDR | +| L4-transport | multi-tenant | ✅ | TCP connection events per tenant socket namespace | +| mesh | multi-tenant | ✅ | Envoy access logs with baggage-derived tenant tag | +| mesh | cross-application | ✅ | Envoy access logs; `trace_id` correlation | +| mesh | slo | ⚠️ | Burn-rate event source; metric-based SLI preferred | +| mesh | release | ✅ | Deployment event with `service.version` tag | +| L7-application | multi-tenant | ✅ | `tenant.id` on every record; OTel Collector `routing_connector` | +| L7-application | cross-application | ✅ | `trace_id` + `span_id` enables log-trace join across services | +| L7-application | slo | ⚠️ | Log-based SLI valid fallback; metric-based preferred | +| L7-application | release | ✅ | Deployment SHA, version, rollout strategy as structured event | + +--- + +## 13. Anti-patterns + +Append candidates for `../anti-patterns.md §Logs`: + +| ID | Anti-pattern | Fix | +|---|---|---| +| A-L1 | Free-form log text without JSON | Adopt structured JSON; no grok | +| A-L2 | `trace_id` missing in logs | Inject via SDK hook (Section 7); enforce in CI | +| A-L3 | `user.email` in log body | PII violation — redact at collection tier; cross-ref `privacy.md` | +| A-L4 | Unique message IDs as log labels | Cardinality explosion in Loki — use attributes, not labels | +| A-L5 | Fluentd as new deployment in 2026+ | Deprecated — use Fluent Bit or OTel Collector | + +--- + +## 14. References + +1. OTel log data model — <https://opentelemetry.io/docs/specs/otel/logs/data-model/> +2. OTel event API — <https://opentelemetry.io/docs/specs/otel/logs/event-api/> +3. journaldreceiver — <https://github.com/open-telemetry/opentelemetry-collector-contrib/receiver/journaldreceiver> +4. CNCF Fluentd migration guide — <https://cncf.io/blog/2025/10/01/fluentd-to-fluent-bit-migration-guide> +5. Fluent Bit docs — <https://docs.fluentbit.io> +6. 
`../standards.md` · `../matrix.md` · `../incident-forensics.md §MRA` · `../meta-observability.md §Retention Matrix` · `../transport/sampling-recipes.md` · `../vendor-categories.md` diff --git a/.agents/skills/oma-observability/resources/signals/metrics.md b/.agents/skills/oma-observability/resources/signals/metrics.md new file mode 100644 index 0000000..47c5232 --- /dev/null +++ b/.agents/skills/oma-observability/resources/signals/metrics.md @@ -0,0 +1,474 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +notes: + - "OpenMetrics: IETF RFC 9416 (2023-08)" +--- + +# Metrics Signal + +## 1. Scope + +Metrics are the "M" in MELT+P (Metrics, Events, Logs, Traces + Profiles). + +This file covers: +- OTel metric instrument types and their sync/async variants +- Prometheus exposition format and OpenMetrics (RFC 9416) formalization +- SLI computation patterns (Golden Signals, RED, USE) and healthcheck integration +- `hostmetrics` receiver for host-level collection +- Kubernetes metric sources +- Application metric integration via OTel SDK +- OpenCost metric surface for cost attribution +- Cardinality budget enforcement +- Long-term storage backend options + +Out of scope: +- Full SLO framework, error budget policy, and burn-rate alert rules: see `../boundaries/slo.md` +- Full FinOps unit economics and cost allocation model: see `cost.md` +- Cardinality guardrail detail and pipeline remediation: see `../meta-observability.md §Section C` + +--- + +## 2. OTel Metric Instrument Types (Stable) + +Source: <https://opentelemetry.io/docs/specs/otel/metrics/api/> + +Five instrument types are stable in the OTel API/SDK 1.x. Choose based on measurement semantics. + +### 2.1 Counter + +Monotonic, cumulative-only. Value never decreases. Measures cumulative totals. + +``` +Use for: requests_total, errors_total, bytes_sent_total +Never use for: values that can decrease (use UpDownCounter) +``` + +Sync variant: `counter.Add(ctx, delta, attrs...)` — caller increments explicitly. +Async variant: `ObservableCounter` — callback provides absolute cumulative value. + +### 2.2 UpDownCounter + +Monotonic: no. Can increment or decrement. Measures quantities that fluctuate. + +``` +Use for: queue_length, active_connections, in_flight_requests +Never use for: strictly increasing values (use Counter) +``` + +Sync variant: `upDownCounter.Add(ctx, delta, attrs...)` where delta is positive or negative. +Async variant: `ObservableUpDownCounter` — callback provides current value. + +### 2.3 Gauge + +Point-in-time observation of a current value. No accumulation. + +``` +Use for: cpu_usage, memory_used_bytes, temperature_celsius, cache_hit_ratio +``` + +Sync variant: `gauge.Record(ctx, value, attrs...)`. +Async variant: `ObservableGauge` — preferred for values read from an external source (OS, hardware sensor). + +### 2.4 Histogram + +Distribution of values. Captures bucket counts, sum, and count. Supports quantile computation via `histogram_quantile` at query time. + +``` +Use for: request_duration_seconds, response_size_bytes, db_query_duration_seconds +Prefer over Summary for all new instrumentation +``` + +Sync variant only: `histogram.Record(ctx, value, attrs...)`. +Configure explicit bucket boundaries per instrument to control cardinality. Default OTel SDK buckets: `[0, 5, 10, 25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000]` (milliseconds — override for seconds-based metrics). + +### 2.5 Summary (not recommended for new code) + +Client-side quantile computation. 
Produces quantile values at SDK level.
+
+Limitations:
+- Quantiles are computed per-process; they cannot be aggregated across replicas.
+- `p99` from three replicas cannot be merged into a fleet-level `p99`.
+- Use Histogram + `histogram_quantile` instead for any distributed system.
+
+When to keep existing Summary metrics: only if the downstream consumer cannot be migrated and the measurement is single-process.
+
+### 2.6 Temporality: Delta vs Cumulative
+
+OTel OTLP exporter default: `cumulative`. `delta` (each export contains only the measurements since the last collection) is an opt-in preference that some vendor backends request.
+Prometheus wire format requires: `cumulative` (monotonic, ever-increasing values).
+
+Pin the preference explicitly via environment variable rather than relying on defaults:
+
+```
+OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=cumulative
+```
+
+Or configure per-instrument in the OTLP exporter. If an SDK exports delta and Prometheus is the backend, convert in the Collector with the `deltatocumulative` processor before the Prometheus exporter.
+
+---
+
+## 3. Prometheus Exposition Format
+
+Source: <https://prometheus.io/docs/instrumenting/exposition_formats/>
+OpenMetrics: <https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md> (IETF RFC 9416)
+
+### 3.1 Text Format Structure
+
+```
+# HELP http_requests_total Total HTTP requests received
+# TYPE http_requests_total counter
+http_requests_total{method="GET",status="200",route="/api/v1/users"} 1234 1714521600000
+http_requests_total{method="POST",status="422",route="/api/v1/users"} 7 1714521600000
+```
+
+Rules:
+- `# HELP` line: one per metric family, human-readable description.
+- `# TYPE` line: one per metric family, declares instrument type.
+- Metric line: `name{label="value",...} value [timestamp_ms]`. Timestamp is optional.
+- Blank line separates metric families.
+
+### 3.2 Naming Convention
+
+Pattern: `{component}_{operation}_{unit}_{suffix}`
+
+| Suffix | When to use | Example |
+|--------|-------------|---------|
+| `_total` | Counters (monotonic) | `http_requests_total` |
+| `_seconds` | Duration (always seconds, not ms) | `http_request_duration_seconds` |
+| `_bytes` | Size in bytes | `response_size_bytes` |
+| `_ratio` | Fraction 0.0–1.0 | `cache_hit_ratio` |
+| `_bucket` | Histogram bucket (auto-appended) | `http_request_duration_seconds_bucket` |
+| `_count` | Histogram observation count (auto) | `http_request_duration_seconds_count` |
+| `_sum` | Histogram observation sum (auto) | `http_request_duration_seconds_sum` |
+
+### 3.3 Label Rules
+
+Low-cardinality labels only. Cross-ref `../meta-observability.md §Section C Cardinality Guardrails`.
+
+Safe label examples: `method`, `status_code`, `route` (normalized), `service`, `region`, `env`.
+Forbidden label examples: `user.id`, `request.id`, `trace.id`, `http.url` (raw), `error.message`.
+
+### 3.4 OpenMetrics Snippet (RFC 9416)
+
+OpenMetrics is the IETF formalization of Prometheus text format. Key differences: the `# EOF` terminator is required; a counter is declared by its family name without the suffix, while each sample name carries the mandatory `_total` suffix; timestamps are in seconds, not milliseconds.
+
+```
+# HELP http_requests Total HTTP requests received.
+# TYPE http_requests counter
+http_requests_total{method="GET",status="200"} 1234.0 1714521600.000
+# EOF
+```
+
+The OTel Prometheus exporter supports both Prometheus text format and OpenMetrics format; select via `Accept: application/openmetrics-text` request header.
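+
+To check which format an endpoint negotiates, request OpenMetrics explicitly and look for the terminator (a sketch; `:9464` is the SDK Prometheus exporter's conventional default port and may differ in your deployment):
+
+```
+curl -s -H 'Accept: application/openmetrics-text; version=1.0.0; charset=utf-8' \
+  http://localhost:9464/metrics | tail -n 1
+# prints "# EOF" for an OpenMetrics response; plain Prometheus text has no terminator
+```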
+
+---
+
+## 4. SLI / Healthcheck Basics
+
+### 4.1 Google SRE Golden Signals
+
+Four signals for monitoring any request-serving system:
+
+| Signal | Description | Primary instrument |
+|--------|-------------|--------------------|
+| Latency | Time to serve a request (good vs error latency tracked separately) | Histogram |
+| Traffic | Volume of demand (requests/sec, queries/sec) | Counter rate |
+| Errors | Rate of failed requests (explicit 5xx, implicit latency budget exceeded) | Counter rate |
+| Saturation | How "full" the service is; utilization as fraction of capacity | Gauge or Histogram |
+
+### 4.2 RED Method (request-serving services)
+
+| Signal | PromQL sketch |
+|--------|---------------|
+| Rate | `sum(rate(http_requests_total[5m])) by (service)` |
+| Errors | `sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)` |
+| Duration | `histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service))` |
+
+### 4.3 USE Method (resource-level — CPU, memory, disk)
+
+| Signal | PromQL sketch |
+|--------|---------------|
+| Utilization | `avg(system.cpu.utilization) by (host)` |
+| Saturation | `system.cpu.load_average.1m / count(system.cpu.utilization) by (host)` |
+| Errors | `rate(system.network.errors{direction="transmit"}[5m])` |
+
+(The `system.*` names above are OTel semconv; once exported to Prometheus they typically appear underscore-delimited, e.g. `system_cpu_utilization`.)
+
+### 4.4 SLI Examples (PromQL)
+
+Availability SLI (fraction of successful requests over the measurement window):
+
+```promql
+sum(rate(http_requests_total{status=~"2..|3.."}[5m]))
+  /
+sum(rate(http_requests_total[5m]))
+```
+
+Latency SLI (p99 from histogram, 5-minute rolling window):
+
+```promql
+histogram_quantile(
+  0.99,
+  sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service)
+)
+```
+
+Cross-ref `../boundaries/slo.md` for SLO target definitions, error-budget burn-rate computation, and multi-window alert rules.
+
+### 4.5 Healthcheck Endpoints and Metrics
+
+Kubernetes readiness and liveness probes (`/readyz`, `/healthz`) are HTTP checks — they are not metrics themselves. However, the underlying health state MUST be exposed as a metric so that dashboards and SLO computation can consume it continuously:
+
+```promql
+# Scrape liveness synthesized by Prometheus itself (1 = target scraped successfully)
+up{job="my-service"}
+
+# Custom health gauge emitted by the application via the OTel SDK (1 = healthy, 0 = unhealthy)
+service_health_status{service="checkout", check="database"} 1
+```
+
+Anti-pattern: a healthcheck that returns 200 without reflecting actual dependency health. If the database is down, the readiness probe MUST return a non-2xx status, and the `service_health_status` gauge MUST reflect 0. A probe that always returns 200 is not tied to an underlying SLI metric and produces false-positive availability signals.
+
+---
+
+## 5. hostmetrics Receiver (OTel Collector)
+
+Source: <https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/hostmetricsreceiver>
+
+The `hostmetrics` receiver collects host-level OS metrics without requiring a separate agent. It runs inside the OTel Collector process and reads from OS interfaces.
+
+### 5.1 Deployment
+
+Deploy as DaemonSet (one Collector per node) to collect per-node metrics. Cross-ref `../transport/collector-topology.md` for the two-tier topology (DaemonSet agent + gateway).
+
+```yaml
+receivers:
+  hostmetrics:
+    collection_interval: 30s
+    root_path: /hostfs  # required when running in container; mount host /proc and /sys
+    scrapers:
+      cpu: {}
+      memory: {}
+      disk: {}
+      filesystem: {}
+      network: {}
+      load: {}
+      paging: {}
+      process:
+        mute_process_name_error: true
+```
+
+### 5.2 Key Metrics
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `system.cpu.time` | Counter | CPU time by state (user, system, idle, iowait, irq, softirq, steal, nice) |
+| `system.cpu.utilization` | Gauge | CPU utilization 0.0–1.0 by state |
+| `system.memory.usage` | Gauge | Memory bytes by state (used, free, cached, buffered) |
+| `system.memory.utilization` | Gauge | Memory utilization 0.0–1.0 by state |
+| `system.disk.io` | Counter | Disk bytes read/written by device |
+| `system.disk.io_time` | Counter | Time disk is active by device (seconds) |
+| `system.filesystem.usage` | Gauge | Filesystem bytes by mount point and state |
+| `system.network.io` | Counter | Network bytes transmitted/received by interface |
+| `system.network.errors` | Counter | Network errors by interface and direction |
+| `system.cpu.load_average.1m` | Gauge | 1-minute load average |
+| `system.cpu.load_average.5m` | Gauge | 5-minute load average |
+| `system.cpu.load_average.15m` | Gauge | 15-minute load average |
+| `system.paging.usage` | Gauge | Swap/paging space by state |
+| `process.cpu.time` | Counter | Per-process CPU time by state |
+| `process.memory.usage` | Gauge | Per-process RSS in bytes |
+
+### 5.3 Alternative: Prometheus Node Exporter
+
+For environments where the OTel Collector cannot run on the host, Prometheus Node Exporter is the mature alternative (CNCF Graduated). Scrape with `prometheusreceiver` and convert to OTel metrics via the Collector pipeline. Metric names differ from OTel semconv (e.g., `node_cpu_seconds_total` vs `system.cpu.time`). Prefer `hostmetrics` receiver for new deployments.
+
+---
+
+## 6. Kubernetes Metric Sources
+
+Three complementary sources cover cluster, node, pod, and container metrics:
+
+| Source | Scope | Collector component |
+|--------|-------|---------------------|
+| `kubeletstats` receiver | Pod and container metrics from kubelet API | `kubeletstatsreceiver` |
+| `k8scluster` receiver | Cluster-level state (node conditions, pod phases, deployment replicas) | `k8sclusterreceiver` |
+| Prometheus operator + ServiceMonitor CRD | Scrape application Prometheus endpoints declaratively | `prometheusreceiver` |
+
+Cross-ref `../transport/collector-topology.md §Component preferences` for the recommended configuration of each source in a DaemonSet + gateway topology.
+
+---
+
+## 7. Application Metric Integration
+
+### 7.1 Auto-instrumentation
+
+OTel SDK auto-instrumentation provides RED metrics out-of-the-box for common frameworks:
+- HTTP servers: `http.server.request.duration` (Histogram), `http.server.active_requests` (UpDownCounter)
+- HTTP clients: `http.client.request.duration` (Histogram)
+- gRPC: `rpc.server.duration` (Histogram), `rpc.client.duration` (Histogram)
+- Database clients: `db.client.operation.duration` (Histogram)
+
+These use OTel semconv stable attributes (`http.*`, `rpc.*`, `db.*` core groups). Verify your framework's OTel instrumentation library supports the attribute groups before relying on them in SLO computation.
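+
+The instrument examples in §7.2 assume a configured `MeterProvider` exporting OTLP. A minimal wiring sketch (Python; the gateway endpoint mirrors the logs example in this skill, and the service identity and 30 s interval are illustrative):
+
+```python
+# Minimal OTLP metric wiring (sketch). Endpoint and export interval are
+# assumptions; align them with your Collector topology.
+from opentelemetry import metrics
+from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
+from opentelemetry.sdk.resources import Resource
+
+reader = PeriodicExportingMetricReader(
+    OTLPMetricExporter(endpoint="otel-gateway.observability.svc:4317", insecure=True),
+    export_interval_millis=30_000,
+)
+metrics.set_meter_provider(
+    MeterProvider(
+        resource=Resource.create(
+            {"service.name": "checkout-api", "service.version": "1.4.2"}
+        ),
+        metric_readers=[reader],
+    )
+)
+```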
+ +### 7.2 Custom Metrics via Meter API + +```python +# Python SDK example — creating a histogram for a custom operation +from opentelemetry import metrics + +meter = metrics.get_meter("com.example.checkout", version="1.0.0") + +payment_duration = meter.create_histogram( + name="payment.processing.duration", + unit="s", + description="Time to process a payment request", +) + +# Recording a measurement +payment_duration.record(0.42, {"payment.method": "card", "currency": "USD"}) +``` + +### 7.3 Attribute Allow-listing for Cardinality Control + +Explicitly allow-list attributes on each instrument view to prevent label explosion. Cross-ref `../meta-observability.md §Section C`: + +```python +from opentelemetry.sdk.metrics.view import View + +view = View( + instrument_name="payment.processing.duration", + attribute_keys={"payment.method", "currency"}, # only these two labels emitted +) +``` + +--- + +## 8. OpenCost Metric Surface + +Source: <https://www.opencost.io/docs/configuration/prometheus> + +OpenCost (CNCF Incubating) exposes cost attribution in Prometheus format at its `/metrics` endpoint. Scrape via `prometheusreceiver` to bring cost into the same TSDB as latency and error metrics, making cost a dimension alongside performance. + +### 8.1 Key OpenCost Metrics + +| Metric | Type | Unit | Description | +|--------|------|------|-------------| +| `opencost_namespace_cost_total` | Counter | USD | Cumulative cost by Kubernetes namespace | +| `opencost_workload_cost_total` | Counter | USD | Cumulative cost by workload (deployment/statefulset) | +| `opencost_cpu_cost` | Gauge | USD/hr | Current CPU cost attribution | +| `opencost_ram_cost` | Gauge | USD/hr | Current memory cost attribution | +| `opencost_network_cost` | Gauge | USD/hr | Current network egress cost attribution | +| `opencost_storage_cost` | Gauge | USD/hr | Current persistent storage cost attribution | + +### 8.2 Scrape Configuration + +```yaml +receivers: + prometheus: + config: + scrape_configs: + - job_name: opencost + scrape_interval: 60s + static_configs: + - targets: ["opencost.opencost.svc.cluster.local:9003"] +``` + +Cross-ref `cost.md` for the full FinOps unit-economics model, per-tenant cost attribution, and chargeback/showback patterns. + +--- + +## 9. Cardinality Budget + +Cross-ref `../meta-observability.md §Section C Cardinality Guardrails` for full detail and remediation procedures. + +### 9.1 Per-Service Budget + +Set a hard budget per service, enforced via alert at 80% of budget: + +```promql +# Alert: service approaching 5000-series budget +count({job="my-service"}) > 4000 +``` + +### 9.2 Forbidden Labels (Never use as metric labels) + +| Label | Reason | +|-------|--------| +| `user.id` | Unbounded; one series per user | +| `request.id` | One series per request — instant explosion | +| `trace.id` | One series per trace | +| `http.url` (raw) | Query strings are unbounded | +| `error.message` | Free-text; unbounded cardinality | + +### 9.3 Tenant Cap + +`tenant.id` label is allowed with a hard cap: top-N explicit tenants (e.g., 1000), all others mapped to label value `"other"`. Never create a new metric name per tenant — this bypasses TSDB cardinality controls entirely (anti-pattern listed in Section 12). + +--- + +## 10. 
Long-Term Storage Backends + +Prometheus-compatible TSDBs for metrics retention beyond 15 days: + +| Backend | CNCF Status | Key Characteristic | Typical Retention | +|---------|-------------|--------------------|-------------------| +| Prometheus | CNCF Graduated | Local storage, no HA by default | 15d (short-term) | +| Thanos | CNCF Graduated | Object storage long-term; Prometheus sidecar model | 1y+ | +| Cortex | CNCF Incubating | Multi-tenant Prometheus; horizontally scalable | 1y+ | +| Grafana Mimir | Not CNCF | Grafana Labs fork of Cortex; production-grade | 1y+ | +| VictoriaMetrics | Not CNCF | High-performance; efficient storage compression | 1y+ | + +Cross-ref `../vendor-categories.md §TSDB / Long-term Metrics` for selection guidance per workload size and multi-tenancy requirements. + +--- + +## 11. Matrix Coverage (metrics column) + +This table maps the metrics signal cells from `../matrix.md` for quick navigation. + +| Layer | Boundary | Coverage | Detail | +|-------|----------|----------|--------| +| L3-network | multi-tenant | ✅ | Per-tenant VPC flow byte/packet counters | +| L3-network | cross-application | ✅ | Inter-VPC/peering flow metrics | +| L4-transport | multi-tenant | ✅ | Per-tenant TCP retransmit rate, RTT histograms via eBPF | +| L4-transport | cross-application | ✅ | Cross-service TCP RTT and retransmit metrics via eBPF | +| mesh | multi-tenant | ✅ | Per-tenant RED from Envoy telemetry; `tenant.id` via baggage | +| mesh | cross-application | ✅ | Cross-service RED metrics at mesh proxy; topology mapping | +| mesh | slo | ✅ | Request rate and error rate as primary SLI sources | +| mesh | release | ✅ | Request split metrics for canary traffic (Flagger/Argo Rollouts) | +| L7-application | multi-tenant | ✅ | Per-tenant RED + custom business metrics with `tenant.id` | +| L7-application | cross-application | ✅ | Inter-service histograms with `service.name` + `peer.service` | +| L7-application | slo | ✅ | SLI metric computation; SLO targets in OpenSLO YAML | +| L7-application | release | ✅ | Release marker; `service.version` label for before/after delta | + +--- + +## 12. 
Anti-Patterns + +These are candidates for `../anti-patterns.md §Section B Cardinality & Cost`: + +| Anti-pattern | Description | Correction | +|--------------|-------------|------------| +| `user.id` as metric label | Creates one series per user; TSDB storage explosion | Use `user.tier` or aggregated cohort label | +| New metric name per tenant | `http_requests_total_tenant_acme` bypasses cardinality controls | Use `http_requests_total{tenant="acme"}` with top-N cap | +| Summary for cross-service aggregation | Client-side quantiles cannot be merged across replicas | Replace with Histogram + `histogram_quantile` at query time | +| Healthcheck not tied to underlying SLI | `/healthz` returns 200 regardless of dependency health | Emit `service_health_status{check="db"} 0` when unhealthy; wire into SLI | +| Raw `http.url` label | Query strings are unbounded; `?token=...` leaks secrets | Use `http.route` (normalized); apply `replace_pattern` in Collector | +| Histogram with default bucket boundaries | Default millisecond buckets are meaningless for second-scale requests | Set explicit bucket boundaries per instrument matching the expected value range | + +--- + +## Cross-References + +| Topic | File | +|-------|------| +| SLO policy, error budget, burn-rate alerts | `../boundaries/slo.md` | +| Full FinOps / unit economics | `cost.md` | +| Cardinality guardrails and remediation | `../meta-observability.md §Section C` | +| Collector two-tier topology (DaemonSet + gateway) | `../transport/collector-topology.md` | +| OTel spec and semconv stability tiers | `../standards.md` | +| Anti-patterns (Section B) | `../anti-patterns.md` | +| Vendor TSDB selection | `../vendor-categories.md §TSDB / Long-term Metrics` | +| Full matrix coverage (all signals) | `../matrix.md` | +| Logs signal | `logs.md` | +| Traces signal | `traces.md` | +| Profiles signal | `profiles.md` | diff --git a/.agents/skills/oma-observability/resources/signals/privacy.md b/.agents/skills/oma-observability/resources/signals/privacy.md new file mode 100644 index 0000000..9807a0e --- /dev/null +++ b/.agents/skills/oma-observability/resources/signals/privacy.md @@ -0,0 +1,351 @@ +--- +otel_semconv: "1.27.0 (2024-11); GDPR: 2016/679; PIPA: 2023 amendment" +--- + +# Privacy in Telemetry + +## 1. Scope and Distinction from Audit + +**Privacy** answers: *what personal data flows through telemetry, and how do we minimize, redact, or anonymize it?* + +**Audit** answers: *what actions occurred, who performed them, and can we prove it?* + +These are architecturally opposite concerns. See Design Decision D5 for the split rationale. + +| Dimension | Privacy | Audit | +|-----------|---------|-------| +| Goal | Collect less, protect subjects | Prove what happened | +| Mutability | Supports deletion (right to erasure) | Immutable, append-only | +| Retention | Short raw tier; long anonymized tier | Long-term (7–10 years typical) | +| Consumer | DPO, privacy engineers, legal | Security ops, compliance, auditors | +| Regulation driver | GDPR Art. 5, Art. 17; PIPA § 21 | SOX, PCI-DSS, ISO/IEC 27001 | + +**D5 Split rationale.** Merging privacy and audit into a single signal pipeline produces conflicting retention policies (erasure vs. immutability), access control models (broad deletion rights vs. tamper-evident storage), and regulatory obligations. Separating them lets each pipeline satisfy its own obligations without compromise. Cross-references between the two are explicit rather than implicit (see §9 and `audit.md`). 
+ +Primary consumers of this file: Data Protection Officers (DPO), privacy engineers, legal counsel. + +--- + +## 2. Regulatory Drivers + +| Regulation | Jurisdiction | Key Articles / Principles | Max Penalty | +|------------|-------------|--------------------------|-------------| +| GDPR | EU / EEA | Art. 5(1)(c) minimization; Art. 5(1)(e) storage limitation; Art. 17 erasure; Art. 32 security | 4% global annual turnover | +| PIPA | South Korea | § 3 minimization; § 21 destruction; § 29 safety measures; separate sensitive data consent | KRW 300M or 3% revenue | +| CCPA / CPRA | California, US | Opt-out of sale; right to delete; sensitive PI category | USD 7,500 per intentional violation | +| HIPAA | United States | PHI definition; minimum necessary standard; breach notification | USD 100–50,000 per violation | +| PIPEDA | Canada | Fair information principles; breach of security safeguards | CAD 100,000 per violation | +| LGPD | Brazil | Art. 6 finality + necessity; Art. 18 erasure rights | 2% Brazil revenue, max BRL 50M | + +**Critical misclassification risk.** Claiming "anonymization" for data that is in fact pseudonymized (reversible with a key) constitutes a GDPR violation. Regulators have levied 4% penalties on this misclassification. See §3 for the definitive distinction. + +--- + +## 3. Anonymization vs Pseudonymization vs Tokenization + +The reversibility question is the only legally meaningful test: *Can you recover the original value with information you hold?* + +| Technique | Reversible? | GDPR Applies? | Typical Tooling | +|-----------|-------------|---------------|-----------------| +| Anonymization | No — irreversible | No | k-anonymity, differential privacy, aggregation, generalization | +| Pseudonymization | Yes — with key/salt | Yes | HMAC+salt, format-preserving encryption (FPE), AES-FF1 | +| Tokenization | Yes — with vault lookup | Yes | Payment token vaults, HSM-backed services | +| Hashing (no salt, low entropy) | Yes — brute-force feasible | Yes (treated as pseudonymization) | SHA-256 without salt on numeric IDs | + +**Decision rule.** Ask "Could we reverse this if compelled?" If no, it is anonymization and GDPR does not apply to the result. If yes (even theoretically), GDPR applies and you have pseudonymization obligations. + +**Key separation.** GDPR Art. 32 requires that pseudonymization keys be stored separately from the pseudonymized data, with independent access control and audit trail. + +--- + +## 4. 
Common PII in Telemetry + +| PII Category | Where It Appears in Telemetry | OTel Semconv Attribute | +|-------------|------------------------------|------------------------| +| IP address | HTTP server spans, logs, network metrics | `client.address`, `network.peer.address` | +| User ID | Span attributes, baggage, log body | `enduser.id` (deprecated → `user.id`) | +| Email, name, phone | Log body, error traces, baggage | None — must not be added | +| Session token / cookie | HTTP headers, span attributes | `http.request.header.*` | +| Browser user-agent | HTTP server spans | `user_agent.original` | +| Geolocation | IP-derived enrichment, custom attrs | `geo.city`, `geo.country_iso_code` | +| Query string parameters | URL full string | `url.full`, `url.query` | +| Stack trace with user data | Exception events | `exception.stacktrace` | +| Request body on error | Trace event attributes | Custom — avoid entirely | +| Baggage values | W3C Baggage header, propagated downstream | — (see `../boundaries/cross-application.md §Baggage rules`) | + +Baggage is especially dangerous: values propagate across service boundaries and can be logged or traced by any downstream collector. Apply trust-boundary filters before injecting user-derived values into baggage. + +--- + +## 5. PII Handling Rules by Field + +| Field | Required Action | Technique | +|-------|----------------|-----------| +| IPv4 address | Mask last octet (`192.0.2.0`) or drop | Truncation / drop | +| IPv6 address | Mask last 80 bits or drop | Truncation / drop | +| Email address | Drop local part (`***@example.com`) or drop entirely | Redaction | +| User ID (internal) | Salted HMAC with vault-managed key | Pseudonymization | +| Card number, SSN | Drop entirely — never collect | Drop | +| Phone number | Drop entirely | Drop | +| Session token, password, `Authorization` header | Drop entirely | Drop | +| Timestamp (high-resolution) | Truncate to minute (metrics) or hour (logs) depending on aggregation tier | Generalization | +| Geolocation (precise) | Truncate to city or region | Generalization | +| User-agent string | Keep major browser family only, or drop | Generalization / drop | +| Query string | Drop or allowlist known safe params | Drop / allowlist | +| Request body | Drop on error traces; never capture by default | Drop | + +--- + +## 6. OTel Collector Processors for Redaction + +Use a layered pipeline: **attributes** (known PII keys) → **transform/OTTL** (pattern-based) → **redaction** (safety-net allowlist) → storage. + +### 6.1 `attributes` Processor — Named Key Actions + +```yaml +processors: + attributes/minimize: + actions: + - key: user.email + action: delete + - key: user.id + action: hash + - key: http.request.header.authorization + action: delete + - key: http.request.header.cookie + action: delete + - key: url.query + action: delete +``` + +> **Warning on `action: hash`**: the Collector's built-in `hash` action does not apply a salt. For low-entropy inputs such as numeric user IDs, an unsalted hash is reversible by brute force and is treated as pseudonymization (not anonymization) under GDPR. See §8 for salted-hash alternatives at the SDK layer. 
+
+### 6.2 `transform` Processor (OTTL) — Pattern-Based Transformation
+
+```yaml
+processors:
+  transform/ip_mask:
+    error_mode: ignore
+    trace_statements:
+      - context: span
+        statements:
+          # IPv4: replace last octet with .0
+          - replace_pattern(attributes["client.address"], "\\.\\d+$", ".0")
+          # Drop precise geolocation
+          - delete_key(attributes, "geo.coordinates")
+          # Truncate email to domain only
+          - replace_pattern(attributes["user.email"], "^[^@]+", "***")
+```
+
+### 6.3 `redaction` Processor — Allowlist Safety Net
+
+```yaml
+processors:
+  redaction/allowlist:
+    allow_all_keys: false
+    allowed_keys:
+      - http.method
+      - http.status_code
+      - http.route
+      - service.name
+      - trace.id
+      - span.id
+      - db.system
+      - rpc.system
+    blocked_values:
+      - "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b"  # email pattern
+      - "\\b(?:\\d[ -]?){13,16}\\b"                               # card number pattern
+    summary: debug
+```
+
+### 6.4 Pipeline Assembly
+
+```yaml
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors:
+        - attributes/minimize
+        - transform/ip_mask
+        - redaction/allowlist
+        - batch
+      exporters: [otlp/backend]
+```
+
+---
+
+## 7. SDK-Layer Redaction (Prefer over Pipeline)
+
+Redacting at the instrumentation layer is always preferable: PII never enters the pipeline or wire.
+
+| SDK / Tool | Mechanism |
+|------------|-----------|
+| OTel SDK (all languages) | Attribute processor on `TracerProvider` at construction time |
+| Sentry | `before_send` / `before_send_transaction` callbacks |
+| Datadog | `before_send` callback; `obfuscation_config` in agent |
+| OpenTelemetry JS/Python | `SpanProcessor` with custom `onStart` / `onEnd` |
+
+**Rule.** SDK-layer redaction is the first line of defense. Collector processors are the second line. Treat collector processors as a safety net, not the primary mechanism.
+
+---
+
+## 8. Salted Hashing Caveats
+
+OTel Collector's built-in `hash` action uses SHA-256 **without salt**. On low-entropy inputs (numeric user IDs, 6-digit codes), this is reversible via brute-force rainbow tables.
+
+**Requirements for safe pseudonymization:**
+
+1. Apply salt using a vault-managed key (e.g., HashiCorp Vault, AWS Secrets Manager, GCP Secret Manager).
+2. Store the salt/key in a separate region and access-controlled vault — never co-located with the pseudonymized data (GDPR Art. 32).
+3. Rotate the salt on a defined schedule (quarterly recommended); document rotation in the key management policy.
+4. Use HMAC-SHA256 with the salt, not bare SHA-256.
+5. Because the Collector `hash` action cannot inject vault-managed salts, perform HMAC at the **SDK layer** before emitting spans.
+
+```python
+# SDK-layer HMAC pseudonymization (Python example).
+# vault_key is fetched once at startup from the secrets manager and must
+# never be logged or co-located with the pseudonymized output.
+import hashlib
+import hmac
+
+def pseudonymize_user_id(user_id: str, vault_key: bytes) -> str:
+    return hmac.new(vault_key, user_id.encode(), hashlib.sha256).hexdigest()
+```
+
+---
+
+## 9. Retention and Right to Erasure (GDPR Art.
17) + +| Tier | Content | Max Retention | Erasure Support | +|------|---------|---------------|-----------------| +| Raw (with PII) | Full spans, logs, unredacted | 7 days | Automated on data-subject request | +| Redacted short | PII removed, full attributes | 30 days | N/A (no PII present) | +| Anonymized long | Aggregated metrics, anonymized traces | 90+ days | N/A (irreversible) | +| Audit events | Immutable access/action log | 7–10 years | Not erasable (legal obligation) | + +Erasure pipeline: data-subject request → identity verification → lookup raw-tier records by pseudonymized ID → delete → log erasure action in audit trail. + +Cross-reference: `../meta-observability.md §Retention Matrix` for full retention policy. Cross-reference: `audit.md` for erasure audit trail requirements. + +--- + +## 10. Cross-Border Transfer + +GDPR Chapter V prohibits transfer of personal data to non-adequate countries without a legal mechanism. + +| Mechanism | When to Use | +|-----------|-------------| +| Standard Contractual Clauses (SCC) | EU → US, EU → most non-adequate countries | +| Binding Corporate Rules (BCR) | Intra-group transfers | +| Adequacy decision | EU → UK, Japan, Canada (PIPEDA), South Korea (partial) | +| PIPA § 17 | KR → non-KR: contractual safeguards + PIPC notification | + +**Collector routing for regional compliance.** + +```yaml +# Route EU telemetry to EU backend, KR to KR backend +connectors: + routing: + default_pipelines: [traces/global] + table: + - statement: attributes["deployment.region"] == "eu-west-1" + pipelines: [traces/eu] + - statement: attributes["deployment.region"] == "ap-northeast-2" + pipelines: [traces/kr] +``` + +Cross-reference: `../transport/collector-topology.md §Multi-cluster + regional`. + +--- + +## 11. TLS Context Attributes (L6 Development Status) + +OTel `tls.*` semantic conventions are **Development** stability tier as of semconv 1.27.0. See `../standards.md` for stability tier definitions. + +| Attribute | Type | Use Case | +|-----------|------|----------| +| `tls.protocol.version` | string | Detect TLS 1.0/1.1 downgrade attacks | +| `tls.cipher` | string | Flag weak ciphers (RC4, DES, 3DES) | +| `tls.established` | boolean | Alert on handshake failures | +| `tls.resumed` | boolean | Session resumption analysis | +| `tls.server.certificate.expiry` | int (epoch) | Certificate expiry alerting | + +**Scope boundary.** These attributes provide *security context* for observability purposes (downgrade detection, cert expiry alerting). Full TLS inspection (decrypting payload for content analysis) is out of scope — use dedicated network inspection tooling. + +**Privacy note.** TLS cipher and protocol version are not PII. Certificate subject fields (CN, SAN) may contain hostnames; do not log end-user certificate subjects from mTLS without legal basis. + +--- + +## 12. Backend Access Control (RBAC on Observability Data) + +Production traces and logs containing even pseudonymized data are sensitive. Access must be role-gated. 
+ +| Role | Scope | Time Range | Example Tools | +|------|-------|-----------|---------------| +| On-call engineer | Own service traces + logs | Last 24h | Grafana folder permission | +| Incident responder | Cross-service traces | Incident window | Datadog Teams, Honeycomb environments | +| Security analyst | All services, security events | 30 days | Separate security-tier datasource | +| Auditor | Audit events only (see `audit.md`) | Full retention | Read-only audit index | +| Finance | Cost metrics only | 90 days | See `cost.md` | +| DPO / Privacy engineer | Redaction pipeline config, raw tier | Short window only | Separate admin role | + +**Environment isolation.** Production backends must be strictly separated from non-production. Never allow dev/staging pipelines to receive or store production telemetry — this is a direct PII leak path. + +Vendor-specific RBAC: Grafana folder permissions + team sync; Datadog Teams with scoped monitors; Honeycomb environments with team-scoped API keys. + +--- + +## 13. Third-Party Processor Obligations (GDPR Art. 28) + +Observability vendors receiving telemetry containing personal data are **data processors** under GDPR. A Data Processing Agreement (DPA) is mandatory before routing data to them. + +| Obligation | Action Required | +|------------|----------------| +| DPA in place | Signed before any data flows to vendor | +| Sub-processor transparency | Vendor must disclose and notify of sub-processor changes | +| Data residency | Confirm vendor's storage region matches your compliance obligations | +| Deletion / return | Vendor must support data deletion on contract termination | + +Common vendors requiring DPA review: Datadog, Honeycomb, Grafana Cloud, Sentry, New Relic, Elastic Cloud. Each vendor publishes a DPA at their legal/privacy page. Reference your legal team's approved vendor list before routing production data. + +--- + +## 14. Matrix Coverage Notes + +The following cells in `../matrix.md` have privacy-specific guidance from this file: + +| Layer × Boundary | Privacy Status | Notes | +|-----------------|---------------|-------| +| L3 (Network) × any | ⚠️ Warning | IP addresses are GDPR Art. 4(1) personal data identifiers; mask or drop | +| L4 (Transport) × any | ⚠️ Warning | Connection 5-tuple (src IP, dst IP, ports) can identify individuals | +| L7 (Application) × multi-tenant | Primary coverage | Per-tenant data minimization; tenant ID must not leak across boundaries | +| L7 × cross-application | ⚠️ Warning | Baggage crosses trust boundaries; apply filter at ingress (§4) | +| Release / feature flags × any | Overlap | Feature flag cohort membership may constitute profiling under GDPR Art. 4(4) | + +--- + +## 15. Anti-Patterns + +> These entries feed into `../anti-patterns.md §Section A Privacy`. + +| Anti-Pattern | Risk | Remediation | +|-------------|------|-------------| +| Claiming "anonymization" for pseudonymized data | 4% GDPR penalty; regulatory misrepresentation | Apply reversibility test (§3); re-classify and update ROPA | +| Hash without salt on low-entropy IDs | Rainbow-table reversal; GDPR breach | HMAC-SHA256 with vault-managed salt at SDK layer (§8) | +| Salt/key stored in same region as pseudonymized data | Key compromise = full de-anonymization | Separate vault in independent region with independent IAM (GDPR Art. 
32) | +| Baggage crossing trust boundary without filter | PII propagated to untrusted downstream collectors | Apply baggage allowlist at service ingress (cross-ref `../boundaries/cross-application.md`) | +| Observability backend open to all engineers (no RBAC) | PII exposure; compliance failure | Implement role-scoped access per §12 | +| Routing to 3rd-party vendor without DPA | GDPR Art. 28 violation | Block data flow until DPA is signed (§13) | +| Capturing raw request body on error trace | Request bodies frequently contain credentials, PII | Drop request body by default; allowlist specific non-sensitive fields at SDK layer (§7) | +| Cross-region telemetry routing without GDPR mechanism | GDPR Chapter V violation | Implement routing connector per §10; confirm SCC or adequacy decision | +| OTel Collector `hash` action on production user IDs | Unsalted; brute-force reversible | Migrate to SDK-layer HMAC with vault key (§8) | +| Logging `user_agent.original` without assessment | User-agent can be a unique identifier for profiling | Generalize to browser family only or drop (§5) | + +--- + +## References + +- GDPR text: <https://gdpr-info.eu> +- PIPA (Korean): <https://www.pipc.go.kr> +- OTel sensitive data handling: <https://opentelemetry.io/docs/security/handling-sensitive-data/> +- OTel Collector processors: <https://opentelemetry.io/docs/collector/configuration/> +- W3C Baggage PII warning: <https://www.w3.org/TR/baggage/> +- Cross-references: `../standards.md`, `../matrix.md`, `../meta-observability.md`, `../boundaries/cross-application.md`, `../transport/collector-topology.md`, `audit.md`, `cost.md` diff --git a/.agents/skills/oma-observability/resources/signals/profiles.md b/.agents/skills/oma-observability/resources/signals/profiles.md new file mode 100644 index 0000000..0cec4e0 --- /dev/null +++ b/.agents/skills/oma-observability/resources/signals/profiles.md @@ -0,0 +1,245 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +notes: + - "Profiles signal (OTEP 0239) is alpha; semconv in development" +--- + +# Profiles Signal + +> **EXPERIMENTAL — OTEP 0239 alpha (as of 2026-Q2)** +> Continuous profiling tooling (Parca, Pyroscope) is production-ready. +> The OpenTelemetry profiling data model and OTLP signal are in alpha / active development — semantic conventions and wire format may change without backward compatibility. +> Do NOT define SLOs against OTel-native profiling attributes. Treat any `profile.*` OTel semconv as experimental tier. +> Cross-ref: `../standards.md §3 OTel semconv stability` for tier definitions. + +--- + +## 1. Scope + +Continuous profiling — whole-cluster or whole-service CPU, memory, and allocation sampling running at low, steady overhead in production. Flamegraphs are the primary visualization artifact. The output answers the question "in which function is time or memory being spent?" + +This file does NOT cover ad-hoc developer profiling sessions (e.g., `go tool pprof http://localhost:6060/debug/pprof/profile` run once during a development investigation). Those are single-use diagnostics. Continuous profiling generates persistent, comparable data over time and is as much a production signal as metrics or logs. + +--- + +## 2. The 5th Pillar — MELT to MELT+P + +Observability was originally described as three pillars (Metrics, Logs, Traces). Events as LogRecords and cost extended that. Profiles add the fifth pillar. + +| Signal | Answers | +|--------|---------| +| Metrics | What is slow / broken? 
(rate, error rate, latency quantile) | +| Traces | Where is it slow? (which service, which span, which dependency) | +| Profiles | In which function is CPU time or memory spent? (line-level attribution) | +| Logs | Why, in context? (structured event with variables and error detail) | +| Cost | What is the financial impact of the above? | + +Correlation is the mechanism that makes MELT+P actionable rather than five isolated views. Profiles share resource labels (`service.name`, `service.version`, `k8s.pod.name`) with metrics and traces, and can be linked to a specific trace by attaching a `trace_id` label to profile samples — a pattern pioneered by Grafana Tempo + Pyroscope. + +--- + +## 3. Tools Landscape (as of 2026-Q2) + +Verify live status at <https://landscape.cncf.io>. + +| Tool | Category | Mechanism | CNCF Status | Notes | +|------|----------|-----------|-------------|-------| +| Parca | OSS, continuous | eBPF kernel-level, no agent code | Sandbox | Written in Go; Polar Signals led; whole-cluster, no language SDK required for CPU profiles | +| Grafana Pyroscope | OSS / SaaS, continuous | Pull + push (eBPF + language SDKs) | Sandbox (verify — CNCF status uncertain post-Grafana Labs acquisition, 2023) | Formerly independent; Grafana acquired 2023; integrated into Grafana stack | +| Polar Signals Cloud | Commercial SaaS | Parca-based, hosted | — | Commercial offering backed by Parca OSS; Polar Signals is the primary Parca maintainer | +| Go `net/http/pprof` | Language-specific, on-demand | HTTP handler; sample on request | — | Built-in; not continuous; heavier per request; MUST NOT be left exposed on production endpoints | +| Fgprof | Language-specific | Wall-clock profiler for Go | — | Supplements `pprof` by measuring off-CPU time as well as on-CPU | +| py-spy | Language-specific, continuous | Native code (Rust); attaches without code change | — | Low overhead; works on CPython; no SDK injection | +| pyroscope-python | Language-specific, continuous | Python SDK; sends to Pyroscope server | — | Requires SDK import; supports async runtimes | +| async-profiler | JVM, continuous | JVMTI + `perf_events`; async-safe | — | Primary choice for JVM profiling; supports CPU, alloc, lock, wall modes | +| JFR (Java Flight Recorder) | JVM, continuous | JVM-native; available from JDK 11+ | — | Enable with JVM flags; low overhead; integrates with async-profiler | +| `0x` / V8 Inspector | Node.js, on-demand | V8 CPU profiler | — | `0x` wraps `node --prof` for flamegraph generation | +| pyroscope-node | Node.js, continuous | Node SDK; sends to Pyroscope | — | Continuous alternative to `0x` | +| stackprof | Ruby, continuous | Sampling profiler | — | Low overhead; supports wall, object, custom modes | +| php-spx / phpspy | PHP, continuous | Extension (spx) or attach (spy) | — | `php-spx` requires extension install; `phpspy` attaches without code change | +| Beyla / Pixie | eBPF, language-agnostic | eBPF user-space probes | Beyla CNCF Incubating (2024); Pixie CNCF Sandbox | No language integration needed; covers Go, Python, JVM, Node.js for latency profiling | + +**eBPF unified approach note:** For CPU and off-CPU profiling without language-level SDK integration, Parca, Beyla, and Pixie are the primary choices. They instrument at the kernel/eBPF level, making them language-agnostic for CPU time attribution. + +--- + +## 4. Sampling Overhead + +Continuous profiling is viable in production only when overhead is provably low. 
+ +| Tool / Mode | Typical CPU overhead | Sampling rate | Notes | +|------------|---------------------|---------------|-------| +| Parca (eBPF) | < 1% CPU | ~19 Hz per CPU core | Kernel-level; process namespaces handled correctly | +| Pyroscope (eBPF mode) | < 1% CPU | Configurable, default ~100 Hz | Overhead scales with sampling rate; start conservatively | +| async-profiler (CPU) | < 1% CPU | Configurable; default 100 Hz | AsyncGetCallTrace avoids safe-point bias | +| JFR (CPU + alloc) | < 1% CPU typical | JVM-managed | Monitor GC pause impact separately | +| py-spy | < 1% CPU | Configurable; default 100 Hz | External; no Python overhead | +| `net/http/pprof` | ~5-10% during active profile | On-demand only | Never enable `/debug/pprof` permanently on production endpoints — see anti-patterns | + +Always measure baseline and profiled CPU utilization in staging before enabling continuous profiling in production. Set a budget threshold (e.g., < 1% CPU) and alert if actual overhead exceeds it. + +--- + +## 5. OTEP 0239 — OTel Profiling Signal Data Model + +Source: <https://github.com/open-telemetry/oteps/blob/main/text/profiles/0239-profiles-data-model.md> + +Status: **alpha / in active development (2026-Q2)**. The data model, wire format, and semantic conventions are all subject to breaking change. + +Key design points: +- Profiles are transmitted as a fourth OTLP signal alongside metrics, traces, and logs. A new `ExportProfilesServiceRequest` protobuf message parallels `ExportMetricsServiceRequest`. +- The data model derives from the pprof format (Google) — a profile is a directed call graph with sample counts per stack frame. +- Proposed profile types: CPU, Heap, Allocation, Mutex, Lock. Stored in `profile.type` attribute. +- `profile.name` identifies the profile within a service and collection window. +- Semantic conventions for profile attributes (e.g., mapping to `service.*`, `k8s.*`, `thread.*`) are in draft. Do not treat them as stable. + +``` +# Conceptual OTLP profile payload (pseudocode — not final spec) +ProfilesData { + resource_profiles: [ + ResourceProfiles { + resource: { attributes: [service.name, service.version, k8s.pod.name] } + scope_profiles: [ + ScopeProfiles { + profiles: [ + Profile { + profile_id: <16-byte random> + start_time_unix_nano: ... + end_time_unix_nano: ... + sample_type: [{ type: "cpu", unit: "nanoseconds" }] + sample: [{ location_id: [...], value: [...], label: [...] }] + } + ] + } + ] + } + ] +} +``` + +Most production deployments (2026-Q2) use Parca's own gRPC API or Pyroscope's push API, not OTLP profiles. OTLP profile support in collectors and backends is experimental. Plan OTLP profile adoption only after the spec reaches RC status. + +--- + +## 6. Use Cases + +| Use case | Profile type | How to investigate | +|----------|-------------|-------------------| +| CPU bottleneck — which function consumes CPU? | CPU | Flamegraph: wide frame = high CPU; sort by self-time | +| Memory leak — heap growing over time? | Heap | Heap profile over hours; compare allocation counts | +| Allocation rate — GC pressure? | Allocation | Allocation profile; identify hot allocation sites | +| Lock contention — threads blocked? | Mutex / Lock | Mutex profile; look for frames holding locks across many samples | +| Off-CPU — threads blocked on I/O or sleep? | Off-CPU (eBPF) | eBPF off-CPU flamegraph; I/O, syscall, and scheduler frames visible | + +--- + +## 7. 
Correlation with Traces + +Profiles are the diagnostic tool you reach for after traces identify which service and span is slow. The linkage mechanism is attaching the current `trace_id` as a label on profile samples. + +``` +# Pyroscope Go SDK — attach trace_id to profile (conceptual) +pyroscope.TagWrapper(ctx, pyroscope.Labels("trace_id", span.SpanContext().TraceID().String()), func() { + // code under profiling +}) +``` + +Grafana Tempo + Pyroscope integration implements this as a first-class UI feature: clicking a slow trace span offers a "View Profile" link that loads the flamegraph for that time window and service. + +**Limitation:** Profiles are statistical (sampled across all requests). Traces are per-request. The correlation is probabilistic — a trace_id label on a profile sample means that request was running during that sample, not that the sample is causally attributed to that exact request. This is a fundamental constraint of sampling-based profiling; document it clearly when presenting correlation results to stakeholders. + +--- + +## 8. Mobile and Client-Side Profiling + +Client-side continuous profiling in production is not yet established practice. Platform tools provide opt-in session profiling: + +| Platform | Tool | Mode | Notes | +|----------|------|------|-------| +| iOS | Instruments (Time Profiler, Allocations) | Developer / TestFlight | Requires Xcode; not continuous in App Store builds | +| Android | Android Studio Profiler (CPU, Memory) | Developer / debug build | Production CPU profiling requires `<profileable>` manifest flag | + +For production mobile observability, rely on crash analytics and RUM metrics to surface performance regressions. Cross-ref `../layers/L7-application/mobile-rum.md` for mobile RUM patterns and `../layers/L7-application/crash-analytics.md` for crash rate metrics. + +--- + +## 9. Storage Backends + +| Backend | Primary use | Symbol resolution | +|---------|------------|-------------------| +| Parca server | OSS; local or k8s deployment; uses Parquet-backed storage | Upload ELF debug info / DWARF symbols; agent resolves at collection time | +| Pyroscope OSS | Self-hosted; integrates with Grafana stack | Upload symbol files per build | +| Grafana Cloud Profiles | Managed Pyroscope; SaaS | Upload symbols via Pyroscope API | +| Polar Signals Cloud | Managed Parca; SaaS; automatic symbol upload | Symbols uploaded via CI step; no manual resolution needed | + +Symbol resolution is a critical operational concern. Without debug symbols (or source maps for Node.js), flamegraph frames appear as raw memory addresses or mangled names. Treat symbol upload as a required CI step alongside binary deployment — it has the same relationship to profiles as sourcemaps have to crash analytics. + +--- + +## 10. Privacy and Security + +Stack traces expose function names, which may reveal proprietary algorithm structure or internal service architecture. This is especially sensitive for multi-tenant SaaS operators. + +- Apply access control to the profiling backend (Parca server, Pyroscope) equivalent to trace backend access control. Cross-ref `../signals/privacy.md §Backend RBAC`. +- For multi-tenant deployments, isolate flamegraph access by tenant label. A tenant MUST NOT be able to query another tenant's flamegraph. +- Function name redaction (stripping internal frames from exported profiles) is technically possible but rare and operationally complex. Consider it only if profile data is shared externally (e.g., vendor support handoff). + +--- + +## 11. 
Matrix Cells — Profiles Column Summary + +Cross-ref: `../matrix.md` for full cell detail. Summary for the profiles signal column: + +| Layer × Boundary | profiles cell | Rationale | +|-----------------|---------------|-----------| +| L3 × all | ❌ N/A | IP routing has no process-level profiling artifact | +| L4 × multi-tenant | ✅ | eBPF CPU/off-CPU profiling covers socket-level overhead per process | +| L4 × cross-application | ✅ | eBPF off-CPU shows network-wait time between services | +| L4 × slo | ❌ N/A | SLO is application-defined; L4 profiles are diagnostic, not SLO inputs | +| L4 × release | ❌ N/A | Release events are L7 constructs | +| mesh × all | ⚠️ | Proxy CPU visible via eBPF on sidecar process; not mesh-native | +| L7 × multi-tenant | ✅ | Per-tenant flamegraph via label-based isolation in Parca/Pyroscope | +| L7 × cross-application | ✅ | Application-level profiling with trace_id correlation | +| L7 × slo | ❌ N/A | SLO is computed from metrics; profiles are diagnostic | +| L7 × release | ⚠️ | Pre/post-release profile diff for performance regression detection | + +--- + +## 12. Anti-Patterns + +These candidates belong in `../anti-patterns.md` and are flagged here for consolidation. + +| Anti-pattern | Risk | Mitigation | +|-------------|------|-----------| +| Continuous profiling enabled in production without overhead measurement in staging | Unexpected CPU spike > 1%; service degradation | Benchmark overhead in staging first; set alert on profiler CPU budget | +| Symbol upload missing at deploy time | Flamegraphs show raw addresses or mangled names; engineers cannot act on data | Add symbol upload as a required CI step immediately after binary upload; gate deployment on symbol upload success | +| `/debug/pprof` endpoint left enabled on production HTTP servers | Exposes CPU/heap profiles and internal function names to any caller who discovers the path; also a denial-of-service vector (profile requests are expensive) | Disable `net/http/pprof` import in production builds, or restrict to internal network behind authentication middleware | +| Profile labels include high-cardinality dimensions (e.g., `request_id`, `user_id`) | Storage blowup; query performance degradation in Parca/Pyroscope backend | Label with low-cardinality dimensions only: `service.name`, `service.version`, `tenant.id` (for multi-tenant), `environment` | + +--- + +## 13. 
References + + +Internal cross-references: + +- `../standards.md §3` — OTel semconv stability tiers (experimental badge rationale) +- `../matrix.md` — full 112-cell coverage matrix +- `../layers/L4-transport.md` — eBPF profiling at socket level +- `../layers/L7-application/mobile-rum.md` — mobile performance signals +- `../layers/L7-application/crash-analytics.md` — crash rate and symbolication +- `../boundaries/multi-tenant.md` — per-tenant label isolation +- `../boundaries/release.md` — pre/post-release profile comparison +- `../signals/privacy.md` — backend RBAC and access control +- `../anti-patterns.md` — consolidated anti-pattern list + +## References + +- OTEP 0239 (profiling data model) — <https://github.com/open-telemetry/oteps/blob/main/text/profiles/0239-profiles-data-model.md> +- Parca — <https://parca.dev> +- Grafana Pyroscope — <https://grafana.com/oss/pyroscope/> +- Polar Signals Cloud — <https://polarsignals.com> +- async-profiler — <https://github.com/jvm-profiling-tools/async-profiler> +- py-spy — <https://github.com/benfred/py-spy> +- CNCF Landscape (verify current CNCF status) — <https://landscape.cncf.io> diff --git a/.agents/skills/oma-observability/resources/signals/traces.md b/.agents/skills/oma-observability/resources/signals/traces.md new file mode 100644 index 0000000..91992ec --- /dev/null +++ b/.agents/skills/oma-observability/resources/signals/traces.md @@ -0,0 +1,360 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +specs: + - "W3C Trace Context: Level 1 Recommendation 2020-02-06" +--- + +# Distributed Traces Signal + +## 1. Scope + +Traces are the "T" in MELT+P — the correlation backbone of distributed observability. A single trace links every span produced by every service that participated in one request, job, or message. + +Covers: OTel span data model, SpanKind, W3C Trace Context, DB patterns (N+1, slow query, pool exhaustion, `db.*` Stable), messaging patterns (Kafka, Flink, Spark, DLQ, `messaging.*`), RPC (`rpc.*` RC), sampling, backends, exemplars, baggage security. + +Cross-references: +- `../boundaries/cross-application.md` — propagator matrix per cloud and mesh vendor +- `../layers/mesh.md` — zero-code auto-instrumentation; propagator headers + +--- + +## 2. OTel Span Data Model (Stable) + +Source: <https://opentelemetry.io/docs/specs/otel/trace/api/> + +Every span carries the following fields: + +| Field | Type | Description | +|-------|------|-------------| +| `TraceId` | 16-byte hex | Globally unique identifier for the entire trace | +| `SpanId` | 8-byte hex | Unique identifier for this span | +| `ParentSpanId` | 8-byte hex | SpanId of the parent; absent for root spans | +| `Name` | string | Operation name (e.g., `HTTP GET /api/orders`) | +| `Kind` | enum | SpanKind — see table below | +| `StartTime` | timestamp | Monotonic clock at span start | +| `EndTime` | timestamp | Monotonic clock at span end | +| `Attributes` | key-value map | Semantic convention attributes describing the operation | +| `Events` | list | Timestamped structured log records within the span | +| `Links` | list | References to other spans (used for async messaging) | +| `Status` | enum + string | `UNSET`, `OK`, or `ERROR` with optional description | + +### SpanKind + +SpanKind describes the role of the span in the larger distributed operation. Backends use it to render topology maps and assign latency accountability. 
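+
+To make the role explicit at creation time, pass the kind as a span option. A minimal Go sketch (assuming `tracer` is an already-initialized `trace.Tracer` from `go.opentelemetry.io/otel/trace`; the span name is illustrative):
+
+```
+// Mark this span as the server side of an inbound request so the backend
+// attributes the latency to this service.
+ctx, span := tracer.Start(ctx, "GET /api/orders",
+    trace.WithSpanKind(trace.SpanKindServer))
+defer span.End()
+```
+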
+ +| SpanKind | Role | Typical origin | +|----------|------|----------------| +| `INTERNAL` | Default; in-process operation with no network boundary | Business logic, cache lookup, CPU computation | +| `SERVER` | Incoming synchronous request handled by this service | HTTP server, gRPC server handler | +| `CLIENT` | Outgoing synchronous call from this service to another | HTTP client, DB driver, gRPC stub | +| `PRODUCER` | Async message sent to a queue or topic | Kafka producer, RabbitMQ publisher | +| `CONSUMER` | Async message received from a queue or topic | Kafka consumer, worker pulling from queue | + +Rule of thumb: sync RPC produces a `CLIENT`+`SERVER` pair sharing the same `TraceId`. Async messaging produces a `PRODUCER`+`CONSUMER` pair connected by a span link (not parent-child), because the two operations run independently. + +--- + +## 3. W3C Trace Context Propagation + +Source: <https://www.w3.org/TR/trace-context/> + +### Header format + +``` +traceparent: 00-<trace-id>-<span-id>-<flags> +``` + +- `00` — version (currently always `00`) +- `<trace-id>` — 32 hex chars (128-bit TraceId) +- `<span-id>` — 16 hex chars (64-bit SpanId of the sending span) +- `<flags>` — 8-bit flags; `01` = sampled, `00` = not sampled + +Example: + +``` +traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01 +tracestate: vendor1=abc123,vendor2=xyz789 +``` + +`tracestate` carries vendor-specific opaque state as a comma-separated `key=value` list forwarded unchanged by intermediaries that do not own the key. + +### Propagation rules + +- **Inject** `traceparent` on every outbound HTTP/gRPC call; **extract** at every inbound call before creating the root span. +- Stripping `traceparent` silently is anti-pattern #1 in `../anti-patterns.md`. +- Cross-cloud/mesh vendor header bridging: `../boundaries/cross-application.md §Propagators`. +- Mesh-injected headers (Envoy, Linkerd): `../layers/mesh.md §Propagator headers`. + +--- + +## 4. Database Tracing Patterns (db.* semconv Stable) + +Source: <https://opentelemetry.io/docs/specs/semconv/database/> + +All DB spans use `SpanKind = CLIENT`. The span name follows the pattern `<db.operation.name> <db.collection.name>` (e.g., `SELECT orders`). + +### Core db.* attributes + +| Attribute | Stability | Example | Notes | +|-----------|-----------|---------|-------| +| `db.system` | Stable | `postgresql`, `mysql`, `mongodb`, `redis` | Required; identifies the DB technology | +| `db.operation.name` | Stable | `SELECT`, `INSERT`, `HMGET` | SQL verb or command name | +| `db.query.text` | Stable | `SELECT * FROM orders WHERE id = $1` | Parameterized query text; PII risk — see caveat | +| `db.namespace` | Stable | `mydb` | Database name / schema | +| `db.collection.name` | Stable | `orders` | Table or collection name | +| `db.client.connections.used` | Development | `14` | Active connections in pool (attribute name varies by semconv draft — pin to version in `../standards.md`) | +| `db.client.connection.pool.utilization` | Development | `0.875` | Ratio: used / max (current semconv draft name; older drafts use `db.client.connections.usage`) | + +`db.query.text` caveat: WHERE clause literals may contain PII (email addresses, phone numbers). Always use parameterized queries so that literals are replaced by `$1`, `?`, or `:name` placeholders. If the ORM or driver captures the raw query, apply redaction at the OTel SDK layer or in the Collector processor. Cross-ref `../signals/privacy.md`. 
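+
+Where SDK-layer sanitization is not available, a Collector-side fallback can scrub literals before export. A minimal sketch using the `transform` processor (the OTTL regex is illustrative only; it blanks single-quoted literals and is not a complete PII filter):
+
+```yaml
+processors:
+  transform/db_query_redaction:
+    trace_statements:
+      - context: span
+        statements:
+          # Blank out quoted string literals that escaped parameterization
+          - replace_pattern(attributes["db.query.text"], "'[^']*'", "'?'")
+```
+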
### 4.1 N+1 Query Detection
+
+Symptom: a single parent `SERVER` span contains N child `CLIENT` spans that all share the same `db.operation.name` and `db.collection.name`, each taking 2-30 ms, producing an additive latency of N × latency_per_query.
+
+Trace waterfall example:
+
+```
+[SERVER] POST /api/orders .................... 480ms
+  [CLIENT] SELECT orders WHERE ... .. 18ms
+  [CLIENT] SELECT orders WHERE ... .. 17ms
+  [CLIENT] SELECT orders WHERE ... .. 17ms
+  [CLIENT] SELECT orders WHERE ... .. 18ms
+  ... (× 24 more)
+```
+
+Detection:
+
+- OTel trace visualizers (Honeycomb BubbleUp, Datadog Watchdog, Grafana Tempo TraceQL) can flag traces where `count(db.CLIENT spans per SERVER span) > N_threshold`.
+- TraceQL example (Grafana Tempo): `{ span.db.system = "postgresql" } | count() > 10`
+- Honeycomb: group by `trace.parent_id`, count `db.*` child spans, alert on p95 count > 10.
+
+Remediation: replace per-row lookups with eager loading (ORM `include`/`join`), batch queries (`SELECT ... WHERE id = ANY($1)`), or the DataLoader pattern for GraphQL.
+
+### 4.2 Slow Query Analysis
+
+Slow queries surface as `CLIENT` spans with duration above the p99 threshold.
+
+1. Index `db.query.text` + `db.operation.name` against a `span.duration` histogram in the trace backend.
+2. Flag spans where `span.duration > 100ms` at p99 (adjust threshold per SLO).
+3. Correlate `db.query.text` with `pg_stat_statements` / MySQL `slow_query_log` to obtain execution plans.
+4. Apply `EXPLAIN ANALYZE` and add the missing index or rewrite the query.
+
+Privacy: if `db.query.text` captures literal values, redact before export — see `../signals/privacy.md §query text redaction`.
+
+### 4.3 Connection Pool Exhaustion
+
+Symptom: the blocking span is on the **client side** (waiting to acquire a pool connection), not inside the DB. `CLIENT` span duration grows while actual DB response time stays flat.
+
+```
+[SERVER] POST /api/orders .............. 5200ms
+  [CLIENT] acquire pool connection ............ 5190ms ← pool wait, no DB call yet
+  [CLIENT] SELECT orders WHERE ... . 10ms
+```
+
+Attributes to monitor: `db.client.connections.used` (Development) and `db.client.connection.pool.utilization` (Development; alert at > 0.85; older drafts name it `db.client.connections.usage`, per the core attribute table above). Cross-ref `../signals/metrics.md` for `pool_wait_count` and `pool_wait_duration_seconds` histograms.
+
+Remediation: tune pool size to expected concurrency (10-20 connections per pod); add circuit breaker upstream; split read/write pools for long-running read transactions.
+
+---
+
+## 5. Messaging Tracing Patterns (messaging.* semconv)
+
+Source: <https://opentelemetry.io/docs/specs/semconv/messaging/>
+
+### Core messaging.* attributes
+
+| Attribute | Example | Notes |
+|-----------|---------|-------|
+| `messaging.system` | `kafka`, `rabbitmq`, `nats` | Required |
+| `messaging.destination.name` | `orders-topic` | Topic or queue name |
+| `messaging.operation` | `publish`, `create`, `receive`, `deliver`, `settle`, `process` | Operation type per semconv 1.27.0 (note: the attribute is spelled `messaging.operation.type` in latest drafts; pin via `../standards.md`). `settle` covers ack/nack; `ack` alone is NOT a valid enum value |
+| `messaging.message.id` | `msg-uuid-1234` | Message identifier for dedup |
+| `messaging.kafka.message.key` | `order-9988` | Kafka partition key |
+| `messaging.kafka.consumer.group` | `payment-consumer-group` | Kafka consumer group |
+
+### SpanKind assignment
+
+- `PRODUCER` — set when calling `producer.send()` or equivalent. 
+- `CONSUMER` — set when calling `consumer.poll()` or the message handler. +- Connect the two with a **span link** (not parent-child): async messaging breaks the synchronous call chain; span links preserve trace continuity across the queue boundary. + +``` +[PRODUCER] send orders-topic .. 3ms TraceId: aaa + Link → [CONSUMER] receive orders-topic ... 220ms TraceId: aaa + [CLIENT] INSERT INTO payments .. 18ms +``` + +### 5.1 Kafka-Specific Patterns + +Inject W3C `traceparent` into Kafka message headers at the producer; extract at the consumer before creating the `CONSUMER` span. OTel Kafka instrumentation handles this automatically (`opentelemetry-instrumentation-kafka` for Java/Python). + +Kafka consumer lag is an external metric, not a trace attribute: MSK (`kafka.consumer_lag` CloudWatch), Confluent Cloud (built-in), self-managed (`kafka_consumer_group_lag` via JMX + Prometheus). Cross-ref `../signals/metrics.md §messaging`. + +**Primary-topic lag alert threshold (recommended starting point):** +- Warn: `kafka_consumer_group_lag > 10_000 for 5m` (queue growing faster than consumer can drain) +- Page: `kafka_consumer_group_lag > 100_000 for 10m` (backlog will breach SLO if unaddressed) +- Calibrate per topic throughput and consumer group parallelism; high-throughput topics routinely run at 50k–500k lag without impact. + +### 5.2 Flink and Spark Streaming + +Create one span per pipeline stage; the trace covers end-to-end job from source read to sink write. Flink OTel instrumentation is emerging (not stable as of semconv 1.27.0) — use `messaging.system = kafka` for source/sink spans and `INTERNAL` for operator chains. Spark: instrument at the `foreachBatch` boundary; each micro-batch is one `INTERNAL` span. Checkpoint metrics: cross-ref `../signals/metrics.md §streaming`. + +### 5.3 Dead Letter Queue (DLQ) Observability + +A DLQ receives messages that failed all retries. Losing trace context at DLQ ingestion prevents diagnosis. + +1. **Trace propagation**: copy the original `traceparent` from the failed message headers into the DLQ message headers — do not generate a new trace ID. +2. **DLQ depth alert**: alert on `kafka_consumer_group_lag{topic="orders-dlq"}` > 0 for critical queues. +3. **DLQ arrival span**: create a `CONSUMER` span at DLQ arrival with `messaging.destination.name = orders-dlq` and the original `trace_id`. +4. **Replay tooling**: re-inject the original trace context into replayed messages. A new `trace_id` at replay orphans the original failure span. + +--- + +## 6. RPC Tracing (rpc.* semconv RC) + +Source: <https://opentelemetry.io/docs/specs/semconv/rpc/grpc/> + +### gRPC + +| Attribute | Example | +|-----------|---------| +| `rpc.system` | `grpc` | +| `rpc.service` | `com.example.OrderService` | +| `rpc.method` | `CreateOrder` | + +SpanKind: `CLIENT` on the caller; `SERVER` on the callee. The `traceparent` header is propagated via gRPC metadata. + +### HTTP RPC + +Use `http.*` attributes for REST and JSON-RPC over HTTP. `rpc.*` is for binary RPC protocols only. + +### Error handling + +Set `status.code = ERROR` and `status.description` on RPC failures. Use `span.recordException(e)` to atomically populate `exception.type`, `exception.message`, and `exception.stacktrace` — required by MRA in `../incident-forensics.md §2.2`. + +--- + +## 7. Trace Sampling + +Cross-ref `../transport/sampling-recipes.md` for full configuration recipes. 
+ +| Strategy | Pros | Cons | +|----------|------|------| +| Head-based | Low overhead; decision at trace root | Cannot retain traces that only show problems at a downstream hop | +| Tail-based | Trace-complete decision; keeps all error traces | Requires buffering the full trace before deciding; higher Collector memory | + +Recommended recipe: + +- 100% of traces where `status.code = ERROR` +- 100% of traces where business cost > $0.50 (custom attribute) +- 5-10% tail sampling of remaining traces for baseline coverage + +For multi-replica tail sampling, use consistent hashing on `trace_id` via the OTel Collector `loadbalancing` exporter to route all spans of one trace to the same Collector instance. + +--- + +## 8. Trace Backends + +Cross-ref `../vendor-categories.md` for full vendor category taxonomy. + +| Backend | Category | Key differentiator | +|---------|----------|--------------------| +| Jaeger (CNCF Graduated) | OSS full-stack | Kubernetes-native; Badger or Cassandra storage | +| Grafana Tempo | OSS full-stack | Low-cost object storage (S3/GCS); TraceQL query language | +| Zipkin | OSS (legacy) | Older wire format; superseded by OTel/Jaeger for new deployments | +| Honeycomb | High-cardinality specialist | BubbleUp auto-analysis; best for high-attribute-count traces | +| Datadog APM | Commercial SaaS | Deep integration with Datadog metrics and logs | +| Elastic APM | Commercial SaaS | Tight Kibana/Elasticsearch integration | +| SigNoz | OSS full-stack | ClickHouse-backed; cost-efficient at high ingestion rates | + +--- + +## 9. Trace Exemplars + +Exemplars link a metric data point to a specific trace, enabling direct navigation from a PromQL alert to the trace that caused the spike. + +- Prometheus exemplar support is available since v2.31.0. Histograms and summaries can carry a `trace_id` label on selected samples. +- OTel SDK Java, Go, and Python emit exemplars automatically when the current span context is sampled. +- Usage: click a spike in a Grafana panel backed by a Prometheus histogram → the exemplar tooltip shows a `trace_id` link → click to open the trace in the configured trace backend (Tempo, Jaeger, etc.). + +```promql +# Example: p99 latency histogram with exemplar-capable query +histogram_quantile(0.99, + sum by (le, service_name) ( + rate(http_server_request_duration_seconds_bucket{ + deployment_environment="prod" + }[5m]) + ) +) +``` + +Cross-ref `../incident-forensics.md §Step 2` for the exemplar-based trace acquisition flow. + +--- + +## 10. Propagating Baggage (Security and Privacy) + +W3C Baggage (`baggage` header) carries key-value pairs alongside the trace context. It is not part of the trace itself — it is a side-channel for business attributes that downstream services need. + +### Allowed vs. prohibited baggage content + +| Allowed | Prohibited | +|---------|-----------| +| `tenant.id` | User email address | +| `user.tier` (tier name, not PII) | Session tokens or auth cookies | +| `deployment.sha` | Passwords or API keys | +| `feature.flag.name` | Any field classified as PII under GDPR/PIPA | + +W3C Baggage Recommendation §3.1: baggage propagates to all downstream services within the same distributed operation. Any data placed in baggage is visible to every service on the call path, including third-party services. Data that is proprietary, confidential, or personally identifiable MUST NOT be placed in baggage without explicit downstream trust agreement. 
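+
+A minimal Go sketch of populating only allowlisted keys (package `go.opentelemetry.io/otel/baggage`; the tenant value is illustrative):
+
+```
+// Everything placed here is visible to every downstream hop, so only
+// allowlisted, non-PII keys are added.
+member, err := baggage.NewMember("tenant.id", "tenant-42")
+if err != nil {
+    // invalid key or value; drop rather than propagate
+}
+bag, _ := baggage.New(member)
+ctx = baggage.ContextWithBaggage(ctx, bag)
+```
+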
+ +Enforcement rule: strip or validate incoming baggage at the API gateway / ingress before forwarding to internal services. The mesh ingress gateway is the correct enforcement point — see `../layers/mesh.md §Baggage scrubbing`. + +Cross-ref: + +- `../boundaries/cross-application.md` — trust-boundary baggage validation rules per cloud and mesh +- `../signals/privacy.md` — PII classification and redaction for baggage content + +--- + +## 11. Matrix Coverage (traces column) + +These cells from `../matrix.md` have trace-specific behavior worth noting: + +| Layer | Boundary | Status | Rationale | +|-------|----------|--------|-----------| +| L3-network | cross-application | ⚠️ | L3 packets carry no trace context natively; use trace-ID tagging at egress only | +| L4-transport | any | ⚠️ | TCP is not trace-native; trace context begins at mesh or L7 | +| mesh | cross-application | ✅ | Primary trace origin; Envoy injects spans with zero application code changes | +| L7-application | cross-application | ✅ | Primary; W3C `traceparent` on all outbound HTTP/gRPC calls | +| L7-application | multi-tenant | ✅ | `tenant.id` propagated in W3C Baggage on every span | +| L7-application | release | ✅ | `service.version` resource attribute on all spans; canary routing by version | +| L7-application | slo | ⚠️ | Tail sampling must retain error traces within the error budget window; traces are forensic input, not the SLI computation source | + +--- + +## 12. Anti-patterns + +These candidates extend `../anti-patterns.md`. Each pattern breaks trace continuity or introduces compliance risk. + +| Anti-pattern | Consequence | Remediation | +|---|---|---| +| Missing parent spans (orphan spans) | Trace visualizer shows disconnected spans; root cause chain is broken | Verify `traceparent` extraction at every inbound call; check SDK auto-instrumentation coverage | +| `db.query.text` captured with untrimmed PII | Email or phone literal in WHERE clause stored in trace backend; GDPR/PIPA violation | Parameterize all queries; add OTel SDK `db.sanitize_query` option or Collector redaction processor | +| Async messaging without span links | `PRODUCER` and `CONSUMER` spans in separate disconnected traces; cannot follow a message end-to-end | Use OTel SDK `span.addLink()` with the producer span context when creating the consumer span | +| DLQ drain creates a new `trace_id` | Original failure span is orphaned; forensic link to the root cause is lost | Re-inject original `traceparent` from DLQ message headers into the replayed message headers | +| Head-based sampling on multi-service paths | A decision to not sample at the entry service drops all downstream spans, including error spans that only appear deep in the call graph | Use tail-based sampling with 100% error retention; see §7 | +| Kafka consumer lag unmonitored | DLQ silently fills; processing backlog not detected until downstream systems starve | Alert on `kafka_consumer_group_lag` per consumer group; cross-ref `../signals/metrics.md §messaging` | + +--- + +## References + +- OTel Trace API specification: <https://opentelemetry.io/docs/specs/otel/trace/api/> +- OTel Database semconv: <https://opentelemetry.io/docs/specs/semconv/database/> +- OTel Messaging semconv: <https://opentelemetry.io/docs/specs/semconv/messaging/> +- OTel RPC/gRPC semconv: <https://opentelemetry.io/docs/specs/semconv/rpc/grpc/> +- W3C Trace Context Level 1 Recommendation: <https://www.w3.org/TR/trace-context/> +- W3C Baggage Recommendation: <https://www.w3.org/TR/baggage/> +- Kafka 
producer/consumer instrumentation: <https://kafka.apache.org/documentation/> +- Confluent consumer lag metrics: <https://docs.confluent.io/platform/current/kafka/monitoring.html> diff --git a/.agents/skills/oma-observability/resources/standards.md b/.agents/skills/oma-observability/resources/standards.md new file mode 100644 index 0000000..aff9d7e --- /dev/null +++ b/.agents/skills/oma-observability/resources/standards.md @@ -0,0 +1,215 @@ +--- +otel_spec: "1.x (stable API/SDK)" +otel_semconv: "1.27.0 (2024-11)" +specs: + - "W3C Trace Context: Level 1 Recommendation 2020-02-06; Level 2 Candidate Recommendation" + - "W3C Baggage: Recommendation 2022-12-22" + - "ISO/IEC 25010: 2023; ISO/IEC 27001:2022; ISO/IEC 42010:2011" +notes: + - "Pinned versions (update quarterly or on spec promotion)" +--- + +# Observability Standards Reference + +## 1. Purpose + +This file defines the normative standards baseline for the `oma-observability` skill. It answers: + +- Which specifications govern trace propagation, attribute naming, and signal transport? +- How do ISO quality characteristics map onto observability concerns? +- Which OSI layers are in scope and why? +- What clock discipline is required for trustworthy timestamp ordering? + +All other files in this skill reference this document as the authoritative source. Implementers MUST read this before writing instrumentation, configuring collectors, or authoring SLO policies. + +--- + +## 2. Primary De Facto Standards + +### 2.1 OpenTelemetry (CNCF Incubating) + +Source: <https://opentelemetry.io/docs/specs/otel/> | CNCF: <https://www.cncf.io/projects/opentelemetry/> + +OpenTelemetry is the vendor-neutral, CNCF-incubating specification for telemetry APIs, SDKs, semantic conventions, and the wire protocol OTLP. It is the single de facto standard this skill operates on. + +Four pillars in scope for this skill: + +| Pillar | Specification | Wire protocol | +|--------|---------------|---------------| +| API / SDK | opentelemetry.io/docs/specs/otel/ | — | +| Semantic Conventions (semconv) | opentelemetry.io/docs/specs/semconv/ | — | +| Protocol (OTLP) | opentelemetry.io/docs/specs/otlp/ | gRPC :4317 / HTTP :4318 | +| Collector | opentelemetry.io/docs/collector/ | — | + +Key constraints: +- Use the stable semconv groups listed in Section 3 for production instrumentation. +- OTLP is the mandatory export format; vendor-native formats are acceptable only as secondary sinks. +- The OTel Operator (`v1beta1`) manages Collector and auto-instrumentation CRs in Kubernetes. + +### 2.2 W3C Trace Context + +- Level 1 Recommendation: <https://www.w3.org/TR/trace-context/> (published 2020-02-06) +- Level 2 Candidate Recommendation: <https://www.w3.org/TR/trace-context-2/> + +W3C Trace Context defines the `traceparent` and `tracestate` HTTP headers that establish a distributed trace across service boundaries. Level 1 is the production-stable baseline. Level 2 adds the `traceflags` precision extension and is safe to implement against CR status. + +Mandatory propagation rule: every outbound HTTP/gRPC call MUST forward `traceparent`. Stripping it silently is an anti-pattern. 
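+
+In OTel SDKs the active propagators can be pinned with the standard `OTEL_PROPAGATORS` environment variable; `tracecontext,baggage` is already the SDK default, but stating it explicitly documents the contract (values are the spec-defined identifiers):
+
+```
+export OTEL_PROPAGATORS="tracecontext,baggage"
+```
+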
+ +Vendor header compatibility reference (informative): +- AWS X-Ray: `X-Amzn-Trace-Id` — compatible bridge via OTel X-Ray propagator +- GCP Cloud Trace: prefers W3C; legacy `X-Cloud-Trace-Context` is supported for backward compat +- Datadog: `X-Datadog-Trace-Id` — use OTel Datadog exporter for bridging + +### 2.3 W3C Baggage + +Recommendation: <https://www.w3.org/TR/baggage/> (2022-12-22) + +W3C Baggage defines the `baggage` HTTP header for key-value pairs propagated alongside a trace. The W3C spec itself contains trust-boundary and PII guidance: baggage is visible to all downstream services and MUST NOT carry secrets, tokens, or personally identifiable information without explicit downstream trust agreement. + +OTel Baggage API reference: <https://opentelemetry.io/docs/specs/otel/baggage/api/> + +Usage rules enforced by this skill: +- Allowed: tenant ID, feature flag state, deployment SHA, region hint. +- Not allowed: user email, session tokens, authentication credentials. +- Trust boundary: strip or validate baggage at ingress gateway before forwarding to external services. + +--- + +## 3. OpenTelemetry Semconv Stability Tiers + +Semantic convention stability determines which attributes can be used in production without risk of breaking changes. Pin the semconv version in the file header above and update on quarterly review. + +Source: <https://opentelemetry.io/docs/specs/semconv/general/attribute-requirement-level/> + +| Tier | Groups | Production use | +|------|--------|----------------| +| Stable | `service.*`, `host.*`, `cloud.*`, `k8s.*`, `http.*`, `db.*` (core), `network.*` (core), `error.*` | Yes, without caveat | +| Release Candidate (RC) | `rpc.*`, gRPC semconv | Yes, expect minor changes | +| Development | `tls.*`, `network.connection.*` | Test environments; production use requires change-tolerance | +| Experimental | `gen_ai.*`, profiles (OTEP 0239 alpha) | Not for production SLOs | + +Notes: +- `network.*` core attributes (e.g., `network.protocol.name`, `network.transport`) are Stable. `network.connection.*` (e.g., `network.connection.type`, `network.connection.subtype`) are Development. +- `tls.*` (all) are Development as of semconv 1.27.0. For TLS deep inspection, use Wireshark or vendor-specific TLS tooling rather than OTel attributes. +- OTEP 0239 (profiling signal) is alpha. Parca and Pyroscope are in production, but the OTel profiling spec is not yet stable. Mark any profiling-related SLOs as experimental. + +Verified sources: +- TLS attrs: <https://opentelemetry.io/docs/specs/semconv/attributes-registry/tls/> +- Network attrs: <https://opentelemetry.io/docs/specs/semconv/attributes-registry/network/> +- RPC/gRPC: <https://opentelemetry.io/docs/specs/semconv/rpc/grpc/> + +--- + +## 4. ISO/IEC Indirect Mapping + +ISO/IEC 25010:2023 (<https://www.iso.org/standard/78176.html>) defines the Systems and Software Quality Model. There is no dedicated "Observability" quality characteristic in the 2023 edition. Observability concerns map indirectly through three characteristics: + +| ISO/IEC 25010:2023 characteristic | Sub-characteristic | Observability concern | +|---|---|---| +| Maintainability | Analysability | Can operators diagnose system state from telemetry? | +| Security | Accountability | Are actions traceable to an authenticated identity (audit logs)? | +| Reliability | Faultlessness | Does telemetry surface defects before users encounter them? | + +This mapping is informative, not normative. 
It is useful when presenting observability investment in terms that quality assurance or audit stakeholders recognize. + +Related standards: +- **ISO/IEC 27001:2022 / 27002:2022** (<https://www.iso.org/standard/27001>) — controls for information security management systems; governs log integrity, access control on observability backends, and audit trail requirements. +- **ISO/IEC 42010:2011** (<https://www.iso.org/standard/50508.html>) — architecture description standard. An "observability viewpoint" is a valid architecture viewpoint under 42010 for documenting how stakeholders inspect system internals. + +--- + +## 5. OSI Boundary Decision + +This skill operates on OSI layers L3, L4, mesh (L4-L7 hybrid), and L7. The following table makes the in-scope and out-of-scope decisions explicit. + +### In scope + +| Layer | Label | Coverage | +|-------|-------|----------| +| L3 | Network | IP routing, VPC flow logs, BGP/BMP, ICMP, PMTUD | +| L4 | Transport | TCP retransmits/RTT, eBPF (Beyla/Pixie), QUIC/HTTP3 transport | +| L4-L7 | Service mesh | Istio/Linkerd/Envoy: zero-code instrumentation, mTLS metadata as security context | +| L7 | Application | HTTP/gRPC/WebSocket, RUM (web + mobile), crash analytics, messaging | + +### Out of scope + +| Layer | Label | Reason | Use instead | +|-------|-------|--------|-------------| +| L1 | Physical | SaaS hypervisor hides; no OTel semconv coverage | Vendor DCIM tooling (Nlyte, Sunbird, Device42) | +| L2 | Data Link | Same as L1; SNMP/IPMI are hardware domains | Vendor DCIM tooling; SNMP exporters for Prometheus if needed | +| L5 | Session (full) | gRPC session semantics moved to L7 instrumentation; WebSocket deferred | WebSocket-specific vendor tooling | +| L6 | Presentation (full) | TLS kept as security context attributes only (`tls.*` Development tier); full TLS inspection is not OTel's domain | Wireshark for packet-level TLS; Cloudflare Radar for TLS ecosystem data; vendor-specific TLS inspection tooling | + +Rationale: approximately 90% of production debugging occurs at L3, L4, mesh, and L7. L1/L2 are opaque to SaaS workloads. L5/L6 full coverage requires OTel semconv maturity that does not yet exist (Development tier). This decision is design decision D2 in the design document. + +--- + +## 6. Clock Discipline + +Distributed traces depend on monotonic, synchronized clocks across all nodes. Clock drift corrupts waterfall charts and makes parent-before-child ordering unreliable. + +### Requirements + +- **NTP or chrony** MUST be running and synchronized on all host VMs and container hosts. +- Acceptable drift tolerance for trace timestamp correlation: **< 100 ms** (typical NTP accuracy on well-connected hosts). +- For sub-millisecond precision (financial, telco, or high-frequency workloads): use **PTP (IEEE 1588)** hardware timestamping. + +### Cloud hypervisor time sync notes + +| Cloud | Mechanism | Notes | +|-------|-----------|-------| +| AWS | Chrony + Amazon Time Sync Service (169.254.169.123) | PTP-backed time source; verify with `chronyc tracking` and `chronyc sources -v` | +| GCP | Internal hypervisor clock sync (`metadata.google.internal` via `time.google.com`) | GCP VMs inherit host clock. 
`timedatectl show` only confirms a sync daemon is active; to verify the actual offset, use `chronyc sources -v` (chrony) or `timedatectl timesync-status` (systemd-timesyncd) | +| Azure | Hyper-V IC timesync primary; external NTP fallback only if Hyper-V IC tools absent | Azure Linux VMs with the Hyper-V Integration Services use host time as the authoritative source. External NTP (for example `time.windows.com`) as a peer source can conflict with IC timesync; treat it as a fallback, not a parallel source. Docs: <https://learn.microsoft.com/azure/virtual-machines/linux/time-sync> | + +### Span timestamp validation rule + +A valid trace satisfies: + +``` +parent_span.start_time <= child_span.start_time +child_span.end_time <= parent_span.end_time +``` + +A violation where `child_span.end_time > parent_span.end_time` is a clock-drift indicator, not necessarily a code bug. Flag these in meta-observability pipeline checks (see `resources/meta-observability.md`). + +### Anti-pattern + +NTP drift left unmonitored is anti-pattern #18 in this skill: waterfall charts appear causally wrong, leading engineers to chase phantom race conditions. Monitor `otelcol_receiver_accepted_spans` and node-level NTP offset metrics together. + +--- + +## 7. Versioning and Review Cadence + +- The spec version block in this file's header MUST be updated when: + - Any listed semconv group promotes from Development → RC or RC → Stable. + - A listed W3C document advances maturity (CR → PR → Recommendation). + - OTel releases a new minor version with breaking semconv changes. +- **Review cadence**: quarterly, aligned with OTel spec releases and CNCF landscape updates (<https://landscape.cncf.io/>). +- When updating, also check and update affected signal/layer files that reference the changed attribute group. +- Skill minor version bump required on any normative change to this file. + +--- + +## 8. References + +Primary sources, in order of precedence: + +1. OpenTelemetry specification — <https://opentelemetry.io/docs/specs/otel/> +2. OTel Semantic Conventions — <https://opentelemetry.io/docs/specs/semconv/> +3. OTLP specification — <https://opentelemetry.io/docs/specs/otlp/> +4. W3C Trace Context L1 Recommendation — <https://www.w3.org/TR/trace-context/> +5. W3C Trace Context L2 Candidate Recommendation — <https://www.w3.org/TR/trace-context-2/> +6. W3C Baggage Recommendation — <https://www.w3.org/TR/baggage/> +7. ISO/IEC 25010:2023 — <https://www.iso.org/standard/78176.html> +8. ISO/IEC 27001:2022 — <https://www.iso.org/standard/27001> +9. ISO/IEC 42010:2011 — <https://www.iso.org/standard/50508.html> +10. CNCF Projects — <https://www.cncf.io/projects/> +11. CNCF Landscape (authoritative vendor registry) — <https://landscape.cncf.io/> +12. OTel TLS attributes registry — <https://opentelemetry.io/docs/specs/semconv/attributes-registry/tls/> +13. OTel network attributes registry — <https://opentelemetry.io/docs/specs/semconv/attributes-registry/network/> +14. OTel RPC/gRPC semconv — <https://opentelemetry.io/docs/specs/semconv/rpc/grpc/> +15. OTel Baggage API — <https://opentelemetry.io/docs/specs/otel/baggage/api/> +16. AWS Time Sync Service — <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/set-time.html> +17. Azure Linux time sync — <https://learn.microsoft.com/azure/virtual-machines/linux/time-sync> +18. 
OTEP 0239 (profiling signal) — <https://github.com/open-telemetry/oteps/blob/main/text/profiles/0239-profiles-data-model.md> diff --git a/.agents/skills/oma-observability/resources/transport/collector-topology.md b/.agents/skills/oma-observability/resources/transport/collector-topology.md new file mode 100644 index 0000000..11b7fe7 --- /dev/null +++ b/.agents/skills/oma-observability/resources/transport/collector-topology.md @@ -0,0 +1,304 @@ +--- +otel_spec: "1.x (stable API/SDK)" +tools: + - "OTel Operator: v1beta1" + - "Fluent Bit: CNCF Graduated" + - "OTel Collector: v0.122.x" +--- + +# Collector Topology: Deployment Modes and Kubernetes Patterns + + +--- + +## 1. Four Deployment Modes (OTel Operator `Spec.Mode`) + +The OpenTelemetry Operator manages collectors via the `OpenTelemetryCollector` CRD at **v1beta1** (beta stability as of 2025). The `spec.mode` field controls the deployment strategy. + +| Mode | Kubernetes Kind | Replicas | Typical Use Case | +|------|----------------|----------|-----------------| +| `deployment` | Deployment | 1..N | Gateway, tail sampling, aggregation | +| `daemonset` | DaemonSet | 1 per node | Node-level metrics, log collection | +| `statefulset` | StatefulSet | 1..N | Ordered, persistent-volume workloads | +| `sidecar` | injected container | 1 per pod | Serverless (Fargate, Cloud Run), isolation | + +**`Deployment` (default — Gateway use case)** +Runs as a standard Kubernetes Deployment. Suitable for centralized processing: batching, tail sampling, exporters to backend. Scales horizontally but cannot access node-local resources (e.g., `/var/log`, `/proc`). Source: [OTel Collector Deployment docs](https://opentelemetry.io/docs/collector/deployment/). + +**`DaemonSet` (Agent use case — one per node)** +Guarantees exactly one collector pod per node. Required for receivers that must run on the host: `hostmetrics`, `filelog`, `kubeletstats`. Accesses node filesystem and network namespaces directly. + +**`StatefulSet` (specialized)** +Useful when collectors need stable network identity or persistent volumes (e.g., Write-Ahead Log for durability). Uncommon; prefer Deployment unless stable pod identity is mandatory. + +**`Sidecar` (one per pod)** +Injected by the operator via pod annotation (`sidecar.opentelemetry.io/inject: "true"`). Runs in the same pod as the application. See Section 3 for when sidecar is appropriate. + +> CRD stability note: `OpenTelemetryCollector` v1beta1 is beta; `Instrumentation` CRD is also v1beta1. GA graduation is tracked in [opentelemetry-operator](https://github.com/open-telemetry/opentelemetry-operator) releases. + +--- + +## 2. 2026 Recommended Pattern: Two-Tier Hybrid + +The standard production pattern for Kubernetes clusters is a two-tier architecture separating node-level collection from centralized processing. 
+ +``` +App Pod(s) + │ OTLP gRPC/HTTP + ▼ +DaemonSet Agent (1 per node) + - hostmetrics receiver + - filelog receiver + - kubeletstats receiver + - k8sattributes processor (resource enrichment) + │ OTLP gRPC + ▼ +Deployment Gateway (N replicas, behind ClusterIP/LB) + - batch processor + - tail_sampling processor + - loadbalancing exporter (to tier-2 if needed) + - memory_limiter processor + │ OTLP / vendor protocol + ▼ +Observability Backend (e.g., Grafana Cloud, Jaeger, Prometheus) +``` + +**Agent layer responsibilities** (must be per-host): +- `hostmetrics` receiver — CPU, memory, disk, network from `/proc`, `/sys` +- `filelog` receiver — container log files from `/var/log/pods/` +- `k8sattributes` processor — enriches spans/metrics with pod, namespace, node labels +- `kubeletstats` receiver — pod/container resource metrics via Kubelet API + +**Gateway layer responsibilities**: +- `batch` processor — reduce export RPCs, improve compression +- `tail_sampling` processor — evaluate complete traces before sampling decision +- `loadbalancing` exporter — consistent hash routing to tail-sampler replicas +- `memory_limiter` processor — prevent OOM under backpressure + +Example agent `OpenTelemetryCollector` manifest snippet: + +```yaml +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: otel-agent +spec: + mode: daemonset + config: + receivers: + hostmetrics: + collection_interval: 30s + scrapers: + cpu: {} + memory: {} + filesystem: {} + filelog: + include: [/var/log/pods/*/*/*.log] + include_file_path: true + kubeletstats: + collection_interval: 20s + auth_type: serviceAccount + processors: + k8sattributes: + passthrough: false + extract: + metadata: [k8s.pod.name, k8s.namespace.name, k8s.node.name] + memory_limiter: + limit_mib: 400 + check_interval: 1s + exporters: + otlp: + endpoint: otel-gateway:4317 + service: + pipelines: + metrics: + receivers: [hostmetrics, kubeletstats] + processors: [memory_limiter, k8sattributes] + exporters: [otlp] + logs: + receivers: [filelog] + processors: [memory_limiter, k8sattributes] + exporters: [otlp] +``` + +--- + +## 3. When to Use Sidecar (NOT Default) + +Sidecar mode is **not recommended for standard Kubernetes nodes**. Use it only when DaemonSet is unavailable or per-pod isolation is a hard requirement. + +| Condition | Use Sidecar? | Reason | +|-----------|-------------|--------| +| AWS Fargate | Yes | DaemonSet not supported on Fargate nodes | +| GCP Cloud Run | Yes | No DaemonSet concept; sidecar containers supported | +| Strong per-pod isolation | Yes | Compliance or security boundary per workload | +| Multi-tenant pod separation | Yes | Each tenant's data must not cross pod boundaries | +| Standard Kubernetes nodes | No | N collectors for N pods; cost scales linearly | +| Tail sampling requirement | No | Trace spans split across pods; sampler sees incomplete traces | + +> Warning: Enabling sidecar injection on standard Kubernetes creates one collector instance per application pod. For a 100-pod deployment, this is 100 collectors. CPU and memory costs multiply accordingly. Always prefer DaemonSet on regular nodes. + +--- + +## 4. 
Collector Component Placement Reference + +Source: [OTel Kubernetes Collector Components](https://opentelemetry.io/docs/platforms/kubernetes/collector/components/) + +| Receiver / Processor | Preferred Mode | Notes | +|---------------------|---------------|-------| +| `kubeletstats` receiver | DaemonSet | Requires access to Kubelet API per node | +| `filelog` receiver | DaemonSet | Reads `/var/log/pods/` on each node | +| `hostmetrics` receiver | DaemonSet | Reads `/proc`, `/sys` — host-only access | +| `k8sattributes` processor | DaemonSet or Gateway | On agent: enriches at source; on gateway: enriches late | +| `prometheus` receiver | Deployment (with caveats) | No automatic horizontal scaling for scrape target distribution; use sharding manually | +| `tail_sampling` processor | Deployment (Gateway) | Requires complete trace; combine with `loadbalancing` exporter | +| `batch` processor | Both | Tuned independently per tier | +| `memory_limiter` processor | Both (mandatory) | Always include; prevents OOM under traffic spikes | + +> Prometheus scraping caveat: When multiple gateway replicas each run `prometheus` receiver, scrape targets are duplicated across replicas causing duplicate metrics. Use target allocator (`spec.targetAllocator`) provided by the OTel Operator to distribute scrape targets across replicas. + +--- + +## 5. Container Runtime Observability Integration + +**containerd and CRI-O** + +Both runtimes expose metrics and events relevant to infrastructure observability: +- Image pull duration and failure rates +- Container start/stop lifecycle events +- Runtime panic or OOM kill events + +Integration approaches: +- `hostmetrics` receiver with `process` scraper captures container process CPU/memory +- containerd exposes metrics via its built-in Prometheus endpoint (default `:1338/metrics`) +- CRI-O exposes metrics via `:9537/metrics`; scrape with `prometheus` receiver on the DaemonSet agent + +**cAdvisor integration** + +cAdvisor runs as part of the Kubelet on each node and exposes per-container CPU, memory, filesystem, and network metrics. + +```yaml +# Scrape cAdvisor via kubeletstats receiver (preferred) +receivers: + kubeletstats: + collection_interval: 20s + auth_type: serviceAccount + endpoint: "https://${env:K8S_NODE_IP}:10250" + insecure_skip_verify: true + metric_groups: [container, pod, node] +``` + +**Pod and container attribute enrichment** + +The `k8sattributes` processor queries the Kubernetes API to attach resource attributes to all telemetry: + +```yaml +processors: + k8sattributes: + auth_type: serviceAccount + passthrough: false + extract: + metadata: + - k8s.pod.name + - k8s.pod.uid + - k8s.deployment.name + - k8s.namespace.name + - k8s.node.name + - k8s.container.name + labels: + - tag_name: app.label.version + key: app.kubernetes.io/version + from: pod + pod_association: + - sources: + - from: resource_attribute + name: k8s.pod.ip +``` + +--- + +## 6. High-Throughput Gateway Scaling + +For high-volume environments, the gateway tier must scale to avoid becoming a bottleneck. + +**Horizontal scaling with consistent hashing** + +Tail sampling requires that all spans for a given `trace_id` reach the same collector replica. 
Use `loadbalancing` exporter in a first-tier gateway to route traces consistently: + +```yaml +exporters: + loadbalancing: + protocol: + otlp: + tls: + insecure: true + resolver: + k8s: + service: otel-tail-sampler-headless + ports: [4317] + routing_key: traceID # consistent hash on trace_id +``` + +**Two-tier gateway pattern for tail sampling** + +``` +DaemonSet Agents + │ + ▼ +Gateway Tier-1 (Deployment, 3+ replicas) + loadbalancing exporter → consistent hash by trace_id + │ + ▼ +Gateway Tier-2 (Deployment, 3+ replicas) + tail_sampling processor + batch + memory_limiter + │ + ▼ +Backend +``` + +Tier-1 only routes; Tier-2 owns sampling decisions. This prevents trace fragmentation across replicas. + +--- + +## 7. Federated / Multi-Cluster + +For multi-cluster and multi-region deployments: + +``` +Cluster A (Region US) Cluster B (Region EU) + DaemonSet Agents DaemonSet Agents + │ │ + Regional Gateway Regional Gateway + (edge aggregation) (edge aggregation) + │ │ + └──────────┬────────────────────┘ + ▼ + Central Gateway (single region or multi-region active-active) + │ + Backend +``` + +Key considerations: +- **Cross-cloud egress cost**: Exporting telemetry across cloud providers or regions incurs data transfer charges. Evaluate sampling aggressively at the regional edge to reduce egress volume before forwarding to the central gateway. +- **Network latency**: Tail sampling decision wait (typically 30s) must account for cross-region span arrival delay. Increase `decision_wait` accordingly. +- **Authentication**: Use workload identity / OIDC per cluster; avoid static credentials for cross-cluster OTLP export. + +--- + +## 8. Anti-Patterns + +| Anti-Pattern | Why It Fails | Correct Approach | +|-------------|-------------|-----------------| +| Sidecar collectors on standard Kubernetes nodes | One collector per pod; CPU/memory cost scales with pod count | Use DaemonSet agent per node | +| Tail sampling in sidecar | Sidecar sees only spans from its pod; trace is incomplete across services | Run tail_sampling in gateway tier only | +| Single gateway replica | No high availability; single point of failure under load | Minimum 2-3 replicas; use PodDisruptionBudget | +| Missing `memory_limiter` processor | OOM kill under traffic spikes or backpressure from backend | Always add memory_limiter as first processor in every pipeline | +| Prometheus receiver on multiple replicas without target allocator | Duplicate scrapes = duplicate metrics in backend | Use OTel Operator target allocator for Prometheus receiver | +| Skipping `loadbalancing` exporter before tail sampler | Spans for same trace land on different replicas; sampling decision is wrong | Always pair tail_sampling with loadbalancing exporter upstream | + +## References + +- https://opentelemetry.io/docs/platforms/kubernetes/collector/components/ +- https://github.com/open-telemetry/opentelemetry-operator +- https://opentelemetry.io/docs/collector/deployment/ diff --git a/.agents/skills/oma-observability/resources/transport/otlp-grpc-vs-http.md b/.agents/skills/oma-observability/resources/transport/otlp-grpc-vs-http.md new file mode 100644 index 0000000..cc706ab --- /dev/null +++ b/.agents/skills/oma-observability/resources/transport/otlp-grpc-vs-http.md @@ -0,0 +1,256 @@ +--- +otel_spec: "1.x (stable API/SDK)" +--- + +# OTLP Transport: gRPC vs HTTP Decision Guide + +## 1. OTLP Basics + +The OpenTelemetry Protocol (OTLP) uses a single Protobuf schema for traces, metrics, and logs. 
+It defines three wire formats over two transports: + +| Transport | Content-Type | Port (default) | +|----------------|------------------------------------|----------------| +| gRPC | `application/grpc` | **4317** | +| HTTP/Protobuf | `application/x-protobuf` | **4318** | +| HTTP/JSON | `application/json` | **4318** | + +All three formats carry identical semantic data. Transport choice is an operational decision. + +Sources: +- OTLP specification: opentelemetry.io/docs/specs/otlp/ +- Collector configuration: opentelemetry.io/docs/collector/configuration/ +- OTLP receiver README: github.com/open-telemetry/opentelemetry-collector (receiver/otlpreceiver) + +--- + +## 2. Decision Tree + +``` +Is the sender a browser or a runtime without gRPC support? + YES → HTTP/protobuf (port 4318) — smallest payload, wide compatibility + HTTP/JSON if debugging or if protobuf encoding unavailable + + NO → Does a corporate proxy or layer-7 firewall sit between sender and collector? + YES → HTTP (port 4318); proxies handle HTTP/1.1 and HTTP/2 cleartext reliably + Confirm with: curl -v http://<collector>:4318/v1/traces + + NO → Is traffic internal to a Kubernetes cluster (pod-to-pod)? + YES → gRPC (port 4317): multiplexed streams, lower per-RPC overhead, + native health checking via grpc_health_probe + Use headless Service or client-side LB (see section 6) + + NO → Is volume high (>50 K spans/sec) or latency critical? + YES → gRPC + gzip compression + NO → Either transport works; prefer gRPC for consistency +``` + +| Axis | Recommended transport | Reason | +|---------------------------|-----------------------|-------------------------------------------------| +| Browser / Electron | HTTP/JSON or HTTP/protobuf | No gRPC-Web support in standard fetch API | +| k8s pod-to-pod | gRPC 4317 | Multiplexing, streaming, native health check | +| Corporate proxy / firewall| HTTP 4318 | Proxies often terminate gRPC streams early | +| VPN with deep inspection | HTTP 4318 | gRPC ALPN negotiation blocked by some DPI | +| High-volume batch | gRPC + gzip | Header compression (HPACK), stream reuse | +| IoT / constrained device | HTTP/JSON | Simpler implementation, no Protobuf required | + +--- + +## 3. Port Conventions + +| Port | Protocol | Notes | +|------|----------|--------------------------------------------------------| +| 4317 | gRPC | IANA-registered; TLS optional (recommended in prod) | +| 4318 | HTTP | IANA-registered; paths `/v1/traces`, `/v1/metrics`, `/v1/logs` | + +Do not expose either port to the public internet without TLS and authentication. + +--- + +## 4. 
Server-Side Tuning (Collector Receiver)
+
+### 4.1 gRPC Receiver
+
+```yaml
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: "0.0.0.0:4317"
+        max_recv_msg_size_mib: 16      # default 4 MiB; raise for large trace batches
+        max_concurrent_streams: 100    # per-connection stream limit
+        read_buffer_size: 524288       # 512 KiB read buffer per connection
+        keepalive:
+          server_parameters:
+            time: 30s                  # ping interval when idle
+            timeout: 5s                # wait before closing unresponsive connection
+          enforcement_policy:
+            min_time: 10s              # reject client pings faster than this
+            permit_without_stream: true
+```
+
+### 4.2 HTTP Receiver
+
+```yaml
+receivers:
+  otlp:
+    protocols:
+      http:
+        endpoint: "0.0.0.0:4318"
+        max_request_body_size: 16777216  # 16 MiB; matches gRPC max_recv_msg_size_mib
+        include_metadata: true           # make request headers available to the pipeline as client metadata
+        cors:
+          allowed_origins: ["https://app.example.com"]
+          allowed_headers: ["*"]
+```
+
+### 4.3 Memory Limiter (Required)
+
+Place `memory_limiter` first in every pipeline — it back-pressures receivers before OOM.
+
+```yaml
+processors:
+  memory_limiter:
+    check_interval: 1s
+    limit_mib: 512
+    spike_limit_mib: 128   # refuse new data when usage > limit_mib - spike_limit_mib
+
+  batch:
+    send_batch_size: 8192
+    send_batch_max_size: 16384
+    timeout: 5s
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [memory_limiter, batch]
+      exporters: [otlp/tier2]
+```
+
+---
+
+## 5. Client-Side Tuning (Exporter)
+
+### 5.1 Compression
+
+gzip is the default and should remain enabled (typically 70–85% payload reduction).
+Do not disable it unless your TLS offload device double-compresses.
+
+```yaml
+exporters:
+  otlp:
+    endpoint: "collector:4317"
+    compression: gzip        # gzip | zstd | none
+    tls:
+      insecure: false
+      ca_file: /etc/ssl/certs/ca.crt
+```
+
+### 5.2 Sending Queue and Retry
+
+```yaml
+exporters:
+  otlp:
+    endpoint: "collector:4317"
+    compression: gzip
+    sending_queue:
+      enabled: true
+      num_consumers: 10
+      queue_size: 5000
+      storage: file_storage/otlp_queue  # persist queue to disk across restarts
+    retry_on_failure:
+      enabled: true
+      initial_interval: 5s
+      max_interval: 30s
+      max_elapsed_time: 300s            # give up after 5 min total
+```
+
+**OTLP retry semantics** (opentelemetry.io/docs/specs/otlp/): `RESOURCE_EXHAUSTED` is retryable
+only when the response carries `RetryInfo.retry_delay`: honor that delay; without `RetryInfo`,
+treat it as permanent. `UNAVAILABLE` → exponential backoff. `INVALID_ARGUMENT` → permanent
+failure, do not retry.
+
+### 5.3 Persistent Queue Extension
+
+```yaml
+extensions:
+  file_storage/otlp_queue:
+    directory: /var/otel/queue
+    timeout: 10s
+    compaction:
+      on_start: true
+      rebound_needed_threshold_mib: 100
+```
+
+---
+
+## 6. Load Balancing
+
+gRPC uses long-lived HTTP/2 connections, defeating L4 round-robin. Use client-side LB.
+
+**Two-tier pattern** (recommended for tail sampling):
+
+```
+[SDK exporters]
+   | gRPC 4317
+   v
+[Tier-1: loadbalancingexporter]  ← routes by traceID for consistent routing
+   | gRPC 4317
+   v
+[Tier-2: tailsamplingprocessor]  ← sees full trace before sampling decision
+   | OTLP
+   v
+[Backend / storage]
+```
+
+```yaml
+exporters:
+  loadbalancing:
+    protocol:
+      otlp:
+        tls:
+          insecure: false
+        timeout: 5s
+    resolver:
+      dns:
+        hostname: otel-collector-tier2-headless.monitoring.svc.cluster.local
+        port: 4317
+        interval: 5s
+        timeout: 1s
+```
+
+For Kubernetes: use a headless Service (`clusterIP: None`) for Tier-2. The `loadbalancingexporter`
+resolves DNS and maintains one gRPC connection per pod.
+
+---
+
+## 7. 
Security: TLS and mTLS
+
+### Collector Receiver (server-side)
+
+```yaml
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: "0.0.0.0:4317"
+        tls:
+          cert_file: /etc/ssl/certs/collector.crt
+          key_file: /etc/ssl/private/collector.key
+          client_ca_file: /etc/ssl/certs/ca.crt  # enforce mTLS
+```
+
+### Exporter (client-side)
+
+```yaml
+exporters:
+  otlp:
+    endpoint: "collector:4317"
+    tls:
+      ca_file: /etc/ssl/certs/ca.crt
+      cert_file: /etc/ssl/certs/client.crt  # mTLS client cert
+      key_file: /etc/ssl/private/client.key
+```
+
+For Kubernetes workloads, use cert-manager to auto-rotate TLS credentials (90-day TTL).
+Use SPIFFE/SPIRE for zero-touch mTLS in multi-cluster scenarios.
diff --git a/.agents/skills/oma-observability/resources/transport/sampling-recipes.md b/.agents/skills/oma-observability/resources/transport/sampling-recipes.md
new file mode 100644
index 0000000..402a004
--- /dev/null
+++ b/.agents/skills/oma-observability/resources/transport/sampling-recipes.md
@@ -0,0 +1,297 @@
+---
+otel_spec: "1.x (stable API/SDK)"
+---
+
+# Sampling Recipes: Tail-Based, Cost-Aware, and Tenant-Aware
+
+---
+
+## 1. Sampling Taxonomy
+
+| Type | Where Decision Is Made | Trace Completeness | Typical Use |
+|------|----------------------|-------------------|-------------|
+| **Head-based** | At trace start (first span) | May be incomplete — downstream services inherit decision but cannot guarantee it | Simple, low-overhead; works well for single-service or homogeneous traffic |
+| **Tail-based** | After all spans arrive (end of trace) | Complete — sampler buffers spans until decision | Production multi-service; required for error/latency policies |
+| **Adaptive** | Dynamic rate based on traffic volume | Depends on implementation | Auto-scaling sampling rate under variable load |
+
+**Head-based limitation**: If a service at hop 2 is sampled out but hop 1 was sampled in (or vice versa), the reconstructed trace is missing spans. This is the core motivation for tail-based sampling in multi-service systems.
+
+**Tail-based trade-off**: The sampler must buffer all spans for a trace until a decision is made (default `decision_wait: 30s`). Memory usage scales with trace volume and decision wait duration.
+
+---
+
+## 2. Tail-Based Recipe (Recommended for Production)
+
+The production-standard recipe retains high-signal traces at 100% while keeping a low-rate baseline for ambient visibility.
+
+**Policy hierarchy** (all policies are evaluated; a trace is kept if any one matches — OR semantics, with `and` policies for composite conditions):
+
+1. **100% error retention** — any trace with an error span is always kept
+2. **100% cost/latency threshold retention** — traces exceeding a cost or latency threshold are always kept
+3. **5-10% baseline** — probabilistic retention of remaining traffic
+
+This requires the `tail_sampling` processor running in the **gateway tier only**, combined with a `loadbalancing` exporter upstream to ensure trace completeness via consistent hash by `trace_id`. See `collector-topology.md` Section 6. 
+ +```yaml +processors: + tail_sampling: + decision_wait: 30s # wait up to 30s for all spans to arrive + num_traces: 100000 # in-memory trace buffer capacity + expected_new_traces_per_sec: 1000 + policies: + # Policy 1: Always keep traces with errors + - name: keep-errors + type: status_code + status_code: + status_codes: [ERROR] + + # Policy 2: Always keep high-latency traces (p99 threshold) + - name: keep-slow-traces + type: latency + latency: + threshold_ms: 2000 # keep traces with root span > 2s + + # Policy 3: Always keep high-cost LLM traces (see Section 3) + - name: keep-high-cost + type: string_attribute + string_attribute: + key: sampling.keep_reason + values: ["high_cost"] + + # Policy 4: Baseline probabilistic sampling for remaining traces + - name: baseline-sample + type: probabilistic + probabilistic: + sampling_percentage: 8 # 5-10% baseline +``` + +> The `tail_sampling` processor evaluates policies top-to-bottom. A trace matching any policy is kept. Policies do not chain — `probabilistic` at the bottom catches everything not already retained. + +--- + +## 3. Cost-Aware Sampling (LLM / FinOps Context) + +LLM workloads (OpenAI, Anthropic, Bedrock, Vertex AI) attach cost attributes to spans. These should be treated as first-class sampling dimensions. + +**Relevant span attributes** (following GenAI semantic conventions): +- `gen_ai.usage.input_tokens` — prompt token count +- `gen_ai.usage.output_tokens` — completion token count +- `llm.request.cost_usd` — estimated cost if pre-computed by SDK +- `gen_ai.cost.total_usd` — total cost attribute (custom, team-defined) + +**Strategy**: Set a cost threshold (e.g., $0.50 per trace). Any trace with cumulative LLM cost above the threshold is always retained for cost attribution and FinOps analysis. + +Because `tail_sampling` does not natively support numeric comparisons on span attributes, use a transform processor to annotate high-cost traces before sampling: + +```yaml +processors: + transform/cost_annotation: + trace_statements: + - context: span + statements: + # If cost attribute exceeds threshold, mark the trace for retention + - set(attributes["sampling.keep_reason"], "high_cost") + where attributes["llm.request.cost_usd"] > 0.50 + - set(attributes["sampling.keep_reason"], "high_cost") + where attributes["gen_ai.cost.total_usd"] > 0.50 + + tail_sampling: + decision_wait: 30s + policies: + - name: keep-high-cost + type: string_attribute + string_attribute: + key: sampling.keep_reason + values: ["high_cost"] + - name: keep-errors + type: status_code + status_code: + status_codes: [ERROR] + - name: baseline + type: probabilistic + probabilistic: + sampling_percentage: 8 + +service: + pipelines: + traces: + processors: [transform/cost_annotation, tail_sampling] +``` + +--- + +## 4. Tenant-Aware Sampling (Multi-Tenant B2B SaaS) + +Different customer tiers have different observability value. Enterprise customers justify full retention; free tier customers do not. + +**Per-tier retention targets**: + +| Tenant Tier | Retention Rate | Rationale | +|------------|---------------|-----------| +| `enterprise` | 100% | SLA obligations, full debugging capability | +| `pro` | 20% | Representative sample, cost-controlled | +| `free` | 2% | Ambient visibility only | + +**Option A: `routing_connector` for per-tenant pipeline branching** + +The `routing_connector` routes telemetry to different pipelines based on attribute values, allowing different `tail_sampling` configurations per tenant tier. 
+ +> Alpha stability warning: `routing_connector` is in **alpha** as of 2025. The API and behavior may change in minor releases. Do not use in production if pipeline stability is required. Prefer Option B for production workloads. + +```yaml +connectors: + routing: + default_pipelines: [traces/pro] # fallback if no rule matches + error_mode: ignore + table: + - statement: route() where attributes["tenant.tier"] == "enterprise" + pipelines: [traces/enterprise] + - statement: route() where attributes["tenant.tier"] == "free" + pipelines: [traces/free] + +service: + pipelines: + traces/input: + receivers: [otlp] + exporters: [routing] + + traces/enterprise: + receivers: [routing] + processors: [tail_sampling/enterprise] + exporters: [otlp/backend] + + traces/pro: + receivers: [routing] + processors: [tail_sampling/pro] + exporters: [otlp/backend] + + traces/free: + receivers: [routing] + processors: [tail_sampling/free] + exporters: [otlp/backend] +``` + +**Option B: `tail_sampling` with per-tier policies (stable, recommended for production)** + +Use composite `and` policies combining tenant tier with probabilistic sampling: + +```yaml +processors: + tail_sampling: + decision_wait: 30s + num_traces: 100000 + policies: + # Enterprise: always keep + - name: enterprise-full + type: and + and: + and_sub_policy: + - name: is-enterprise + type: string_attribute + string_attribute: + key: tenant.tier + values: ["enterprise"] + - name: keep-all + type: probabilistic + probabilistic: + sampling_percentage: 100 + + # Pro: 20% + - name: pro-sample + type: and + and: + and_sub_policy: + - name: is-pro + type: string_attribute + string_attribute: + key: tenant.tier + values: ["pro"] + - name: pro-rate + type: probabilistic + probabilistic: + sampling_percentage: 20 + + # Free: 2% + - name: free-sample + type: probabilistic + probabilistic: + sampling_percentage: 2 # baseline for unmatched / free tier +``` + +--- + +## 5. Complete Example: Four-Policy `tail_sampling` + +This YAML combines error, latency, cost, and baseline policies into a single production-ready configuration: + +```yaml +processors: + memory_limiter: + check_interval: 1s + limit_mib: 1500 + spike_limit_mib: 400 + + transform/cost_annotation: + trace_statements: + - context: span + statements: + - set(attributes["sampling.keep_reason"], "high_cost") + where attributes["llm.request.cost_usd"] > 0.50 + + tail_sampling: + decision_wait: 30s + num_traces: 200000 + expected_new_traces_per_sec: 2000 + policies: + # Policy 1: 100% error traces + - name: policy-errors + type: status_code + status_code: + status_codes: [ERROR] + + # Policy 2: 100% high-latency traces (>2s root span) + - name: policy-latency + type: latency + latency: + threshold_ms: 2000 + + # Policy 3: 100% high-cost LLM traces (>$0.50) + - name: policy-high-cost + type: string_attribute + string_attribute: + key: sampling.keep_reason + values: ["high_cost"] + + # Policy 4: 8% baseline probabilistic sampling + - name: policy-baseline + type: probabilistic + probabilistic: + sampling_percentage: 8 + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, transform/cost_annotation, tail_sampling] + exporters: [otlp/backend] +``` + +--- + +## 6. 
Pitfalls + +| Pitfall | Description | Mitigation | +|---------|-------------|-----------| +| Head-based sampling on multi-service paths | A service sampled out at hop 2 drops spans; trace is reconstructed with gaps | Use tail-based sampling in gateway; propagate `traceparent` regardless of local sampling decision | +| Tail sampling memory exhaustion | Buffer holds all in-flight spans per trace for `decision_wait` duration; high RPS + long wait = large heap | Set `num_traces` based on `expected_new_traces_per_sec * decision_wait`; always include `memory_limiter` | +| Decision wait too short | Spans from slow downstream services arrive after decision is made; those spans are dropped | Set `decision_wait` to exceed your p99 inter-service latency; typically 30-60s | +| Sample-rate mismatch between gateway tiers | Tier-1 and Tier-2 each apply independent sampling; effective rate is multiplied (e.g., 50% × 20% = 10%) | Assign sampling responsibility to one tier only; the other tier passes all traffic through | +| `routing_connector` in production (alpha) | Alpha components may break on minor version upgrades of collector-contrib | Use `tail_sampling` with `and` sub-policies (stable) for production; evaluate `routing_connector` only in staging | +| Missing `loadbalancing` exporter before tail sampler | Spans for the same trace land on different gateway replicas; sampler on each replica sees an incomplete trace and makes wrong decisions | Always deploy `loadbalancing` exporter (routing_key: traceID) in the tier upstream of `tail_sampling` | + +## References + +- https://opentelemetry.io/docs/specs/otlp/ +- https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor +- https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/connector/routingconnector diff --git a/.agents/skills/oma-observability/resources/transport/udp-statsd-mtu.md b/.agents/skills/oma-observability/resources/transport/udp-statsd-mtu.md new file mode 100644 index 0000000..a82bb60 --- /dev/null +++ b/.agents/skills/oma-observability/resources/transport/udp-statsd-mtu.md @@ -0,0 +1,167 @@ +--- +otel_spec: "1.x (stable API/SDK)" +--- + +# UDP StatsD MTU and Fragmentation Guide + +## 1. Scope + +This document covers StatsD metric payloads transported over UDP or Unix Domain Socket (UDS) +into an OpenTelemetry Collector `statsdreceiver` or a DogStatsD agent. It does not address +TCP or HTTP transports, or metrics emitted directly via OTLP. + +Sources: +- OTel statsdreceiver: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/statsdreceiver +- DogStatsD high-throughput: docs.datadoghq.com/developers/dogstatsd/high_throughput/ +- Prometheus statsd_exporter issue #35 (buffer sizing) + +--- + +## 2. Why Fragmentation Matters for UDP + +UDP is connectionless and provides no retransmission. When an IP datagram exceeds the path MTU: + +- **No retransmission**: if any IP fragment is lost in transit, the kernel discards the entire + reassembled datagram silently. StatsD has no acknowledgement mechanism, so the loss is invisible. +- **Middlebox drops**: NAT gateways, stateful firewalls, and VPN concentrators frequently drop + IP fragments entirely, or track only the first fragment for NAT state and discard the rest. +- **Reassembly CPU overhead**: fragment reassembly at the receiver incurs kernel memory allocation + and timer management. Under high packet rate this degrades throughput measurably. 
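+
+To see the silent-loss failure mode concretely, push one safe-sized and one oversized datagram and compare what the receiver counts. This is a rough smoke test, assuming bash with `/dev/udp` support, a StatsD listener on 127.0.0.1:8125, and (for real fragmentation) a non-loopback path, since loopback MTU is ~64 KB; metric names are illustrative:
+
+```bash
+# well under any Ethernet MTU: arrives as a single packet
+printf 'mtu.test.small:1|c' > /dev/udp/127.0.0.1/8125
+
+# ~3 KB datagram: exceeds a 1500 B MTU and fragments in transit;
+# if any fragment is dropped (common across NAT/VPN), the whole
+# metric vanishes with no error on the sender side
+printf 'mtu.test.large:1|c|#pad:%s' "$(head -c 3000 /dev/zero | tr '\0' 'x')" \
+  > /dev/udp/127.0.0.1/8125
+```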
+ +**Rule**: keep each datagram within the path MTU so it is transmitted as a single unfragmented packet. + +--- + +## 3. Optimal Datagram Size Table + +| Network path | Effective MTU | Max UDP payload | Notes | +|----------------------------------|---------------|-----------------|---------------------------------------------| +| External network, IPv4 Ethernet | 1500 B | **1472 B** | 1500 − 20 (IP) − 8 (UDP) | +| External network, IPv6 Ethernet | 1500 B | **1452 B** | 1500 − 40 (IPv6) − 8 (UDP) | +| VPN / PPPoE encapsulation | ~1460 B | **1432 B** | Conservative; accounts for tunnel overhead | +| Same-host loopback (Linux/macOS) | ~65535 B | **~16384 B** | Kernel loopback; practical limit ~16 K | +| Unix Domain Socket (`unixgram`) | N/A | **8192 B** | Recommended; avoids kernel socket buffer pressure | + +> IPv6 minimum MTU is 1280 B (RFC 8200); on links with lower MTU, fragmentation is performed +> by the source host only — not routers — making drops more likely. Always test your actual path. + +--- + +## 4. Verification + +### 4.1 PMTUD Probe (Linux / macOS) + +Find the largest datagram that reaches a destination without fragmentation: + +```bash +# Linux — DF bit set, vary -s until you find the boundary +ping -M do -s 1472 -c 3 <collector-host> + +# macOS equivalent +ping -D -s 1472 -c 3 <collector-host> +``` + +Decrease `-s` in steps of 10 until you stop seeing "Frag needed" / "Message too long" errors. +The largest passing value is your PMTU; set `max_packet_size` to that value minus 28 (IPv4) +or 48 (IPv6) to leave room for IP and UDP headers. + +### 4.2 Wireshark Fragment Inspection + +Capture on the collector interface and filter for IP fragments: + +``` +ip.flags.mf == 1 +``` + +The **More Fragments (MF)** bit set on any packet indicates fragmentation is occurring. +A fragment offset > 0 identifies continuation fragments. If you see these in production +traffic, your client is sending oversized datagrams. + +For UDS traffic, capture on a loopback or use `socat` to proxy and inspect: + +```bash +socat -v UNIX-RECV:/tmp/statsd.sock UDP:127.0.0.1:8126 +``` + +--- + +## 5. Client-Side Batching + +Most StatsD client libraries default to sending **one metric per datagram**, which is wasteful: + +- A single counter line (`my.counter:1|c`) is ~20 bytes; one UDP send per metric at 1 M metrics/min + generates 16 K packets/sec of syscall overhead. +- Enable multi-metric batching in your client library. + +| Client library | Batching config | Recommended buffer | +|--------------------|------------------------------------------|----------------------| +| dogstatsd-go | `WithMaxMessagesPerPayload(N)` | 1472 (external UDP) | +| statsd (npm) | `maxBufferSize: 1432` | 1432 (VPN/default) | +| statsd-client (py) | `maxudpsize=1432` | 1432 | +| Any UDS client | Set buffer ≤ 8192 | 8192 | + +Rules: +- For external UDP: keep buffered payload at or below the PMTU value found in section 4.1. +- For UDS (same-host): buffer up to 8192 bytes per send; larger values risk ENOBUFS under load. +- Never exceed 65507 bytes (IPv4 UDP maximum); the kernel will return EMSGSIZE. + +--- + +## 6. OTel StatsD Receiver Configuration + +Default UDP port is 8125. Switch to UDS for same-host agents to eliminate network stack overhead. 
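+
+Once the receiver below is deployed, a quick socket check confirms the chosen transport is actually listening. A minimal sketch, assuming Linux with `ss` available and the default port and socket path used on this page:
+
+```bash
+# UDP listener present on the default StatsD port?
+ss -ulnp | grep 8125
+
+# UDS socket created? (when transport: unixgram)
+test -S /var/run/statsd.sock && echo "socket ok"
+```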
+ +```yaml +receivers: + statsd: + endpoint: "0.0.0.0:8125" # UDP; change to unixgram path for UDS + # endpoint: "/var/run/statsd.sock" + transport: udp # udp | unixgram + aggregation_interval: 60s # flush interval to next processor + enable_metric_type: true # attach metric type as attribute + is_monotonic_counter: false # set true for always-increasing counters + timer_histogram_mapping: + - statsd_type: "timing" + observer_type: "histogram" + - statsd_type: "histogram" + observer_type: "histogram" + +processors: + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + batch: + send_batch_size: 8192 + timeout: 10s + +exporters: + otlp: + endpoint: "otel-collector-tier2:4317" + compression: gzip + +service: + pipelines: + metrics: + receivers: [statsd] + processors: [memory_limiter, batch] + exporters: [otlp] +``` + +**Bridge pattern**: `StatsD clients` → UDP/UDS → `OTel Collector statsdreceiver` → OTLP gRPC → +downstream Collector or backend. This decouples metric ingestion from processing and allows the +statsdreceiver to be scaled or replaced without changing client configuration. + +For UDS on Linux, set socket permissions to allow the application user: + +```bash +chmod 0660 /var/run/statsd.sock +chown root:app /var/run/statsd.sock +``` + +Set `SO_RCVBUF` on the collector socket to handle burst traffic (Linux default is typically 208 KB; +increase to 8–25 MB for high-throughput agents via `net.core.rmem_max`): + +```bash +sysctl -w net.core.rmem_max=26214400 +``` diff --git a/.agents/skills/oma-observability/resources/vendor-categories.md b/.agents/skills/oma-observability/resources/vendor-categories.md new file mode 100644 index 0000000..9623bea --- /dev/null +++ b/.agents/skills/oma-observability/resources/vendor-categories.md @@ -0,0 +1,352 @@ +--- +notes: + - "Example vendor lists are snapshots — verify via landscape.cncf.io before production selection" +--- + +# Observability Vendor Categories + +> as of 2026-Q2 — review quarterly. Verify live status at https://landscape.cncf.io + +## Preamble — Why Categories, Not a Registry + +This file is a **category taxonomy with timestamped example vendors**. It is not a vendor registry. + +**Why this distinction matters:** + +1. **Vendor names rot.** Examples from this codebase's own lifetime: + - Keptn — archived by CNCF, 2025-09 + - Fluentd — deprecated by CNCF, 2025-10 (migration guide: Fluent Bit / OTel Collector) + - Pyroscope — was CNCF Sandbox; acquired by Grafana 2023; CNCF Sandbox status is uncertain post-acquisition (verify at landscape.cncf.io before citing) + +2. **Categories are stable.** "OSS full-stack", "SIEM", and "profiling specialist" have been coherent for years. The vendors filling them change. + +3. **No duplication with vendor-owned skills.** The following published skills already describe themselves authoritatively: + - `getsentry/sentry-sdk-setup` — Sentry SDK instrumentation + - `honeycombio/agent-skill` — Honeycomb OTel setup (8+ published skills) + - `Dash0 otel-instrumentation` — Dash0 OTel + - `Microsoft Azure Monitor exporters` — Azure-specific exporters + - `Datadog Labs dd-apm` — Datadog APM + Duplicating their content here creates drift, not value. + +4. **CNCF landscape is the authoritative live registry.** `landscape.cncf.io` tracks CNCF project status (Graduated / Incubating / Sandbox / Archived), vendor landscape, and category membership in real time. This file complements it with decision guidance; it does not copy or compete with it. 
+ +## Timestamp Discipline + +All example vendor lists in this file are marked `as of 2026-Q2`. + +**Review cadence:** quarterly (aligned with OTel spec release cadence). + +When reviewing: +- Check CNCF project status changes at landscape.cncf.io +- Check acquisition or archival announcements +- Update the `as of YYYY-QX` markers on any changed sections + +--- + +## Category Taxonomy + +### (a) OSS Full-Stack + +**Traits:** Self-hostable, covers metrics + logs + traces in one coherent stack, often includes UI, storage, and agents. No per-seat licensing. Operational burden is on the team. + +**Example vendors** (as of 2026-Q2): + +| Vendor | Metrics | Logs | Traces | Profiles | UI | Notes | +|--------|---------|------|--------|----------|----|-------| +| Grafana Labs LGTM+ | Mimir | Loki | Tempo | Pyroscope | Grafana | Alloy collector, Beyla eBPF auto-instr, Faro RUM, k6 load | +| Elastic Stack (ELK) | Yes | Elasticsearch | APM | No | Kibana | Mature; storage costs at scale | +| SigNoz | Yes | Yes | Yes | No | Yes | OTel-native from day 1; ClickHouse backend | +| OpenObserve | Yes | Yes | Yes | No | Yes | Rust-based; lower storage footprint claim | + +**How to choose:** +- Already using Grafana dashboards → Grafana LGTM+ +- Need full-text log search maturity → Elastic Stack +- Want OTel-native with no legacy shim → SigNoz +- Cost-sensitive storage + small team → OpenObserve or SigNoz + +**Delegation target:** For Grafana stack setup, invoke `oma-search --docs "Grafana LGTM+ OTel Collector setup"`. For Elastic, use Elastic documentation directly. No single vendor-owned skill covers the full OSS stack; delegate to `oma-search` unless a specific component skill is installed. + +--- + +### (b) Commercial SaaS Unified APM + +**Traits:** Managed SaaS; covers metrics, logs, traces, and often RUM/synthetics/profiling in one product. Per-host or per-ingestion pricing. Reduced operational burden; vendor lock-in risk. + +**Example vendors** (as of 2026-Q2): + +| Vendor | Metrics | Logs | Traces | Profiles | RUM | Notes | +|--------|---------|------|--------|----------|-----|-------| +| Datadog | Yes | Yes | Yes | Yes | Yes | Broadest feature surface; highest cost at scale | +| New Relic | Yes | Yes | Yes | Yes | Yes | Usage-based pricing; OTel-native ingest | +| Dynatrace | Yes | Yes | Yes | Yes | Yes | AI-driven auto-discovery; OneAgent proprietary | +| Sentry | Partial | Yes | Yes | No | Yes | Error-first; strong release tracking | +| Grafana Cloud | Yes | Yes | Yes | Yes | Faro | Managed LGTM+; per-signal pricing | + +**How to choose:** +- Maximum feature coverage, budget flexible → Datadog +- OTel-native ingest preferred, usage pricing → New Relic +- Auto-discovery in complex microservice env → Dynatrace +- Error tracking + release correlation primary need → Sentry +- Prefer OSS tools managed → Grafana Cloud + +**Delegation target:** `Datadog Labs dd-apm`, `getsentry/sentry-sdk-setup`. For New Relic / Dynatrace: `oma-search --docs "{vendor} OTel integration setup"`. + +--- + +### (c) High-Cardinality Specialist + +**Traits:** Purpose-built for high-cardinality event-based observability. Avoids pre-aggregation. Enables arbitrary dimension slicing at query time. Dynamic sampling to control cost. 
+ +**Example vendors** (as of 2026-Q2): + +| Vendor | Approach | Key features | Notes | +|--------|----------|-------------|-------| +| Honeycomb | Event-first, columnar | BubbleUp root-cause, dynamic sampling, Query Builder | OTel-native; no metrics tier (use separate tool) | + +**How to choose:** +- Cardinality explosion on Prometheus/Datadog metrics → Honeycomb for traces/events +- Need to slice by any arbitrary field post-hoc → Honeycomb +- Metrics + high-cardinality in one product → pair Honeycomb with a TSDB (category i) + +**Delegation target:** `honeycombio/agent-skill` (8+ published skills for OTel setup, sampling config, and BubbleUp usage). + +--- + +### (d) Profiling Specialist + +**Traits:** Continuous profiling (CPU, memory, goroutine, heap). Always-on, low-overhead. pprof or OTel profiling (OTEP 0239, currently alpha) as wire format. MELT+P fifth pillar. + +**Note on Pyroscope:** Originally CNCF Sandbox. Acquired by Grafana Labs in 2023. CNCF Sandbox status is uncertain post-acquisition — verify at landscape.cncf.io before citing CNCF affiliation. + +**Example vendors** (as of 2026-Q2): + +| Vendor | CNCF status | Backend | Notes | +|--------|------------|---------|-------| +| Parca | CNCF Sandbox | ClickHouse / S3 | OTEP 0239-aligned; open-source | +| Grafana Pyroscope | See note above | Object storage | Integrated into Grafana LGTM+; formerly CNCF Sandbox | +| Polar Signals Cloud | Commercial | Polar Signals | Enterprise managed Parca | + +**How to choose:** +- Full OSS control → Parca +- Already on Grafana LGTM+ → Grafana Pyroscope (co-located storage) +- Managed + enterprise support → Polar Signals Cloud + +**Delegation target:** `oma-search --docs "Parca OTel profiling setup"` or `oma-search --docs "Grafana Pyroscope integration"`. + +--- + +### (e) SIEM / Enterprise Logs + +**Traits:** Security information and event management. Compliance-grade log retention, correlation rules, threat detection, alerting. Often WORM / immutable storage. Targets SOC2 / ISO 27001 audit requirements. + +**Example vendors** (as of 2026-Q2): + +| Vendor | Log ingest | Threat detection | Compliance | Notes | +|--------|-----------|-----------------|-----------|-------| +| Splunk | Yes | Yes | Yes | Market leader; high licensing cost | +| Elastic Security | Yes | Yes | Yes | Elastic Stack + SIEM rules; lower cost than Splunk | +| Sumo Logic | Yes | Yes | Yes | Cloud-native; per-GB pricing | +| Datadog Cloud SIEM | Yes | Yes | Partial | Add-on to Datadog unified APM | + +**How to choose:** +- Enterprise compliance, existing Splunk investment → Splunk +- Cost-sensitive, already on Elastic → Elastic Security +- Cloud-native, no on-prem requirement → Sumo Logic +- Already on Datadog, want unified → Datadog Cloud SIEM + +**Delegation target:** `oma-search --docs "{vendor} SIEM log pipeline setup"`. For audit-specific requirements see `signals/audit.md` in this skill. + +--- + +### (f) FinOps / Cost Observability + +**Traits:** Kubernetes / cloud cost visibility, unit economics, per-tenant / per-namespace attribution. Cost is a first-class observability signal (D4 in design), not a boundary attribute. Distinct from APM cost-of-ownership analysis. 
+ +**Example vendors** (as of 2026-Q2): + +| Vendor | CNCF status | K8s cost | Cloud cost | Notes | +|--------|------------|---------|-----------|-------| +| OpenCost | CNCF Incubating | Yes | AWS/GCP/Azure | Open spec + OSS implementation; Prometheus-native | +| Kubecost | Commercial (OpenCost-based) | Yes | Yes | Adds multi-cluster, RBAC, Slack alerts | +| CloudZero | Commercial SaaS | Partial | Yes | Business-unit cost allocation focus | + +**How to choose:** +- OSS, Prometheus-integrated, per-namespace attribution → OpenCost +- Multi-cluster + enterprise reporting → Kubecost +- Cloud bill decomposed by product team → CloudZero + +**Delegation target:** `oma-search --docs "OpenCost Prometheus integration"`. For cost signal context see `signals/cost.md` in this skill. + +--- + +### (g) Feature Flags / Progressive Delivery + +**Traits:** Controlled rollout of features to subsets of users or traffic. Release observability — correlate flag states with error rates and latency. GitOps-integrated progressive delivery. + +**Example vendors** (as of 2026-Q2): + +| Vendor | CNCF status | Flag eval | Progressive delivery | Notes | +|--------|------------|---------|---------------------|-------| +| OpenFeature | CNCF Graduated (2024-11) | Yes (SDK standard) | No | Standardizes flag SDK; vendor-agnostic | +| Flagger | CNCF Graduated | No | Yes (canary/A-B/blue-green) | Prometheus/Datadog metric gating | +| Argo Rollouts | N/A (Argo project) | No | Yes (canary/blue-green) | Kubernetes CRD; integrates with analysis templates | +| LaunchDarkly | Commercial | Yes | Partial | Mature commercial flag platform | +| Unleash | OSS + Commercial | Yes | No | Self-hostable flag platform | + +**How to choose:** +- Standardize flag SDK across vendors → OpenFeature (pair with LaunchDarkly or Unleash as backend) +- Automated canary promotion with metric gates → Flagger or Argo Rollouts +- Per-tenant progressive rollout (feature flags) → LaunchDarkly or Unleash +- GitOps-native canary on Kubernetes → Flagger (CNCF Graduated, Flux / Helm native) + +**Delegation target:** `oma-search --docs "OpenFeature SDK {runtime} setup"` or `oma-search --docs "Flagger canary analysis Prometheus"`. For release boundary observability see `boundaries/release.md` in this skill. + +--- + +### (h) Log Pipeline / Collection + +**Traits:** High-throughput log and event collection, transformation, routing, and forwarding. Runs as DaemonSet agents (preferred) or standalone pipeline. OTel Collector is the 2026 standard for new deployments. + +**Fluentd note:** Deprecated by CNCF 2025-10. Official migration guide: CNCF Blog 2025-10-01 "Fluentd to Fluent Bit: A Migration Guide". For migration assistance, use intent `migrate` in this skill, target category (h), with CNCF 2025-10 guide as reference. 
+ +**Example vendors** (as of 2026-Q2): + +| Vendor | CNCF status | Protocol support | Transform | Notes | +|--------|------------|----------------|---------|-------| +| Fluent Bit | CNCF Graduated | Multiple | Lua / built-in | Preferred Fluentd replacement; C, low memory | +| OpenTelemetry Collector | CNCF Incubating | OTLP + many | processors | OTel-native; recommended for new deployments | +| Vector (Datadog OSS) | Non-CNCF | Multiple | VRL | Rust; high throughput; Datadog-backed | +| Cribl Stream | Commercial | Multiple | Yes | Enterprise routing + data shaping | + +**How to choose:** +- Migrating from Fluentd → Fluent Bit (drop-in config compatibility) +- New OTel-native deployment → OpenTelemetry Collector +- High-throughput with Rust reliability → Vector +- Enterprise data routing with UI + compliance → Cribl Stream + +**Delegation target:** `oma-search --docs "Fluent Bit Kubernetes DaemonSet setup"` or `oma-search --docs "OTel Collector configuration {backend}"`. For collector topology guidance see `transport/collector-topology.md` in this skill. + +--- + +### (i) Time Series Storage / Long-term Metrics + +**Traits:** Purpose-built for metrics storage at scale. Prometheus-compatible query (PromQL). Long-term retention beyond Prometheus 15-day default. HA + multi-tenancy for production. + +**Example vendors** (as of 2026-Q2): + +| Vendor | CNCF status | Multi-tenant | Object storage | Notes | +|--------|------------|------------|---------------|-------| +| Prometheus | CNCF Graduated | No (single-tenant) | No | Standard scrape source; 15d retention default | +| Thanos | CNCF Graduated | Yes | Yes (S3/GCS/Azure) | Sidecar or receive mode; global query view | +| Cortex | CNCF Incubating | Yes | Yes | Horizontally scalable; more complex ops | +| Grafana Mimir | Non-CNCF (Grafana OSS) | Yes | Yes | Evolved from Cortex; simpler ops | +| VictoriaMetrics | Non-CNCF (OSS + Commercial) | Cluster edition | Yes | High compression; low resource usage | +| InfluxDB | Non-CNCF (Commercial OSS) | Yes (Cloud) | Yes | Line protocol; IOx rewrite in Rust | + +**How to choose:** +- Starting fresh, Kubernetes, team familiar with Grafana → Grafana Mimir +- Need CNCF Graduated status for compliance → Thanos +- Multi-tenant with complex federation needs → Cortex +- Resource-constrained or high compression priority → VictoriaMetrics +- Time-series with SQL query surface needed → InfluxDB IOx + +**Delegation target:** `oma-search --docs "Thanos sidecar Prometheus setup"` or `oma-search --docs "Grafana Mimir distributed mode"`. + +--- + +### (j) Crash Analytics (Mobile-Heavy) + +**Traits:** Symbolication of native crash stacks. Crash-free rate (CFR) tracking. Release-correlated crash trends. ANR (Application Not Responding) detection. Mobile session replay context. 
+
+**Example vendors** (as of 2026-Q2):
+
+| Vendor | iOS | Android | Web | Notes |
+|--------|-----|---------|-----|-------|
+| Firebase Crashlytics | Yes | Yes | No | Free; Google ecosystem; dSYM auto-upload |
+| Sentry | Yes | Yes | Yes | Cross-platform; OTel trace correlation |
+| Bugsnag | Yes | Yes | Yes | Strong breadcrumb API; smart noise reduction |
+| Embrace | Yes | Yes | No | Mobile-first; session replay; network body capture |
+| Datadog Error Tracking | Yes | Yes | Yes | Unified with Datadog APM |
+
+**How to choose:**
+- Mobile-only, budget zero → Firebase Crashlytics
+- Cross-platform (mobile + web + backend) correlation → Sentry
+- Mobile session replay + user journey → Embrace
+- Already on Datadog APM → Datadog Error Tracking
+- Need noise reduction + grouping quality → Bugsnag
+
+**Delegation target:** `getsentry/sentry-sdk-setup` for Sentry. For others: `oma-search --docs "{vendor} iOS/Android crash reporting setup"`. For crash analytics context see `layers/L7-application/crash-analytics.md` in this skill.
+
+---
+
+## Intent to Category Routing
+
+| User intent | Primary categories | Notes |
+|------------|-------------------|-------|
+| Setup full-stack OSS monitoring | (a) OSS Full-Stack, (h) Log Pipeline, (i) TSDB, (d) Profiling | Start with (a); (h)+(i)+(d) fill gaps |
+| Setup commercial managed APM | (b) Commercial SaaS Unified APM | Select vendor by criteria in (b) |
+| Migrate off Fluentd | (h) Log Pipeline | Fluent Bit preferred (CNCF 2025-10 guide); OTel Collector for OTel-native |
+| High-cardinality trace investigation | (c) High-Cardinality Specialist | Honeycomb + dynamic sampling |
+| FinOps cost attribution program | (f) FinOps / Cost Observability | OpenCost + `signals/cost.md` |
+| Per-tenant progressive rollout | (g) Feature Flags / Progressive Delivery | OpenFeature SDK + Flagger or Argo Rollouts |
+| Mobile crash rate SLO | (j) Crash Analytics | Pair with (b) or (a) for backend traces |
+| Long-term metric retention | (i) TSDB | Thanos or Mimir on top of Prometheus |
+| Continuous profiling (MELT+P) | (d) Profiling Specialist | Parca (OSS) or Pyroscope (Grafana) |
+| SIEM / SOC2 audit log compliance | (e) SIEM / Enterprise Logs | See `signals/audit.md` for WORM requirements |
+
+---
+
+## How to Delegate
+
+### When a vendor-specific skill is installed
+
+Invoke the skill directly:
+
+| Skill | Coverage |
+|-------|---------|
+| `getsentry/sentry-sdk-setup` | Sentry SDK instrumentation, error tracking, release tracking |
+| `honeycombio/agent-skill` | Honeycomb OTel setup, BubbleUp, dynamic sampling (8+ published skills) |
+| `Dash0 otel-instrumentation` | Dash0 OTel generic instrumentation |
+| `Microsoft Azure Monitor exporters` | Azure-specific OTel exporters and Monitor integration |
+| `Datadog Labs dd-apm` | Datadog APM, log correlation, distributed tracing |
+
+### When no vendor-specific skill is installed
+
+Route to search with vendor + observability context:
+
+```
+oma-search --docs "{vendor} observability setup"
+oma-search --docs "{vendor} OTel integration {runtime}"
+oma-search --docs "{vendor} {signal} configuration"
+```
+
+Examples:
+- `oma-search --docs "Grafana Mimir Prometheus remote_write setup"`
+- `oma-search --docs "OpenCost Kubernetes namespace cost attribution"`
+- `oma-search --docs "Fluent Bit Kubernetes DaemonSet migration from Fluentd"`
+- `oma-search --docs "Parca continuous profiling Go gRPC"`
+
+### CNCF landscape as canonical vendor source
+
+For questions about current CNCF project status, vendor landscape 
membership, or whether a project is still maintained: + +``` +oma-search --docs "site:landscape.cncf.io {category or vendor}" +``` + +Or navigate directly to https://landscape.cncf.io — it is the authoritative live registry. This file does not attempt to replicate it. + +--- + +## Footer + +**Timestamp:** as of 2026-Q2 + +**Review cadence:** Quarterly. Triggered by: CNCF project status changes, acquisitions, deprecations, or new CNCF Graduated/Incubating entrants in covered categories. + +**Authoritative live source:** https://landscape.cncf.io + +**Known pending verifications (as of 2026-Q2):** +- Pyroscope CNCF status post-Grafana acquisition (2023) — check landscape.cncf.io before citing CNCF affiliation +- Thanos Graduated status (confirmed 2024 per design verification log — verify remains current) +- OpenFeature CNCF status timestamped as of 2024-11 Graduated — verify next tier/changes via landscape.cncf.io diff --git a/.agents/skills/oma-scholar/SKILL.md b/.agents/skills/oma-scholar/SKILL.md new file mode 100644 index 0000000..fe6af8f --- /dev/null +++ b/.agents/skills/oma-scholar/SKILL.md @@ -0,0 +1,177 @@ +--- +name: oma-scholar +description: > + Scholarly research companion using Knows sidecar spec (.knows.yaml). Generates, + validates, reviews, queries, and compares structured research-paper sidecars, + and fetches them from knows.academy. Use for academic literature search, survey + synthesis, paper authoring assistance, and peer review with token-efficient + claim/evidence/relation access. +--- + +# Scholar - Research Paper Sidecar Companion + +## When to use + +- Reading research papers token-efficiently via Knows sidecars (~700 tokens for claims-only vs ~10K for full PDF) +- Generating `.knows.yaml` sidecars from your own paper drafts, LaTeX, or research notes +- Validating sidecar structure (rule-based) before sharing +- Producing peer reviews as sidecars +- Querying or summarizing existing sidecars +- Structurally comparing two papers (claims, methods, evidence) +- Searching/fetching sidecars from `knows.academy` (~50K papers indexed) + +## When NOT to use + +- General web search or non-academic content -> use `oma-search` +- Translating papers -> use `oma-translator` +- PDF parsing only (no sidecar) -> use `oma-pdf` +- Submitting sidecars back to knows.academy -> out of scope (host LLM only consumes/produces locally) +- Full peer-review workflow with editor system -> out of scope + +## Core Rules + +1. **Target spec is v0.9.0 / `paper@1` profile** — verified against production sidecars from knows.academy; see `resources/sidecar-spec.md` +2. **Host LLM generates sidecars** — never shell out to `anthropic` SDK or external LLM CLI; this skill runs inside an agent +3. **Anti-fabrication** — if DOI/venue/year is not visible in source, **omit the key entirely**; never write `doi: TODO` or guess +4. **Top-level metadata** — `title`, `authors`, `venue`, `year` live at the top level (no `metadata` wrapper) +5. **Field names are exact** — `statement_type`, `evidence_type`, `predicate`, `artifact_type` (not `type`/`claim`) +6. **Provenance has SINGLE actor** — `provenance.actor` is one object, NOT a `provenance.actors` array +7. **Confidence is an object** — `{claim_strength: ..., extraction_fidelity: ...}`, both from `high|medium|low` +8. **Coverage is an object** — `coverage.statements` (4-value enum) + `coverage.evidence` (3-value enum) +9. 
**Closed enums** — actor `tool|person|org` (never `ai`/`llm`/`model`); artifact role `subject|supporting|cited`; predicates in present tense +10. **Numbers unquoted** — `value: 22`, never `value: '22'` +11. **Relation density** — average ≥1.5 relations per statement; every claim needs `supported_by` evidence (lint warns when ratio is below; orphan statements warned per-id) +12. **ID format** — descriptive kebab-case with prefix: `stmt:privacy-budget-tradeoff`, `ev:cifar10-accuracy-table`, `art:paper` +13. **Validate before sharing** — run `oma scholar lint` after Generate +14. **Remote API has no auth** — `https://knows.academy/api/proxy/*` is public; do not invent auth headers +15. **Partial fetch param is `section` (singular)** — fixed enum `statements|evidence|relations|artifacts|citation` +16. **OpenAlex key is optional** — metadata enrichment only; gracefully degrade when missing +17. **Sidecar content stays English** — schema fields, IDs, statement text follow upstream convention; user-facing responses follow `oma-config.yaml` `language` +18. **Spec drift awareness** — our local rules track v0.9.0 production behavior, which differs from the upstream `knows.md` natural-language description; refresh `resources/upstream-spec-cache.md` periodically + +## Modes + +| Mode | Trigger | Output | +|------|---------|--------| +| **Generate** | "create sidecar from this paper / abstract / draft", "generate `.knows.yaml`" | `{paper}.knows.yaml` (host LLM emits, then `oma scholar lint` validates) | +| **Validate** | "lint this sidecar", "validate `.knows.yaml`" | Pass/fail report with file:line issues | +| **Review** | "peer review this paper as sidecar" | `{paper}.review.knows.yaml` | +| **Analyze** | "summarize this sidecar", "what claims does it make?" | Natural-language answer | +| **Compare** | "compare paper A and paper B structurally" | Diff table (claims/methods/evidence) | +| **Remote** | "find papers on X", "fetch sidecar :id", "get claims only for :id" | Search results / sidecar payload | + +## Provider Fallback (knows.academy → OpenAlex) + +`knows.academy` currently indexes **only 2026 papers** (~50K, mostly arXiv). For +older or non-2026 papers (Transformer 2017, BERT 2018, classics, journals), +the skill automatically falls back to **OpenAlex** for metadata and abstract. + +Use the `oma scholar` CLI subcommands: + +```bash +# Hybrid search: knows first, OpenAlex fallback +oma scholar search "vision language action" + +# Cross-source resolve: figures out which source has the right paper +oma scholar resolve "Attention Is All You Need" + +# Get by id (knows record_id, OpenAlex W-id, or DOI) +oma scholar get "10.48550/arXiv.1706.03762" +``` + +When OpenAlex returns the answer (knows.academy lacks the paper), use the +returned abstract as input to **Mode 1 Generate** to produce a local sidecar. + +## How to Execute + +Follow `resources/execution-protocol.md` step by step for the selected mode. 
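+
+For orientation, the structural rules above (4-12) condense into the following minimal skeleton. It is an illustrative sketch with a hypothetical paper and placeholder ids and values, not a lint-complete record; full records also need `source_anchors`, `version`, and `freshness` blocks (see `resources/checklist.md`):
+
+```yaml
+knows_version: "0.9.0"
+profile: "paper@1"
+subject_ref: art:paper
+title: "Example Paper Title"        # top-level, no metadata wrapper (Rule 4)
+authors: ["A. Author"]
+# doi / venue / year omitted: not visible in source (Rule 3)
+coverage:                           # object, not a single value (Rule 8)
+  statements: main_claims_only
+  evidence: key_evidence_only
+provenance:
+  origin: machine
+  actor:                            # single object, not an array (Rule 6)
+    type: tool                      # tool|person|org, never 'ai' (Rule 9)
+    name: oma-scholar
+artifacts:
+  - id: art:paper
+    artifact_type: paper            # exact field name (Rule 5); value illustrative
+    role: subject                   # subject|supporting|cited (Rule 9)
+statements:
+  - id: stmt:example-claim          # descriptive kebab-case id (Rule 12)
+    statement_type: claim           # not `type` (Rule 5)
+    modality: empirical
+    status: asserted
+    confidence:                     # object, both keys from high|medium|low (Rule 7)
+      claim_strength: medium
+      extraction_fidelity: high
+evidence:
+  - id: ev:example-table
+    evidence_type: table
+relations:
+  - id: rel:claim-supported
+    predicate: supported_by         # every claim needs one (Rule 11)
+    subject_ref: stmt:example-claim
+    object_ref: ev:example-table
+```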
+
+## Quick Reference
+
+### Search (knows + auto OpenAlex fallback)
+```bash
+oma scholar search "diffusion super resolution"
+oma scholar search --year-min 2024 "vision language action"
+```
+
+### Find one specific paper
+```bash
+oma scholar resolve "Attention Is All You Need"
+# returns top hit from each source + recommendation
+```
+
+### Fetch a sidecar or work
+```bash
+# knows.academy full sidecar
+oma scholar get "knows:generated/reconvla/1.0.0"
+
+# Partial fetch (claims only — ~700 tokens, 93% reduction vs PDF)
+oma scholar get --section statements "knows:generated/reconvla/1.0.0"
+
+# By DOI or OpenAlex W-id (works regardless of knows.academy availability)
+oma scholar get "10.48550/arXiv.1706.03762"
+```
+
+When knows.academy is unreachable, `get knows:...` automatically falls back
+to OpenAlex by extracting the slug from the record_id. The result is marked
+with `fallback: "openalex"` and contains metadata + abstract — useful for
+running Mode 1 Generate locally.
+
+### Validate
+```bash
+# Strict — for own Generate output (default)
+oma scholar lint paper.knows.yaml
+
+# Lenient — for third-party / fetched sidecars
+oma scholar lint --lenient remote.knows.yaml
+
+# Treat warnings as failures (CI mode)
+oma scholar lint --fail-on-warning paper.knows.yaml
+```
+
+About **47% of knows.academy-served sidecars contain at least one dangling
+cross-reference** (typo in `subject_ref`/`object_ref`, measured across 15
+production samples). Use `--lenient` when consuming third-party records so
+these surface as warnings rather than blocking errors.
+
+### Raw API (when CLI is unavailable)
+```bash
+curl -s "https://knows.academy/api/proxy/search?q=..."
+curl -s "https://knows.academy/api/proxy/sidecars/<encoded-id>"
+curl -s "https://knows.academy/api/proxy/partial?record_id=<id>&section=statements"
+curl -s "https://knows.academy/api/proxy/jobs/stats"   # platform health
+```
+
+## Configuration
+
+Project-specific settings: `config/scholar-config.yaml`
+
+## Troubleshooting
+
+| Issue | Solution |
+|-------|----------|
+| `[ERROR] *.value: numeric value '22' is quoted` | Remove quotes: `value: '22'` -> `value: 22` |
+| `[ERROR] provenance.actor.type: 'ai' is not allowed` | Change to `tool`, `person`, or `org` |
+| `[ERROR] *.type: use \`statement_type\` instead of \`type\`` | Rename `type` -> `statement_type` (or `evidence_type`/`predicate`/`artifact_type`) |
+| `[ERROR] provenance.actors: v0.9 spec uses singular \`actor\`` | Replace `actors: [{...}]` array with `actor: {...}` object |
+| `[ERROR] *.object_ref: reference 'X' does not match any defined id` | Fix the `subject_ref`/`object_ref` to point to a real id, OR use `--lenient` if consuming third-party data |
+| `[WARN] relations: avg relations/statement is N.NN (target ≥ 1.5)` | Add more `supported_by`/`depends_on` relations |
+| `[WARN] statements: only N statements — most papers warrant ≥ 8` | Expected when generating from abstract only; full-paper Generate should hit 15+ |
+| `[WARN] *.predicate: past-tense '...' 
is suspicious` | Switch to present tense (`evaluated_on` -> `evaluates_on`) |
+| Remote API returns empty results | Try broader query; check `/api/proxy/jobs/stats`; CLI auto-falls-back to OpenAlex |
+| `knows.academy search failed: fetch failed` (stderr) | Platform timeout — fallback to OpenAlex is automatic; retry later for sidecars |
+| OpenAlex 403/429 | Set `OPENALEX_API_KEY` (see `resources/setup-openalex.md`) |
+| YAML won't parse | Check indentation; numbers/booleans must be unquoted; strings with `:` need quotes |
+
+## References
+
+- Execution steps: `resources/execution-protocol.md`
+- Sidecar spec rules: `resources/sidecar-spec.md`
+- API endpoints: `resources/api-endpoints.md`
+- OpenAlex setup: `resources/setup-openalex.md`
+- Upstream spec snapshot: `resources/upstream-spec-cache.md`
+- Post-generation checklist: `resources/checklist.md`
+- CLI subcommands: `oma scholar search|resolve|get|lint` (implementation under `cli/commands/scholar/`)
+- Context loading: `../_shared/core/context-loading.md`
+- Quality principles: `../_shared/core/quality-principles.md`
+- i18n rules: `../../rules/i18n-guide.md`
diff --git a/.agents/skills/oma-scholar/config/scholar-config.yaml b/.agents/skills/oma-scholar/config/scholar-config.yaml
new file mode 100644
index 0000000..70b64f1
--- /dev/null
+++ b/.agents/skills/oma-scholar/config/scholar-config.yaml
@@ -0,0 +1,67 @@
+# oma-scholar configuration
+# Knows sidecar generation and remote access settings
+
+# Remote API
+api:
+  base_url: https://knows.academy
+  endpoints:
+    search: /api/proxy/search
+    sidecar: /api/proxy/sidecars/{record_id}
+    partial: /api/proxy/partial
+    stats: /api/proxy/jobs/stats
+  # No auth required for proxy endpoints
+  timeout_seconds: 30
+
+# Partial-fetch defaults — sections you can request individually.
+# Must match the API's `section` enum (see resources/api-endpoints.md);
+# `methods` and `provenance` require a full-sidecar fetch.
+partial_fields:
+  available:
+    - statements
+    - evidence
+    - relations
+    - artifacts
+    - citation
+  default_for_summary:
+    - statements
+
+# OpenAlex metadata enrichment (optional)
+# Used during Generate mode to backfill DOI/venue/year when missing
+openalex:
+  enabled_when_key_present: true
+  env_var: OPENALEX_API_KEY
+  # Polite-pool email also accepted for higher anonymous rate limits
+  polite_pool_env_var: OPENALEX_EMAIL
+  signup_url: https://openalex.org/
+  settings_url: https://openalex.org/settings/api
+  # Free daily allowance (no key): $1 value, ~10K list calls, ~1K search calls
+
+# Sidecar generation defaults
+generation:
+  # Minimum number of statements before warning user about under-extraction
+  min_statements_complex_paper: 15
+  # Average relations per statement — relation wiring quality threshold
+  min_relations_per_statement: 1.5
+  # ID format
+  id_prefixes:
+    statement: "stmt:"
+    evidence: "ev:"
+    relation: "rel:"
+    method: "method:"
+    artifact: "art:"
+    actor: "actor:"
+
+# Output behavior
+output:
+  # When no output path is specified:
+  #   "same_dir" = output next to the input source file
+  #   "cwd" = output in current working directory
+  default_location: same_dir
+  filename_pattern: "{base}.knows.yaml"
+  review_filename_pattern: "{base}.review.knows.yaml"
+  overwrite: false  # Prompt before overwriting existing files
+
+# Lint behavior
+lint:
+  command: "oma scholar lint"
+  fail_on_warning: false
diff --git a/.agents/skills/oma-scholar/resources/api-endpoints.md b/.agents/skills/oma-scholar/resources/api-endpoints.md
new file mode 100644
index 0000000..e0d0bf7
--- /dev/null
+++ b/.agents/skills/oma-scholar/resources/api-endpoints.md
@@ -0,0 +1,174 @@
+# knows.academy API 
Endpoints
+
+Base URL: `https://knows.academy`
+
+All endpoints are public proxies — **no authentication required**. Do not invent or send `Authorization` headers.
+
+## Search
+
+```
+GET /api/proxy/search?q={query}
+GET /api/proxy/search?q={query}&discipline={field}
+```
+
+```bash
+curl -s "https://knows.academy/api/proxy/search?q=diffusion+super+resolution"
+```
+
+Response shape (verified):
+```json
+{
+  "results": [
+    {
+      "record_id": "knows:generated/{slug}/{version}",
+      "profile": "paper@1",
+      "title": "...",
+      "summary": "...",
+      "venue": "...",
+      "year": 2026,
+      "discipline": null,
+      "keywords": [],
+      "coverage_statements": "exhaustive",
+      "coverage_evidence": "key_evidence_only",
+      "provenance_origin": "machine",
+      "provenance_actor_name": "knows-gen",
+      "version_record": "1.0.0",
+      "lint_passed": true,
+      "download_count": 0,
+      "created_at": "...",
+      "stats": {...}
+    }
+  ]
+}
+```
+
+`record_id` format: `knows:generated/{slug}/{version}` — includes colons and slashes that **must be URL-encoded** when used as a path/query param.
+
+## Fetch Full Sidecar
+
+```
+GET /api/proxy/sidecars/{record_id}
+```
+
+The full v0.9.0 record (~22 KB / ~4.5K tokens — 55% smaller than the source PDF).
+
+```bash
+RID=$(python3 -c "import urllib.parse; print(urllib.parse.quote('knows:generated/{slug}/1.0.0', safe=''))")
+curl -s "https://knows.academy/api/proxy/sidecars/${RID}"
+```
+
+## Partial Fetch (Token-Saving)
+
+```
+GET /api/proxy/partial?record_id={id}&section={one_of}
+```
+
+Note: parameter is **`section` (singular)**, value must match `^(statements|evidence|relations|artifacts|citation)$`.
+
+Available sections (verified):
+- `statements` (typically ~700 tokens, 93% reduction vs full PDF)
+- `evidence`
+- `relations`
+- `artifacts`
+- `citation`
+
+**Not available** via partial fetch: `methods` (statement_type), `provenance` — fetch the full sidecar instead.
+
+```bash
+RID=$(python3 -c "import urllib.parse; print(urllib.parse.quote('knows:generated/{slug}/1.0.0', safe=''))")
+
+# Just the claims
+curl -s "https://knows.academy/api/proxy/partial?record_id=${RID}&section=statements"
+
+# Evidence only
+curl -s "https://knows.academy/api/proxy/partial?record_id=${RID}&section=evidence"
+```
+
+Response shape:
+```json
+{
+  "record_id": "...",
+  "items": [...]
+}
+```
+
+**When to use partial vs full:**
+- "What claims does this paper make?" -> `section=statements` only
+- "What experiments?" -> `section=evidence`
+- "What does it cite?" -> `section=citation` (or `section=artifacts` filtered by `role: cited`)
+- Multi-section analysis -> fetch full sidecar (cheaper than 3 partial calls)
+
+## Platform Stats
+
+```
+GET /api/proxy/jobs/stats
+```
+
+```bash
+curl -s "https://knows.academy/api/proxy/jobs/stats"
+```
+
+Returns processing queue health (verified):
+```json
+{
+  "pending": 10735,
+  "running": 288,
+  "completed": 37897,
+  "failed": 1243,
+  "skipped": 433,
+  "total": 50596
+}
+```
+
+Use this for:
+- Health check before bulk operations
+- Estimating coverage ("how many papers are in the index?")
+- Diagnosing slow responses
+
+## Skill Self-Description
+
+```
+GET /api/proxy/skill/knows.md
+```
+
+The canonical generation guide (natural language, slightly different from the JSON Schema). Refresh `upstream-spec-cache.md` from here.
+
+## JSON Schema (Reference)
+
+Schema id used by sidecars: `https://knows.dev/schema/record-0.9.json`
+
+The host `knows.dev` is currently unreachable (HTTPS times out as of 2026-04-25), so schema cannot be fetched directly. 
The `oma scholar lint` subcommand implements rules empirically derived from production records. + +## Rate Limiting + +No hard limits documented. Be polite: +- Avoid bursting more than ~10 req/sec +- Cache responses locally for repeated queries within a session +- Prefer partial fetch over full when possible + +## Error Handling + +| Status | Meaning | Recovery | +|--------|---------|----------| +| 404 | record_id not found | Verify ID via `/search`; the record may not be sidecared yet | +| 422 | Validation error (bad params) | Check `section` value against the allowed enum | +| 5xx | Platform issue | Retry once; check `/jobs/stats` | +| Timeout | Network / slow response | Retry with longer timeout; fall back to local sidecar if cached | + +## URL Encoding + +`record_id` contains `:` and `/` — always URL-encode when placing in a path or query parameter. Examples: + +```bash +# Bash with python3 +RID=$(python3 -c "import urllib.parse; print(urllib.parse.quote('$RAW_RID', safe=''))") + +# Or with jq +echo -n "$RAW_RID" | jq -sRr @uri +``` + +## Notes + +- Endpoints starting with `/api/proxy/` are stable public routes +- Direct (non-proxy) endpoints may exist but are not contract-stable — do not rely on them +- This skill **only consumes** the API. Submission/upload is out of scope diff --git a/.agents/skills/oma-scholar/resources/checklist.md b/.agents/skills/oma-scholar/resources/checklist.md new file mode 100644 index 0000000..d1ced73 --- /dev/null +++ b/.agents/skills/oma-scholar/resources/checklist.md @@ -0,0 +1,88 @@ +# Post-Generation Checklist (v0.9.0) + +Run this after Mode 1 (Generate) or Mode 3 (Review) before reporting done. + +## Top-Level Structure + +- [ ] `title` is set at the **top level** (not `metadata.title`) +- [ ] `authors` is a top-level list of strings +- [ ] `venue`, `year`, `doi` keys present **only** if visible in source (no TODO/TBD) +- [ ] `knows_version` set (e.g., `"0.9.0"`) +- [ ] `profile` set (e.g., `"paper@1"`) +- [ ] `subject_ref` points to an `art:` artifact id +- [ ] `coverage` is an object with `statements` and `evidence` keys (each from its own enum) +- [ ] `provenance` is present with single `actor` object (not `actors` array) +- [ ] `version` block has `spec`, `record`, `source` +- [ ] `freshness` block has `as_of`, `update_policy` + +## Provenance + +- [ ] `provenance.origin` is `machine` or `author` +- [ ] `provenance.actor.type` is `tool`, `person`, or `org` (never `ai`/`llm`/`model`) +- [ ] `provenance.actor.name` is set +- [ ] `provenance.method` describes how the sidecar was produced (e.g., `extraction`) +- [ ] `provenance.generated_at` is a valid ISO timestamp + +## IDs + +- [ ] All IDs use descriptive kebab-case (no `stmt:c1`, `ev:001`) +- [ ] Type prefixes correct: `stmt:`, `ev:`, `rel:`, `art:`, `rep:` +- [ ] No duplicate IDs across the document + +## Field Names + +- [ ] Statements use `statement_type` (not `type` or `claim`) +- [ ] Evidence uses `evidence_type` +- [ ] Relations use `predicate` +- [ ] Artifacts use `artifact_type` + +## Statement Internals + +- [ ] Each statement has `statement_type` from: `claim`, `method`, `limitation`, `assumption`, `definition`, `question` +- [ ] Each statement has `modality` from: `descriptive`, `empirical`, `theoretical` +- [ ] Each statement has `status` (commonly `asserted`) +- [ ] `confidence` is an object: `{claim_strength: ..., extraction_fidelity: ...}`, both from `high|medium|low` +- [ ] `source_anchors` reference a valid `representation_ref` (e.g., `rep:paper-pdf`) + +## Values + +- [ ] 
Numbers unquoted (`value: 22`, not `value: '22'`) +- [ ] `coverage.statements` from: `exhaustive`, `main_claims_only`, `key_claims_and_limitations`, `partial` +- [ ] `coverage.evidence` from: `exhaustive`, `key_evidence_only`, `partial` +- [ ] `artifacts[].role` from: `subject`, `supporting`, `cited` +- [ ] Predicates use present tense (`evaluates_on`, not `evaluated_on`) + +## Relations + +- [ ] Every statement has at least one relation (incoming or outgoing) +- [ ] Every claim has a `supported_by` relation pointing to evidence +- [ ] Average relations-per-statement ratio ≥ 1.5 +- [ ] Methods have at least one of: `implements`, `uses`, `evaluates_on`, `documents` +- [ ] No dangling references — every `subject_ref` and `object_ref` points to an existing id + +## Density + +- [ ] Statement count appropriate for paper length (complex papers ≥ 15) +- [ ] Limitations and discussion mined for additional statements +- [ ] No important section ignored (abstract, methods, results, discussion, limits) + +## Lint + +- [ ] `oma scholar lint` returns 0 errors +- [ ] Warnings reviewed (recommended-key warnings are usually acceptable for local drafts) + +## Anti-Fabrication + +- [ ] No fabricated DOIs, ORCIDs, or URLs +- [ ] No "TODO", "TBD", "N/A" placeholder values +- [ ] All quoted statement text is paraphrased or quoted accurately from source +- [ ] No invented author names or affiliations + +## Final Report to User + +Include: +- Output file path +- Counts: `statements`, `evidence`, `relations`, `artifacts` +- Ratio: relations/statements (target ≥ 1.5) +- Lint status: clean / N warnings / N errors +- Fields explicitly omitted due to anti-fabrication (e.g., "DOI not visible — please paste if you have it") diff --git a/.agents/skills/oma-scholar/resources/execution-protocol.md b/.agents/skills/oma-scholar/resources/execution-protocol.md new file mode 100644 index 0000000..3f64718 --- /dev/null +++ b/.agents/skills/oma-scholar/resources/execution-protocol.md @@ -0,0 +1,196 @@ +# Scholar - Execution Protocol + +Choose the mode based on user intent, then follow its steps. + +## Mode 1: Generate + +Create a `.knows.yaml` sidecar from a paper, draft, or research notes. + +### Step 0: Gather Source + +1. Identify input: PDF path, LaTeX file, plain text, or pasted content +2. If PDF and only path is given -> chain via `oma-pdf` first to extract markdown +3. Determine output path: `{base}.knows.yaml` next to input (or as configured) + +### Step 1: Read Source Thoroughly + +- Read the entire source — abstract, methods, results, discussion, limitations +- Identify: claims, methods, evidence (tables/figures), datasets, code/repo links, cited works, assumptions +- Note paper metadata: title, authors, venue, year, DOI (only if visible) + +### Step 2: Draft Sidecar Structure + +Use the v0.9.0 spec rules in `sidecar-spec.md`. 
Top-level structure: + +- `knows_version: "0.9.0"`, `profile: "paper@1"`, `subject_ref: art:paper` +- **Top-level metadata** — `title`, `authors`, `venue`, `year` (no `metadata` wrapper); omit `doi`/`venue`/`year` if not visible +- `summary` — one-paragraph overview +- `coverage` — object with `statements` and `evidence` enums (NOT a single value) +- `provenance` — `origin`, single `actor` object (`type: tool|person|org`), `generated_at`, `method` +- `version` — `{spec, record, source}` +- `freshness` — `{as_of, update_policy}` +- `artifacts` — list (`art:` prefix, `artifact_type`, `role: subject|supporting|cited`); include `representations` for the source +- `statements` — list (`stmt:` prefix, `statement_type`, `modality`, `status`, `confidence` object, `source_anchors`) +- `evidence` — list (`ev:` prefix, `evidence_type`) +- `relations` — list (`rel:` prefix, `predicate` in present tense, `subject_ref`, `object_ref`) +- `actions` — usually empty for paper profile + +### Step 3: Wire Relations + +For every statement, ensure at least one `supported_by` relation pointing to evidence. +Aim for **average ≥1.5 relations per statement**. Common patterns: + +| Subject | Predicate | Object | +|---------|-----------|--------| +| Claim | `supported_by` | Evidence | +| Claim | `depends_on` | Assumption | +| Method | `evaluates_on` | Dataset | +| Method | `implements` | Code repo | +| Paper | `cites` | Other paper | +| Result | `contradicts` | Prior work | + +### Step 4: Optional Metadata Enrichment + +If `OPENALEX_API_KEY` is set and DOI/venue/year are missing: + +```bash +# Sketch — actual enrichment is best done via host LLM with a curl call +curl -s "https://api.openalex.org/works?search={title}&api_key=$OPENALEX_API_KEY" \ + | jq '.results[0] | {doi, host_venue, publication_year}' +``` + +If the key is not set, skip enrichment and tell the user how to set it (point to `setup-openalex.md`). + +### Step 5: Lint + +Always validate before reporting done: + +```bash +oma scholar lint {output}.knows.yaml +``` + +If lint fails, fix the reported issues and re-run until clean. + +### Step 6: Report + +Tell the user: +- Output path +- Statement count, evidence count, relations/statement ratio +- Any fields omitted due to anti-fabrication (e.g., "DOI not extracted — visible in source? If yes, paste it.") +- Lint status + +## Mode 2: Validate + +Lint an existing `.knows.yaml`. + +### Steps + +1. Confirm input file exists and ends with `.knows.yaml`, `.yaml`, or `.json` +2. Decide strictness based on origin: + - **Own Generate output** → strict (default) + - **Third-party / remote sidecar** → `--lenient` (dangling refs become warnings) +3. Run lint: + ```bash + oma scholar lint {input} + # OR for fetched sidecars: + oma scholar lint --lenient {input} + ``` +4. Read output; group findings by severity (error/warning) +5. Report file:line for each issue with the rule violated and the fix +6. Offer to apply fixes if the user agrees + +## Mode 3: Review + +Generate a peer-review sidecar — what claims need stronger evidence, what assumptions are unstated, what limitations are missing. + +### Steps + +1. Read the source paper (or its existing sidecar) +2. For each statement in the paper, evaluate: + - Is `supported_by` evidence sufficient? + - Is `depends_on` assumption explicit? + - Are limitations acknowledged? +3. 
Produce a review sidecar at `{base}.review.knows.yaml` with:
+   - top-level `coverage`: `{statements: key_claims_and_limitations, evidence: partial}`
+   - `statements` capturing reviewer assertions, each with `statement_type: review_comment` (or `limitation`)
+   - `relations` linking review comments to original paper statements via `predicate: critiques` or `extends`
+4. Lint and report (same as Generate)
+
+## Mode 4: Analyze / Query
+
+Answer natural-language questions over an existing sidecar.
+
+### Steps
+
+1. Locate the sidecar: a local path, or fetch from knows.academy via `oma scholar get {record_id}`
+2. For token efficiency, prefer **partial fetch** when only a subset is needed.
+   The query param is **`section` (singular)**; allowed values are `statements|evidence|relations|artifacts|citation`:
+   - "What claims does this paper make?" -> `?section=statements`
+   - "What are the experimental results?" -> `?section=evidence`
+   - "Who do they cite?" -> `?section=citation` (or `?section=artifacts` filtered by `role: cited`)
+3. Parse the YAML and traverse the structure to answer
+4. Cite IDs (`stmt:...`, `ev:...`) so the user can verify
+
+## Mode 5: Compare
+
+Structural diff between two sidecars.
+
+### Steps
+
+1. Load both sidecars (local or remote)
+2. Build a comparison table:
+   - **Claims overlap**: shared `statement_type` themes
+   - **Method differences**: same dataset? different metrics?
+   - **Evidence quality**: relations-per-statement ratio for each
+   - **Citation overlap**: shared `artifacts` with `role: cited`
+3. Output a markdown table; cite specific IDs from each paper
+4. Optional: surface contradicting claims (search for `predicate: contradicts` patterns)
+
+## Mode 6: Remote (with OpenAlex fallback)
+
+Search and fetch from knows.academy first, then fall back to OpenAlex when the
+paper isn't in the (2026-only) knows.academy index.
+
+### Recommended path: `oma scholar` (handles the cascade automatically)
+
+```bash
+# Hybrid search
+oma scholar search "<query>"
+
+# Cross-source resolve (decides which source has the right paper)
+oma scholar resolve "<title>"
+
+# Get a specific record (knows id, OpenAlex W-id, or DOI)
+oma scholar get [--section <one>] "<id>"
+```
+
+See `resources/fallback-providers.md` for the full cascade design.
+
+### Manual cascade (raw curl)
+
+1. **knows.academy search** — `curl -s "{base}/api/proxy/search?q={query}"`
+2. If hits → fetch the sidecar:
+   - Full: `/api/proxy/sidecars/{record_id}`
+   - Partial: `/api/proxy/partial?record_id={id}&section=statements|evidence|relations|artifacts|citation`
+3. If no hits OR clearly the wrong paper (cross-source title similarity < 0.7) →
+   **OpenAlex fallback**:
+   - `curl -s "https://api.openalex.org/works?search={query}"` (anonymous OK)
+   - Or by DOI: `https://api.openalex.org/works/doi:{doi}`
+   - Reconstruct the abstract from `abstract_inverted_index`
+4. If the user wants a sidecar from the OpenAlex result → transition to **Mode 1 Generate**
+   using the abstract as the source text. The local sidecar will not be on
+   knows.academy but is structurally identical.
+5. **Stats** (health check): `curl -s "{base}/api/proxy/jobs/stats"`
+6. 
If user asks for analysis after fetch, transition to **Mode 4** (Analyze) + +## Error Recovery + +| Error | Recovery | +|-------|----------| +| `oma scholar` not found | Run `oma install` to install / update the CLI | +| Lint script reports many errors | Fix top error first; re-run; cascading errors often resolve together | +| Remote API timeout | Retry once; if still failing, check `/api/proxy/jobs/stats` | +| Empty search results | Broaden query; remove quotes; try keyword-only | +| YAML parse error after Generate | Re-emit with stricter quoting on string values containing `:` `#` `&` `*` | +| Source PDF has no text layer | Chain via `oma-pdf` with hybrid OCR mode first | +| OpenAlex 403/429 | Tell user to set `OPENALEX_API_KEY` — see `setup-openalex.md` | diff --git a/.agents/skills/oma-scholar/resources/fallback-providers.md b/.agents/skills/oma-scholar/resources/fallback-providers.md new file mode 100644 index 0000000..03f70bf --- /dev/null +++ b/.agents/skills/oma-scholar/resources/fallback-providers.md @@ -0,0 +1,125 @@ +# Fallback Provider Cascade + +`oma-scholar` queries `knows.academy` first, then falls back to OpenAlex for +records the platform doesn't have. The fallback is what makes the skill useful +beyond the (currently 2026-only) knows.academy index. + +## Coverage Matrix + +| Source | Coverage | Gives | Doesn't give | +|--------|----------|-------|--------------| +| **knows.academy** | ~50K papers, **2026 only** (verified empirically) | Full v0.9 sidecar (claims, evidence, relations, methods) | Pre-2026 papers, full text | +| **OpenAlex** | ~240M works, all years | Title, authors, year, venue, DOI, abstract (reconstructed), OA PDF URL, citation count, references | Structured claims/evidence (no sidecar) | + +When a paper is in **both**, prefer knows.academy (richer structure). When a +paper is **only in OpenAlex** (most pre-2026 work), use OpenAlex metadata as +the source for **Mode 1 Generate** — host LLM produces a local sidecar from +the abstract. + +## Cascade Logic + +``` +search "query" + │ + ├─ knows.academy ── any hits? ──► return immediately (sidecars available) + │ │ + │ no hits + │ ▼ + └─ openalex ─────────────────► return metadata + abstract (no sidecar) + ↓ + user wants deeper? → Mode 1 Generate locally +``` + +## CLI Reference (`oma scholar`) + +### Search both sources + +```bash +oma scholar search "vision language action" +``` + +Returns JSON with: +- `primary`: which source returned hits +- `fallback`: which source was used as fallback (if any) +- `results`: unified list with `source: "knows.academy" | "openalex"` per item + +### Get a specific record + +```bash +# knows.academy sidecar +oma scholar get "knows:generated/reconvla/1.0.0" + +# Partial fetch (saves up to 93% tokens) +oma scholar get --section statements "knows:generated/reconvla/1.0.0" + +# OpenAlex by DOI +oma scholar get "10.48550/arXiv.1706.03762" + +# OpenAlex by W-id +oma scholar get "W2147144213" +``` + +When `oma scholar get` is asked for a `knows:...` id and the platform is +unreachable, the command extracts the slug from the record_id and searches +OpenAlex for the same paper, returning metadata with a `fallback: "openalex"` +marker. The user can then run **Mode 1 Generate** from the abstract. + +### Resolve a title across both + +```bash +oma scholar resolve "Attention Is All You Need" +``` + +Returns the best match from each source side-by-side, plus a recommendation. 
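+
+For agents calling the proxies directly, the cascade above reduces to a few lines. A minimal TypeScript sketch, assuming Node 18+ `fetch`; `cascadeSearch` is a hypothetical helper name, not an `oma` API:
+
+```typescript
+// Sketch of the knows.academy -> OpenAlex cascade (manual path).
+// Returns sidecar-backed hits when available, OpenAlex metadata otherwise.
+const BASE = "https://knows.academy";
+
+async function cascadeSearch(query: string) {
+  const q = encodeURIComponent(query);
+  const res = await fetch(`${BASE}/api/proxy/search?q=${q}`);
+  const { results } = (await res.json()) as { results: unknown[] };
+  if (results?.length) {
+    return { source: "knows.academy", results }; // sidecars available
+  }
+  // Fallback: metadata plus reconstructable abstract, no sidecar
+  const oa = await fetch(`https://api.openalex.org/works?search=${q}`);
+  const body = (await oa.json()) as { results: unknown[] };
+  return { source: "openalex", results: body.results };
+}
+```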
+
+## When Fallback is Needed
+
+Fallback to OpenAlex is the right move when:
+
+- **Pre-2026 papers** — knows.academy doesn't index them
+- **Cross-archive coverage** — non-arXiv venues, journals, books
+- **Citation counts** — knows.academy doesn't track citations; OpenAlex does
+- **Reference resolution** — when a sidecar's `cites` predicate points to a paper not in knows.academy
+
+Fallback is NOT needed for:
+- Recent (2026) arXiv papers — knows.academy has them with rich sidecars
+- Full-text reading — neither source provides full text; fetch the OA PDF and use `oma-pdf`
+
+## Local Generate from Fallback Result
+
+When OpenAlex returns metadata + abstract but you want a structured sidecar:
+
+```
+1. oma scholar get <doi> → get title, authors, year, venue, abstract
+2. (optional) download OA PDF → oma-pdf to extract full text
+3. Mode 1 Generate → host LLM produces .knows.yaml using sidecar-spec.md
+4. oma scholar lint → validate locally
+```
+
+The locally-generated sidecar is **not registered** with knows.academy — it's
+yours. If you want it shared, that's a separate publishing flow (out of scope
+for this skill).
+
+## Authentication
+
+OpenAlex anonymous use is free up to ~$1/day. For higher limits or polite
+pool access, see `setup-openalex.md`. The skill works without any key.
+
+`oma scholar` reads:
+- `OPENALEX_API_KEY` — passed as `?api_key=` (recommended)
+- `OPENALEX_EMAIL` — passed as `?mailto=` (polite pool, no signup)
+
+Neither is required.
+
+## Trust Considerations
+
+- **knows.academy sidecars are AI-generated** — all have `provenance.origin: machine`. The platform's `lint_passed: true` flag reflects internal checks only; our local `oma scholar lint` finds dangling refs in ~47% of them (use `--lenient` when consuming)
+- **OpenAlex metadata is curated** — generally reliable for title/authors/DOI/year, but venue and abstract can be missing for older works
+- **Reconstructed abstracts** from OpenAlex's inverted index are exact — no paraphrasing — but punctuation/formatting may be lossy
+
+## Limitations
+
+- arXiv-only papers without a DOI may show `doi: None` in OpenAlex
+- Some 2026 papers exist in both sources with slightly different titles — `resolve` uses case-insensitive containment matching as a heuristic
+- The knows.academy proxy can time out under load; fallback to OpenAlex is automatic
+- OpenAlex rate limits: respect the ~10 req/sec courtesy limit
diff --git a/.agents/skills/oma-scholar/resources/setup-openalex.md b/.agents/skills/oma-scholar/resources/setup-openalex.md
new file mode 100644
index 0000000..379a63c
--- /dev/null
+++ b/.agents/skills/oma-scholar/resources/setup-openalex.md
@@ -0,0 +1,85 @@
+# OpenAlex API Key Setup (Optional)
+
+OpenAlex powers metadata enrichment during Generate mode — backfilling missing DOI, venue, and year from a vast academic catalog. **The skill works without a key**, but enrichment is skipped.
+
+## When you need this
+
+- You frequently generate sidecars from raw text/LaTeX where the DOI is not visible
+- You want DOI/venue auto-resolved from the title
+- You hit anonymous rate limits (the "$1/day" allowance is exhausted)
+
+## Free Tier (Generous)
+
+Without any key, OpenAlex grants a $1/day-equivalent free quota:
+- **Unlimited** single-entity lookups
+- **10,000** list/filter calls per day
+- **1,000** search calls per day
+- **100** content downloads per day
+
+For most users, anonymous use is enough.
+
+## Get a Key (30 seconds)
+
+1. Sign up: https://openalex.org/
+2. Generate a key: https://openalex.org/settings/api
+3. 
Copy the key + +## Configure + +### Option A: Per-shell environment + +```bash +export OPENALEX_API_KEY="your_key_here" +``` + +### Option B: Persisted in `~/.claude/.env` (recommended) + +```bash +mkdir -p ~/.claude +echo 'OPENALEX_API_KEY=your_key_here' >> ~/.claude/.env +``` + +Some Claude Code setups source this file automatically; if not, source it from your shell rc: + +```bash +echo '[ -f ~/.claude/.env ] && set -a && . ~/.claude/.env && set +a' >> ~/.zshrc +``` + +### Option C: Polite-pool email only (no key) + +If you don't want to sign up but want priority, set just an email: + +```bash +export OPENALEX_EMAIL="you@example.com" +``` + +This puts you in the "polite pool" with better latency, no key required. + +## Verify + +```bash +curl -s "https://api.openalex.org/works?search=attention+is+all+you+need&api_key=$OPENALEX_API_KEY" \ + | head -c 500 +``` + +A 200 response with JSON results means the key is live. + +## Pricing Beyond Free Tier + +OpenAlex uses pay-as-you-go past the daily free allowance — no monthly subscription. Most academic users never exceed the free tier. See https://openalex.org/pricing for current rates. + +## Skill Behavior + +When this skill needs metadata enrichment: + +1. Check `OPENALEX_API_KEY` env var +2. If set -> use authenticated calls +3. If not set, check `OPENALEX_EMAIL` -> anonymous polite-pool +4. If neither, fall back to anonymous calls +5. On 403/429 -> stop enrichment, tell user to set the key, leave fields omitted (anti-fabrication) + +## Privacy + +- The key (or email) is sent to OpenAlex servers as a query parameter +- No source paper content is sent to OpenAlex — only title/author search strings +- OpenAlex is operated by OurResearch (a non-profit); see their privacy policy at https://openalex.org/ diff --git a/.agents/skills/oma-scholar/resources/sidecar-spec.md b/.agents/skills/oma-scholar/resources/sidecar-spec.md new file mode 100644 index 0000000..8db6ae6 --- /dev/null +++ b/.agents/skills/oma-scholar/resources/sidecar-spec.md @@ -0,0 +1,312 @@ +# Knows Sidecar Spec — Generation Rules (v0.9.0) + +This is the v0.9.0 record shape verified against `knows.academy` production sidecars +(JSON Schema id: `https://knows.dev/schema/record-0.9.json`, profile `paper@1`). +For the upstream natural-language description see `upstream-spec-cache.md`. + +## Top-level Structure (paper@1 profile) + +```yaml +$schema: "https://knows.dev/schema/record-0.9.json" +knows_version: "0.9.0" +record_id: "knows:generated/{slug}/1.0.0" # only for published records; omit for local drafts +profile: "paper@1" +subject_ref: "art:paper" # points to the artifact representing the paper itself + +# Top-level metadata (NOT inside a `metadata` block) +title: "..." +authors: ["..."] +venue: "..." # only if visible in source +year: 2026 # only if visible in source +summary: "..." + +# Coverage is an object, not a single value +coverage: + statements: exhaustive | main_claims_only | key_claims_and_limitations | partial + evidence: exhaustive | key_evidence_only | partial + +license: "CC-BY-4.0" # if known + +artifacts: [...] +statements: [...] +evidence: [...] +relations: [...] 
+actions: [] # usually empty for paper profile + +# Provenance has SINGLE actor (object), not actors (array) +provenance: + origin: machine | author + actor: + name: "knows-gen" + type: tool # tool | person | org (NEVER ai/llm/model) + version: "0.9.0" + generated_at: "2026-04-25T00:00:00Z" + method: extraction + +version: + spec: "0.9.0" + record: "1.0.0" + source: original + +freshness: + as_of: "2026-04-25T00:00:00Z" + update_policy: versioned +``` + +## Statement Shape + +```yaml +- id: stmt:descriptive-kebab-case + statement_type: claim # claim | method | limitation | assumption | definition | question + modality: empirical # descriptive | empirical | theoretical + text: "..." + about_ref: art:paper # what the statement is about (usually the subject artifact) + status: asserted + source_anchors: + - representation_ref: rep:paper-pdf + locator_type: section + locator: "Section 5" + confidence: + claim_strength: high # high | medium | low + extraction_fidelity: high # high | medium | low + provenance: + origin: machine + actor: + name: "knows-gen" + type: tool + generated_at: "..." +``` + +### Statement Type Frequencies (252 statements / 15 papers) + +| Type | Frequency | Use | +|------|-----------|-----| +| `claim` | 117 (46%) | Headline assertion of the paper | +| `method` | 58 (23%) | Procedure, technique, or pipeline | +| `limitation` | 31 (12%) | Acknowledged limit or threat to validity | +| `assumption` | 18 (7%) | Precondition for the claim/method | +| `definition` | 18 (7%) | Term or concept introduction | +| `question` | 10 (4%) | Open research question / RQ | + +## Evidence Shape + +```yaml +- id: ev:descriptive-kebab-case + evidence_type: table_result # see frequencies below + summary: "..." + source_anchors: [...] + provenance: {...} +``` + +### Evidence Type Frequencies (138 evidence / 15 papers) + +| Type | Frequency | Use | +|------|-----------|-----| +| `table_result` | 64 (46%) | Numeric results in a table | +| `figure` | 34 (25%) | Graphical/diagram-based result | +| `proof` | 15 (11%) | Theoretical/mathematical proof | +| `observation` | 11 (8%) | Qualitative observation | +| `experiment_run` | 6 (4%) | Single experimental run/trial | +| `case_study` | 4 (3%) | Detailed walkthrough of a case | +| `citation_backed` | 4 (3%) | Cited from another paper | + +## Artifact Shape + +```yaml +- id: art:paper + artifact_type: paper # see frequencies below + role: subject # subject | supporting | cited + title: "..." + identifiers: + url: "https://arxiv.org/abs/..." + doi: "..." # OMIT entirely if not visible in source + representations: + - id: rep:paper-pdf + media_type: application/pdf + locator: + type: path | url + value: "..." +``` + +### Artifact Type Frequencies (50 artifacts / 15 papers) + +| Type | Frequency | Use | +|------|-----------|-----| +| `paper` | 15 (30%) | The subject paper or a cited paper | +| `dataset` | 12 (24%) | Training/eval dataset | +| `benchmark` | 10 (20%) | Standard benchmark suite | +| `repository` | 6 (12%) | Code repository (GitHub etc.) 
| +| `model` | 4 (8%) | Pre-trained or released model | +| `software` | 2 (4%) | Library/toolkit | +| `other` | 1 (2%) | Catch-all | + +### Artifact Role Frequencies (50 / 15 papers) + +| Role | Frequency | Use | +|------|-----------|-----| +| `cited` | 18 (36%) | Referenced as prior work | +| `supporting` | 17 (34%) | Used by the paper (dataset, code, model) | +| `subject` | 15 (30%) | The paper itself (always 1 per record) | + +## Relation Shape + +```yaml +- id: rel:descriptive-kebab-case + predicate: supported_by # see "Predicates" section below + subject_ref: stmt:... + object_ref: ev:... | stmt:... | art:... +``` + +## Field Naming (copy exactly) + +| Concept | Correct field | Wrong | +|---------|---------------|-------| +| Statement category | `statement_type` | `type`, `claim` | +| Evidence category | `evidence_type` | `type` | +| Relation verb | `predicate` | `type`, `relation_type` | +| Artifact category | `artifact_type` | `type` | +| Actor category | `type` (with closed enum) | — | +| Top-level metadata | `title`, `authors`, `venue`, `year` (no wrapper) | `metadata.title` etc. | +| Provenance attribution | `provenance.actor` (single object) | `provenance.actors` (array) | +| Confidence | object with `claim_strength` + `extraction_fidelity` | bare string | + +## Value Constraints + +| Field | Allowed values | +|-------|----------------| +| `actor.type` | `tool` \| `person` \| `org` (never `ai`/`llm`/`model`) | +| `provenance.origin` | `machine` (AI-generated) \| `author` (human curation) | +| `confidence.claim_strength` | `high` \| `medium` \| `low` | +| `confidence.extraction_fidelity` | `high` \| `medium` \| `low` | +| `coverage.statements` | `exhaustive` \| `main_claims_only` \| `key_claims_and_limitations` \| `partial` | +| `coverage.evidence` | `exhaustive` \| `key_evidence_only` \| `partial` | +| `artifacts[].role` | `subject` \| `supporting` \| `cited` | +| `statement.status` | `asserted` (the only value observed across 252 production statements) | +| `statement.modality` | `descriptive` \| `empirical` \| `theoretical` | +| `statement_type` | `claim` \| `method` \| `limitation` \| `assumption` \| `definition` \| `question` | +| `evidence_type` | `table_result` \| `figure` \| `proof` \| `observation` \| `experiment_run` \| `case_study` \| `citation_backed` | +| `artifact_type` | `paper` \| `dataset` \| `benchmark` \| `repository` \| `model` \| `software` \| `other` | + +## Numeric Values + +```yaml +# Correct +- value: 22 +- accuracy: 0.945 + +# Wrong +- value: '22' # never quote numbers +- accuracy: "0.945" +``` + +## Anti-Fabrication + +If a field is not visible in source, **omit the key entirely**. + +```yaml +# Wrong +doi: TODO +venue: TBD + +# Correct (just omit the keys) +title: "Paper Title" +authors: ["A. Author"] +``` + +Applies to: `doi`, `venue`, `year`, ORCIDs, GitHub URLs, dataset URLs. + +## ID Format + +Descriptive kebab-case with type prefix. **Never** use opaque IDs. 
+
+```yaml
+# Correct
+- id: stmt:standard-transformer-fails-unseen-tokens
+- id: ev:lemma-41-contraction-proof
+- id: rel:collapse-causes-failure
+- id: art:paper
+- id: rep:paper-pdf
+
+# Wrong
+- id: stmt:c1
+- id: ev:001
+- id: rel:r-23
+```
+
+## Predicates
+
+Verified frequencies across 330 relations in 15 production sidecars:
+
+| Predicate | Direction | Use |
+|-----------|-----------|-----|
+| `supported_by` | claim → evidence | most common; primary evidence wiring |
+| `depends_on` | claim/method → assumption | preconditions |
+| `evaluates_on` | method → dataset/benchmark | empirical evaluation target |
+| `limited_by` | claim → limitation | acknowledged limit |
+| `documents` | statement → artifact (paper) | references the source paper |
+| `uses` | method → artifact (model/software) | active usage |
+| `used_by` | artifact → method | reverse direction of `uses` |
+| `challenged_by` | claim → counter-claim | opposed/debated |
+| `cites` | artifact → cited paper | citation graph |
+| `implements` | method → repository | code implementation |
+| `defines` | statement → definition | term introduction |
+
+Less common predicates, valid per the upstream natural-language doc but not yet observed:
+
+```
+extends, contradicts, critiques, generalizes,
+specializes, introduces, refutes, replicates
+```
+
+```yaml
+# Wrong tense (warning)
+predicate: evaluated_on

+# Correct
+predicate: evaluates_on
+
+# Passive forms accepted in production
+predicate: supported_by
+predicate: used_by
+predicate: challenged_by
+```
+
+## Relation Wiring (CRITICAL)
+
+Aim for an average of **≥1.5 relations per statement**. Minimum patterns:
+
+| Subject | Predicate | Object |
+|---------|-----------|--------|
+| Claim (statement) | `supported_by` | Evidence |
+| Claim | `depends_on` | Assumption (statement) |
+| Method (statement_type=method) | `implements` | Repository (artifact) |
+| Method | `uses` | Model/Dataset (artifact) |
+| Claim | `limited_by` | Limitation (statement) |
+
+## Common Mistakes Cheatsheet
+
+| Wrong | Correct |
+|-------|---------|
+| `metadata: {title: ...}` | top-level `title:` |
+| `provenance.actors: [...]` | `provenance.actor: {...}` (single) |
+| `confidence: high` | `confidence: {claim_strength: high, extraction_fidelity: high}` |
+| `coverage: exhaustive` | `coverage: {statements: exhaustive, evidence: ...}` |
+| `type: ai` | `type: tool` |
+| `value: '22'` | `value: 22` |
+| `type: paper` (on artifact) | `artifact_type: paper` |
+| `type: claim` (on statement) | `statement_type: claim` |
+| `evaluated_on` | `evaluates_on` |
+| `doi: TODO` | omit `doi` key entirely |
+| `id: stmt:c1` | `id: stmt:descriptive-name` |
+
+## Statement Density Guidance
+
+- Short methods paper: 8-12 statements
+- Standard ML paper: 12-18 statements
+- Complex/long paper: **15+ required**
+- Survey/review paper: 20+ recommended
+
+If under-extracted, re-read:
+- Limitations sections
+- Discussion/conclusions
+- Footnotes and ablation studies
diff --git a/.agents/skills/oma-scholar/resources/upstream-spec-cache.md b/.agents/skills/oma-scholar/resources/upstream-spec-cache.md
new file mode 100644
index 0000000..d2ba4dd
--- /dev/null
+++ b/.agents/skills/oma-scholar/resources/upstream-spec-cache.md
@@ -0,0 +1,185 @@
+# Upstream Spec Cache
+
+This file is a **snapshot** of the canonical Knows skill description.
+Source: `https://knows.academy/api/proxy/skill/knows.md` + +## How to refresh + +```bash +curl -s https://knows.academy/api/proxy/skill/knows.md \ + > .agents/skills/oma-scholar/resources/upstream-spec-cache.md.new +diff .agents/skills/oma-scholar/resources/upstream-spec-cache.md \ + .agents/skills/oma-scholar/resources/upstream-spec-cache.md.new +``` + +If the diff shows meaningful changes, update: +1. This file with the new content +2. `sidecar-spec.md` if rules changed +3. `SKILL.md` if mode descriptions changed + +Recommend refreshing every 1-2 weeks until the upstream stabilizes. + +--- + +## Snapshot — captured 2026-04-25 + +```markdown +# Knows Sidecar Skill — Complete Reference + +## Overview + +Knows is a structured YAML companion specification for research papers that enables LLM agents to access claims, evidence, and relations directly. A KnowsRecord is a schema-validated sidecar file that sits alongside PDFs. + +## Core Modes + +**Generate**: Create `.knows.yaml` from paper text, LaTeX, or research ideas +**Validate**: Run structural checks via `scripts/lint.py` +**Review**: Generate peer reviews as sidecars +**Analyze/Query**: Summarize or answer questions from existing sidecars +**Compare**: Diff two papers structurally +**Remote**: Search/download sidecars from knows.academy platform + +## Critical Rules for Generation + +### Field Naming (Copy Exactly) +- Statements: `statement_type` (not `type` or `claim`) +- Evidence: `evidence_type` (not `type`) +- Relations: `predicate` (not `type`) +- Artifacts: `artifact_type` (not `type`) +- Actors: `type: tool|person|org` (never `ai`, `llm`, `model`) + +### Value Constraints +- Numbers unquoted: `value: 22` not `value: '22'` +- Actor origin: `origin: machine` for AI-generated, `author` for human curation +- Confidence: `high|medium|low` only +- Coverage statements: `exhaustive|main_claims_only|key_claims_and_limitations|partial` +- Artifact roles: `subject|supporting|cited` only + +### Anti-Fabrication +"If the exact DOI is not visible in the PDF text, omit the `doi` key entirely. Do NOT write `doi: TODO`." Same principle applies to venue and year — omit unknown fields rather than placeholder them. + +### Relation Wiring +Every statement needs ≥1.5 relations per statement on average. Minimum patterns: +- Claims: `supported_by` evidence, optionally `depends_on` assumptions +- Evidence: must be `object_ref` of at least one relation +- Methods: `evaluates_on` dataset, `implements` repo, or `documents` paper + +### ID Format +Use descriptive kebab-case with prefix: `stmt:privacy-budget-tradeoff`, `ev:cifar10-accuracy-table`, `rel:ablation-supports-claim` — never numbered IDs like `stmt:c1`. + +## Post-Generation Checklist + +1. Verify statement count (complex papers need 15+) +2. Wire relations systematically per required patterns +3. Run sanitize if YAML won't parse: `python3 scripts/sanitize.py` +4. Lint validation: `python3 scripts/lint.py` until 0 errors +5. 
Verify metadata: `python3 scripts/verify_metadata.py` (with `--auto-enrich` if DOI missing) + +## Common Mistakes + +| Error | Wrong | Correct | +|---|---|---| +| Actor type | `type: ai` | `type: tool` | +| Observation value | `'22'` | `22` | +| Artifact field | `type: paper` | `artifact_type: paper` | +| Wrong tense | `evaluated_on` | `evaluates_on` | +| Fabricated DOI | `doi: "TODO"` | Omit entirely | +| Missing metric | `qualitative_value: "..."` | Add `metric: "name"` | + +## Dependencies + +- **Always available**: YAML template, JSON Schema, generation prompt +- **For lint**: `pip install pyyaml jsonschema` +- **For CLI**: `pip install knows-sidecar` +- **For LLM generation**: `pip install anthropic` +- **For verify**: Free OpenAlex/CrossRef API (set `OPENALEX_API_KEY` in `~/.claude/.env`) +``` + +--- + +## Local Adaptations + +Where our local skill diverges from upstream — recorded for spec drift tracking: + +1. **Generation engine**: upstream suggests `pip install anthropic`; we use the host LLM directly (no subprocess SDK call). Cost and key savings. +2. **Validation tooling**: upstream references `pip install knows-sidecar`; that package is **not yet on PyPI** (verified 2026-04-25). We ship rule-based validation as the `oma scholar lint` CLI subcommand (TypeScript, no Python dependency). +3. **JSON Schema host unreachable**: production sidecars reference `https://knows.dev/schema/record-0.9.json` but the host's HTTPS port times out. We cannot redistribute or fetch the schema, so lint encodes empirically-derived rules instead. +4. **Submission**: not implemented. This skill is read/generate-only, not a publisher. + +## v0.9.0 Production Spec — Differences From Upstream `knows.md` + +The upstream `knows.md` document above describes a simplified shape; production +sidecars served by `knows.academy/api/proxy/sidecars/*` follow the v0.9.0 schema +which differs significantly. **Our `sidecar-spec.md` reflects v0.9.0 production**. 
+Key differences: + +| Field | Upstream `knows.md` | v0.9.0 Production | +|-------|---------------------|-------------------| +| Title/authors location | `metadata.title`, `metadata.authors` | top-level `title`, `authors`, `venue`, `year` (no `metadata` wrapper) | +| Provenance attribution | `provenance.actors[]` array | `provenance.actor` (single object) | +| Confidence shape | string `high\|medium\|low` | object `{claim_strength, extraction_fidelity}` | +| Coverage shape | single string at `provenance.coverage` | top-level `coverage` object with `statements` + `evidence` keys | +| Coverage evidence enum | (not specified) | `exhaustive` \| `key_evidence_only` \| `partial` | +| Partial fetch param | (not specified) | **`section=`** (singular), enum `statements\|evidence\|relations\|artifacts\|citation` | + +## v0.9.0 Production-Only Fields + +Not mentioned in upstream `knows.md` but present in every production sidecar: + +- `$schema` — URL of the JSON Schema (currently unreachable but referenced) +- `knows_version` — e.g., `"0.9.0"` +- `record_id` — `knows:generated/{slug}/{version}` for published records +- `profile` — e.g., `"paper@1"` +- `subject_ref` — points to the artifact representing the paper itself +- `summary` — one-paragraph overview +- `license` — e.g., `"CC-BY-4.0"` +- `actions` — list (typically empty for paper profile) +- `version` — `{spec, record, source}` +- `freshness` — `{as_of, update_policy}` + +## Statement Extra Fields (v0.9.0) + +Not mentioned in upstream `knows.md`: + +- `modality` — `empirical` \| `theoretical` \| `descriptive` +- `about_ref` — what the statement is about (usually the subject artifact) +- `status` — observed value `asserted` +- `source_anchors` — list of `{representation_ref, locator_type, locator}` pointers +- `provenance` — per-statement provenance block (same shape as top-level) + +## Artifact Extra Fields (v0.9.0) + +- `identifiers` — `{url, doi, ...}` (omit unknown keys) +- `representations` — list of `{id: rep:..., media_type, locator: {type, value}}` + +## Predicate Vocabulary (Verified in Production) + +Across 330 relations in 15 production sidecars, the following predicates +appear (descending frequency): + +``` +supported_by, depends_on, evaluates_on, limited_by, documents, +uses, challenged_by, cites, implements, used_by, defines +``` + +Other predicates from upstream natural-language doc, valid per the spec but +not yet observed in our sample: `extends`, `contradicts`, `critiques`, +`generalizes`, `specializes`, `introduces`, `refutes`, `replicates`. 
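+
+A vocabulary check built on this split might look like the sketch below: observed predicates pass, spec-valid-but-unobserved ones warn, and past-tense forms get the tense warning described in `sidecar-spec.md`. The function name and return scheme are assumptions; the real `oma scholar lint` implementation may differ:
+
+```typescript
+// Predicates seen in the 330-relation production sample (hard pass).
+const OBSERVED = new Set([
+  "supported_by", "depends_on", "evaluates_on", "limited_by", "documents",
+  "uses", "challenged_by", "cites", "implements", "used_by", "defines",
+]);
+// Valid per the upstream natural-language doc, not yet observed.
+const SPEC_ONLY = new Set([
+  "extends", "contradicts", "critiques", "generalizes",
+  "specializes", "introduces", "refutes", "replicates",
+]);
+
+function checkPredicate(p: string): "ok" | "warn" | "error" {
+  if (OBSERVED.has(p)) return "ok";
+  if (SPEC_ONLY.has(p)) return "warn"; // valid upstream, unverified in production
+  if (/ed_on$/.test(p)) return "warn"; // wrong tense, e.g. evaluated_on
+  return "error"; // unknown predicate
+}
+```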
+ +## Vocabulary Survey (15 production sidecars / 2026-04-25) + +Field enums verified by frequency, all production AI-generated (`origin: machine`): + +| Field | Verified values | +|-------|-----------------| +| `knows_version` | `0.9.0` (only one observed) | +| `profile` | `paper@1` (only one observed) | +| `provenance.method` | `extraction` (only one observed) | +| `coverage.statements` | `exhaustive`, `main_claims_only` | +| `coverage.evidence` | `key_evidence_only` | +| `statement_type` | `claim`, `method`, `limitation`, `assumption`, `definition`, `question` | +| `statement.modality` | `descriptive`, `empirical`, `theoretical` | +| `statement.status` | `asserted` (only value across 252 statements) | +| `evidence_type` | `table_result`, `figure`, `proof`, `observation`, `experiment_run`, `case_study`, `citation_backed` | +| `artifact_type` | `paper`, `dataset`, `benchmark`, `repository`, `model`, `software`, `other` | +| `artifact.role` | `subject`, `supporting`, `cited` | diff --git a/.agents/workflows/brainstorm.md b/.agents/workflows/brainstorm.md index 73d75e0..109bdd2 100644 --- a/.agents/workflows/brainstorm.md +++ b/.agents/workflows/brainstorm.md @@ -66,7 +66,37 @@ Each section requires explicit user approval before moving to the next. --- -## Step 5: Save Design Document +## Step 5: Blind Review Round + +Before saving the design, run an independent critique round to surface suppressed issues. + +Groupthink and authority bias hide real gaps. A blind round where each perspective critiques independently — without seeing others' feedback — surfaces issues the consensus round would have buried. + +**Procedure:** + +1. **Select 4-8 independent reviewer lenses** appropriate to the design domain. Examples: + - Software skill: backend, frontend, devops, security, QA, CTO, end-user, docs-writer + - Infra skill: network, system, security, finops, SRE, compliance, CTO + - Customize to the feature's stakeholder map. + +2. **Independent critique**: for each lens, produce 2-3 concrete criticisms of the Step 4 design without reference to other lenses' feedback. Cover missing items in their specialty, overlaps/redundancies, naming issues, implementation risks. + +3. **Consolidate and dedupe** into a unique issue list. Classify: + - **Tier 1** — critical gap, must resolve before save + - **Tier 2** — enhancement, should resolve or explicitly defer + - **Tier 3** — nice-to-have, defer to next version + +4. **Check for suppressed compromises**: for each prior design decision where a reviewer voted `⚠️→✅`, verify the objection was answered on principle (regulatory, consumer, architectural) rather than overridden by majority. Restore any principled objection that was suppressed. + +5. **Resolve Tier 1 issues** by updating Step 4 design — either new sections in existing files, new files, or explicit out-of-scope declarations. + +6. **Present resolved design** to the user for final approval before Step 6. + +Skip only if the design is trivially small (1-2 files, low stakes). Otherwise mandatory. + +--- + +## Step 6: Save Design Document // turbo Save the approved design: @@ -75,7 +105,7 @@ Save the approved design: --- -## Step 6: Transition to Planning +## Step 7: Transition to Planning Inform the user that the design phase is complete and suggest: > "Design approved. Run `/plan` to decompose this into actionable tasks." 
diff --git a/.agents/workflows/orchestrate.md b/.agents/workflows/orchestrate.md index d9898eb..9d73df4 100644 --- a/.agents/workflows/orchestrate.md +++ b/.agents/workflows/orchestrate.md @@ -164,8 +164,15 @@ bash .agents/skills/oma-orchestrator/scripts/verify.sh {agent-type} {workspace} ``` - PASS (exit 0): accept result. If Quality Score is active, measure and record in Experiment Ledger. -- FAIL (exit 1): re-spawn with error context (max 2 retries). -- FAIL (after 2 retries): Activate **Exploration Loop** (load `exploration-loop.md` per `context-loading.md`): +- FAIL (exit 1): Before re-spawning, apply the Review Loop termination check: + + > **Review Loop termination conditions (OR — whichever fires first wins)** + > 1. Retry count for this agent has reached the configured maximum (default: 2 retries). Do not start another retry cycle. + > 2. Session cost cap exceeded: call `checkCap(sessionId, loadQuotaCap())` from `cli/io/session-cost.ts`. If `exceeded === true`, print `formatPromptMessage(result)` to the user and stop the loop immediately — save the current agent's partial results before stopping, then report early termination due to quota. Do not spawn the next retry or any remaining agents in the tier. + > + > If neither condition is met, re-spawn the agent with error context and increment the retry counter. + +- FAIL (after 2 retries, and cost cap not yet exceeded): Activate **Exploration Loop** (load `exploration-loop.md` per `context-loading.md`): 1. Generate 2-3 alternative hypotheses for the failing task 2. Spawn the **same agent type** with different hypothesis prompts (parallel, separate workspaces) 3. Score each result with Quality Score (if available) diff --git a/.agents/workflows/ultrawork.md b/.agents/workflows/ultrawork.md index 73f9ae0..9d8e611 100644 --- a/.agents/workflows/ultrawork.md +++ b/.agents/workflows/ultrawork.md @@ -200,9 +200,15 @@ If baseline was measured at Step 5.2: **On gate pass**: Use memory edit tool to record phase completion in `session-ultrawork.md` -**Gate failure (1st time)** → Return to Step 5, fix implementation issues, and repeat VERIFY phase. +**Gate failure (1st time)** → Before re-spawning for the next VERIFY cycle, check the session cost cap: -**Gate failure (2nd time on same issue)** → Activate **Exploration Loop**: +> **Review Loop termination conditions (OR — whichever fires first wins)** +> 1. Gate failure count has reached the configured maximum iterations (default: 5 total VERIFY + REFINE cycles). Do not start another cycle. +> 2. Session cost cap exceeded: call `checkCap(sessionId, loadQuotaCap())` from `cli/io/session-cost.ts`. If `exceeded === true`, print `formatPromptMessage(result)` to the user and stop the loop immediately — save all current step results before stopping, then report to the user that the loop was terminated early due to quota. +> +> If neither condition is met, return to Step 5 and continue. + +**Gate failure (2nd time on same issue, and termination conditions not yet met)** → Activate **Exploration Loop**: 1. Load `exploration-loop.md` (conditional, per `context-loading.md`) 2. Generate 2-3 alternative hypotheses using Exploration Decision template (`reasoning-templates.md` #6) 3. 
Experiment each approach sequentially (git stash per attempt) @@ -276,7 +282,13 @@ If baseline was measured at Step 5.2: **On gate pass**: Use memory edit tool to record phase completion in `session-ultrawork.md` -**Gate failure → Re-spawn Debug Agent with specific issues and repeat until GATE passes.** +**Gate failure → Before re-spawning the Debug Agent, apply the same termination check:** + +> **Review Loop termination conditions (OR — whichever fires first wins)** +> 1. Total REFINE failure count has reached the configured maximum iterations (default: 5 cycles across all phases). Do not start another cycle. +> 2. Session cost cap exceeded: call `checkCap(sessionId, loadQuotaCap())` from `cli/io/session-cost.ts`. If `exceeded === true`, print `formatPromptMessage(result)` to the user and stop — save current step results before stopping, then report early termination due to quota. +> +> If neither condition is met, re-spawn the Debug Agent with specific issues and repeat until GATE passes. **Skip conditions**: Simple tasks < 50 lines diff --git a/.claude/agents/architecture-reviewer.md b/.claude/agents/architecture-reviewer.md index 66f9286..155bd24 100644 --- a/.claude/agents/architecture-reviewer.md +++ b/.claude/agents/architecture-reviewer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/claude.md`: - Write results to project root `.agents/results/result-architecture.md` (orchestrated: `result-architecture-{sessionId}.md`) - Include: status, recommendation summary, tradeoffs, risks, validation steps, artifacts created +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY recommendations or structural edits, output this block: @@ -36,6 +38,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT change architecture or code +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.claude/agents/backend-engineer.md b/.claude/agents/backend-engineer.md index c5219e4..2515eac 100644 --- a/.claude/agents/backend-engineer.md +++ b/.claude/agents/backend-engineer.md @@ -19,6 +19,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/claude.md`: - Write results to project root `.agents/results/result-backend.md` (orchestrated: `result-backend-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -35,6 +37,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT write code +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.claude/agents/db-engineer.md b/.claude/agents/db-engineer.md index 7e4b0ec..15d2f96 100644 --- a/.claude/agents/db-engineer.md +++ b/.claude/agents/db-engineer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/claude.md`: - Write results to project root `.agents/results/result-db.md` (orchestrated: `result-db-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -32,6 +34,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.claude/agents/debug-investigator.md b/.claude/agents/debug-investigator.md index 
608664d..5b05558 100644 --- a/.claude/agents/debug-investigator.md +++ b/.claude/agents/debug-investigator.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/claude.md`: - Write results to project root `.agents/results/result-debug.md` (orchestrated: `result-debug-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -36,6 +38,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT write code +<!-- CHARTER_CHECK_END --> ## Diagnosis Process diff --git a/.claude/agents/frontend-engineer.md b/.claude/agents/frontend-engineer.md index c6f5a7f..1013450 100644 --- a/.claude/agents/frontend-engineer.md +++ b/.claude/agents/frontend-engineer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/claude.md`: - Write results to project root `.agents/results/result-frontend.md` (orchestrated: `result-frontend-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -32,6 +34,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.claude/agents/mobile-engineer.md b/.claude/agents/mobile-engineer.md index 2e2954e..c77ff81 100644 --- a/.claude/agents/mobile-engineer.md +++ b/.claude/agents/mobile-engineer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/claude.md`: - Write results to project root `.agents/results/result-mobile.md` (orchestrated: `result-mobile-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -32,6 +34,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.claude/agents/pm-planner.md b/.claude/agents/pm-planner.md index 07eb716..1342aba 100644 --- a/.claude/agents/pm-planner.md +++ b/.claude/agents/pm-planner.md @@ -19,6 +19,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/claude.md`: - Write results to project root `.agents/results/result-pm.md` (orchestrated: `result-pm-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY planning work, output this block: @@ -35,6 +37,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT proceed +<!-- CHARTER_CHECK_END --> ## Planning Process diff --git a/.claude/agents/qa-reviewer.md b/.claude/agents/qa-reviewer.md index 6c719fd..2fda2e2 100644 --- a/.claude/agents/qa-reviewer.md +++ b/.claude/agents/qa-reviewer.md @@ -4,7 +4,6 @@ description: OWASP security, performance, accessibility, code quality review age tools: Read, Grep, Glob, Bash model: sonnet maxTurns: 15 -effort: low skills: - oma-qa --- @@ -20,6 +19,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/claude.md`: - Write results to project root `.agents/results/result-qa.md` (orchestrated: `result-qa-{sessionId}.md`) - Include: status, 
summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before starting review, output this block: @@ -32,6 +33,7 @@ CHARTER_CHECK: - Must NOT do: modify source code, skip severity levels, report unverified findings - Success criteria: {all files reviewed, findings with file:line references} ``` +<!-- CHARTER_CHECK_END --> ## Review Priority Order diff --git a/.claude/agents/tf-infra-engineer.md b/.claude/agents/tf-infra-engineer.md index 0344c89..9922df5 100644 --- a/.claude/agents/tf-infra-engineer.md +++ b/.claude/agents/tf-infra-engineer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/claude.md`: - Write results to project root `.agents/results/result-tf-infra.md` (orchestrated: `result-tf-infra-{sessionId}.md`) - Include: status, summary, files changed, validation results, plan/apply notes, acceptance checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY infrastructure changes, output this block: @@ -36,6 +38,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT apply destructive changes +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.claude/hooks/hud.ts b/.claude/hooks/hud.ts index 597b95c..73f0ac4 100644 --- a/.claude/hooks/hud.ts +++ b/.claude/hooks/hud.ts @@ -9,163 +9,166 @@ * stdout: ANSI-colored status text */ -import { existsSync, readdirSync, readFileSync } from "node:fs" -import { join } from "node:path" -import type { ModeState } from "./types.ts" +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import type { ModeState } from "./types.ts"; // ── ANSI Colors ─────────────────────────────────────────────── -const dim = (s: string) => `\x1b[2m${s}\x1b[22m` -const bold = (s: string) => `\x1b[1m${s}\x1b[22m` -const green = (s: string) => `\x1b[32m${s}\x1b[39m` -const yellow = (s: string) => `\x1b[33m${s}\x1b[39m` -const red = (s: string) => `\x1b[31m${s}\x1b[39m` -const cyan = (s: string) => `\x1b[36m${s}\x1b[39m` +const dim = (s: string) => `\x1b[2m${s}\x1b[22m`; +const bold = (s: string) => `\x1b[1m${s}\x1b[22m`; +const green = (s: string) => `\x1b[32m${s}\x1b[39m`; +const yellow = (s: string) => `\x1b[33m${s}\x1b[39m`; +const red = (s: string) => `\x1b[31m${s}\x1b[39m`; +const cyan = (s: string) => `\x1b[36m${s}\x1b[39m`; function colorByThreshold(value: number, text: string): string { - if (value >= 85) return red(text) - if (value >= 70) return yellow(text) - return green(text) + if (value >= 85) return red(text); + if (value >= 70) return yellow(text); + return green(text); } // ── Stdin Parsing ───────────────────────────────────────────── interface RateLimit { - used_percentage?: number - resets_at?: string + used_percentage?: number; + resets_at?: string; } interface StatuslineStdin { - cwd?: string - model?: { id?: string; display_name?: string } + cwd?: string; + model?: { id?: string; display_name?: string }; context_window?: { - context_window_size?: number - used_percentage?: number - } + context_window_size?: number; + used_percentage?: number; + }; cost?: { - total_cost_usd?: number - total_lines_added?: number - total_lines_removed?: number - total_duration_ms?: number - } + total_cost_usd?: number; + total_lines_added?: number; + total_lines_removed?: number; + total_duration_ms?: number; + }; rate_limits?: { - five_hour?: RateLimit - seven_day?: RateLimit - } + five_hour?: RateLimit; + 
seven_day?: RateLimit; + }; } function readStdin(): StatuslineStdin { try { - return JSON.parse(readFileSync("/dev/stdin", "utf-8")) + return JSON.parse(readFileSync("/dev/stdin", "utf-8")); } catch { - return {} + return {}; } } // ── Active Workflow Detection ───────────────────────────────── function getActiveWorkflow(projectDir: string): ModeState | null { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return null + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return null; try { for (const file of readdirSync(stateDir)) { - if (!file.endsWith(".json") || !file.includes("-state-")) continue - const content = readFileSync(join(stateDir, file), "utf-8") - const state: ModeState = JSON.parse(content) + if (!file.endsWith(".json") || !file.includes("-state-")) continue; + const content = readFileSync(join(stateDir, file), "utf-8"); + const state: ModeState = JSON.parse(content); // Skip stale (>2h) - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - if (elapsed > 2 * 60 * 60 * 1000) continue + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + if (elapsed > 2 * 60 * 60 * 1000) continue; - return state + return state; } } catch { // ignore } - return null + return null; } // ── Model Name Shortener ────────────────────────────────────── function shortModel(model?: { id?: string; display_name?: string }): string { - const name = model?.display_name || model?.id || "" - if (!name) return "" + const name = model?.display_name || model?.id || ""; + if (!name) return ""; // "Claude Opus 4.6 (1M context)" → "Opus 4.6" - const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i) - if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}` - return name.split("/").pop()?.slice(0, 15) || "" + const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i); + if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}`; + return name.split("/").pop()?.slice(0, 15) || ""; } // ── Rate Limit Helpers ─────────────────────────────────────── function formatCountdown(resetsAt: string): string { - const remaining = new Date(resetsAt).getTime() - Date.now() - if (remaining <= 0) return "" - const h = Math.floor(remaining / 3_600_000) - const m = Math.floor((remaining % 3_600_000) / 60_000) - return h > 0 ? `${h}h${m}m` : `${m}m` + const remaining = new Date(resetsAt).getTime() - Date.now(); + if (remaining <= 0) return ""; + const h = Math.floor(remaining / 3_600_000); + const m = Math.floor((remaining % 3_600_000) / 60_000); + return h > 0 ? `${h}h${m}m` : `${m}m`; } function formatRateLimit(label: string, rl?: RateLimit): string | null { - if (!rl || rl.used_percentage == null) return null - const pct = Math.round(rl.used_percentage) - const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : "" - const text = countdown ? `${label}:${pct}%(${countdown})` : `${label}:${pct}%` - return colorByThreshold(pct, text) + if (!rl || rl.used_percentage == null) return null; + const pct = Math.round(rl.used_percentage); + const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : ""; + const text = countdown + ? 
`${label}:${pct}%(${countdown})` + : `${label}:${pct}%`; + return colorByThreshold(pct, text); } // ── Main ────────────────────────────────────────────────────── function main() { - const input = readStdin() - const projectDir = process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd() - const parts: string[] = [] + const input = readStdin(); + const projectDir = + process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd(); + const parts: string[] = []; // 1. OMA label - parts.push(bold(cyan("[OMA]"))) + parts.push(bold(cyan("[OMA]"))); // 2. Model - const model = shortModel(input.model) - if (model) parts.push(dim(model)) + const model = shortModel(input.model); + if (model) parts.push(dim(model)); // 3. Context % - const ctxPct = input.context_window?.used_percentage + const ctxPct = input.context_window?.used_percentage; if (ctxPct != null) { - parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)) + parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)); } // 4. Session cost - const cost = input.cost?.total_cost_usd + const cost = input.cost?.total_cost_usd; if (cost != null && cost > 0) { - parts.push(dim(`$${cost.toFixed(2)}`)) + parts.push(dim(`$${cost.toFixed(2)}`)); } // 5. Rate limits (5h / 7d) - const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour) - const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day) + const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour); + const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day); if (rl5 || rl7) { - parts.push([rl5, rl7].filter(Boolean).join(dim(" "))) + parts.push([rl5, rl7].filter(Boolean).join(dim(" "))); } // 6. Lines changed - const added = input.cost?.total_lines_added - const removed = input.cost?.total_lines_removed + const added = input.cost?.total_lines_added; + const removed = input.cost?.total_lines_removed; if (added || removed) { - const diffParts: string[] = [] - if (added) diffParts.push(green(`+${added}`)) - if (removed) diffParts.push(red(`-${removed}`)) - parts.push(diffParts.join(dim("/"))) + const diffParts: string[] = []; + if (added) diffParts.push(green(`+${added}`)); + if (removed) diffParts.push(red(`-${removed}`)); + parts.push(diffParts.join(dim("/"))); } // 7. 
Active workflow - const workflow = getActiveWorkflow(projectDir) + const workflow = getActiveWorkflow(projectDir); if (workflow) { - const label = `${workflow.workflow}:${workflow.reinforcementCount}` - parts.push(yellow(label)) + const label = `${workflow.workflow}:${workflow.reinforcementCount}`; + parts.push(yellow(label)); } - process.stdout.write(parts.join(dim(" │ "))) + process.stdout.write(parts.join(dim(" │ "))); } -main() +main(); diff --git a/.claude/hooks/keyword-detector.ts b/.claude/hooks/keyword-detector.ts index 0ce0d0e..e838a0a 100644 --- a/.claude/hooks/keyword-detector.ts +++ b/.claude/hooks/keyword-detector.ts @@ -12,59 +12,205 @@ * exit 0 = always (allow) */ -import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { type ModeState, makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { + type ModeState, + makePromptOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +// ── Guard 1: UserPromptSubmit-only trigger ──────────────────── +// Hook event names that represent genuine user input (not agent responses) +const VALID_USER_EVENTS = new Set([ + "UserPromptSubmit", + "beforeSubmitPrompt", // Cursor + "BeforeAgent", // Gemini (fires before agent processes user prompt) +]); + +/** + * Returns true if the hook input indicates this is a genuine user prompt, + * not an agent-generated response. Prevents re-trigger loops. + */ +export function isGenuineUserPrompt(input: Record<string, unknown>): boolean { + const event = input.hook_event_name as string | undefined; + // If event is explicitly provided, validate it + if (event !== undefined) { + return VALID_USER_EVENTS.has(event); + } + // No event field — assume genuine (backward compat with vendors that omit it) + return true; +} + +// ── Guard 3: Reinforcement suppression ─────────────────────── + +const REINFORCEMENT_WINDOW_MS = 60_000; // 60 seconds +const REINFORCEMENT_MAX_COUNT = 2; // allow up to 2, suppress 3rd+ + +export interface KeywordDetectorState { + triggers: Record< + string, + { + lastTriggeredAt: string; // ISO timestamp + count: number; + } + >; +} + +function getKwStateFilePath(projectDir: string): string { + const dir = join(projectDir, ".agents", "state"); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + return join(dir, "keyword-detector-state.json"); +} + +/** + * Load the keyword-detector reinforcement state from disk. + * Resets gracefully if the file is missing or corrupt. + */ +export function loadKwState(projectDir: string): KeywordDetectorState { + const filePath = getKwStateFilePath(projectDir); + if (!existsSync(filePath)) return { triggers: {} }; + try { + const raw = readFileSync(filePath, "utf-8"); + const parsed = JSON.parse(raw) as unknown; + if ( + typeof parsed === "object" && + parsed !== null && + "triggers" in parsed && + typeof (parsed as Record<string, unknown>).triggers === "object" + ) { + return parsed as KeywordDetectorState; + } + return { triggers: {} }; + } catch { + // Corrupt file — reset + return { triggers: {} }; + } +} + +/** + * Save reinforcement state to disk. 
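// Guard 1 in practice: a minimal sketch of isGenuineUserPrompt, assuming
// this snippet sits next to keyword-detector.ts so the Bun-style ".ts"
// import resolves. Agent-side events fall through to false; a missing
// event field is treated as genuine for vendors that omit it.
import { isGenuineUserPrompt } from "./keyword-detector.ts";

console.log(isGenuineUserPrompt({ hook_event_name: "UserPromptSubmit" })); // true
console.log(isGenuineUserPrompt({ hook_event_name: "beforeSubmitPrompt" })); // true (Cursor)
console.log(isGenuineUserPrompt({ hook_event_name: "Stop" })); // false: agent-side event
console.log(isGenuineUserPrompt({})); // true: backward-compat path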
+ */ +export function saveKwState( + projectDir: string, + state: KeywordDetectorState, +): void { + try { + const filePath = getKwStateFilePath(projectDir); + writeFileSync(filePath, JSON.stringify(state, null, 2)); + } catch { + // Non-fatal — reinforcement suppression is best-effort + } +} + +/** + * Returns true if the keyword should be suppressed due to reinforcement loop. + * A keyword is suppressed if it was triggered >= REINFORCEMENT_MAX_COUNT times + * within the last REINFORCEMENT_WINDOW_MS milliseconds. + */ +export function isReinforcementSuppressed( + state: KeywordDetectorState, + keyword: string, + nowMs?: number, +): boolean { + const now = nowMs ?? Date.now(); + const entry = state.triggers[keyword]; + if (!entry) return false; + const lastMs = new Date(entry.lastTriggeredAt).getTime(); + if (Number.isNaN(lastMs)) return false; + const withinWindow = now - lastMs < REINFORCEMENT_WINDOW_MS; + return withinWindow && entry.count >= REINFORCEMENT_MAX_COUNT; +} + +/** + * Record a keyword trigger in the reinforcement state. + * Resets count if the previous trigger was outside the window. + */ +export function recordKwTrigger( + state: KeywordDetectorState, + keyword: string, + nowMs?: number, +): KeywordDetectorState { + const now = nowMs ?? Date.now(); + const entry = state.triggers[keyword]; + let count = 1; + if (entry) { + const lastMs = new Date(entry.lastTriggeredAt).getTime(); + const withinWindow = + !Number.isNaN(lastMs) && now - lastMs < REINFORCEMENT_WINDOW_MS; + count = withinWindow ? entry.count + 1 : 1; + } + return { + ...state, + triggers: { + ...state.triggers, + [keyword]: { + lastTriggeredAt: new Date(now).toISOString(), + count, + }, + }, + }; +} // ── Vendor Detection ────────────────────────────────────────── function inferVendorFromScriptPath(): Vendor | null { - const path = import.meta.path - if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor" - if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen" - if (path.includes(`${join(".claude", "hooks")}`)) return "claude" - if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini" - if (path.includes(`${join(".codex", "hooks")}`)) return "codex" - return null + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; } function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - const byScriptPath = inferVendorFromScriptPath() - if (byScriptPath) return byScriptPath - if (event === "BeforeAgent") return "gemini" - if (event === "beforeSubmitPrompt") return "cursor" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeAgent") return "gemini"; + if (event === "beforeSubmitPrompt") return "cursor"; if (event === "UserPromptSubmit") { // Codex uses snake_case session_id, Claude uses camelCase sessionId - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } // Qwen Code sets QWEN_PROJECT_DIR; Claude sets CLAUDE_PROJECT_DIR - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" 
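// A deterministic walk-through of the Guard 3 window, using the nowMs
// parameter the helpers expose for exactly this kind of testing (import
// path assumes the snippet lives beside keyword-detector.ts):
import {
  isReinforcementSuppressed,
  recordKwTrigger,
  type KeywordDetectorState,
} from "./keyword-detector.ts";

let state: KeywordDetectorState = { triggers: {} };
const t0 = Date.now();
state = recordKwTrigger(state, "ultrawork", t0); // 1st trigger
state = recordKwTrigger(state, "ultrawork", t0 + 10_000); // 2nd, same 60s window
console.log(isReinforcementSuppressed(state, "ultrawork", t0 + 20_000)); // true: a 3rd is suppressed
console.log(isReinforcementSuppressed(state, "ultrawork", t0 + 80_000)); // false: window expired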
+ if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── Config Loading ──────────────────────────────────────────── @@ -73,74 +219,83 @@ interface TriggerConfig { workflows: Record< string, { - persistent: boolean - keywords: Record<string, string[]> + persistent: boolean; + keywords: Record<string, string[]>; } - > - informationalPatterns: Record<string, string[]> - excludedWorkflows: string[] - cjkScripts: string[] - extensionRouting?: Record<string, string[]> + >; + informationalPatterns: Record<string, string[]>; + excludedWorkflows: string[]; + cjkScripts: string[]; + extensionRouting?: Record<string, string[]>; } function loadConfig(): TriggerConfig { - const configPath = join(dirname(import.meta.path), "triggers.json") - return JSON.parse(readFileSync(configPath, "utf-8")) + const configPath = join(dirname(import.meta.path), "triggers.json"); + return JSON.parse(readFileSync(configPath, "utf-8")); } function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? "en"; } catch { - return "en" + return "en"; } } // ── Pattern Builder ─────────────────────────────────────────── export function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } -export function buildPatterns(keywords: Record<string, string[]>, lang: string, cjkScripts: string[]): RegExp[] { +export function buildPatterns( + keywords: Record<string, string[]>, + lang: string, + cjkScripts: string[], +): RegExp[] { const allKeywords = [ ...(keywords["*"] ?? []), ...(keywords.en ?? []), ...(lang !== "en" ? (keywords[lang] ?? 
[]) : []), - ] + ]; return allKeywords.map((kw) => { - const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+") - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) { - return new RegExp(escaped, "i") + const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+"); + if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) { + return new RegExp(escaped, "i"); } - return new RegExp(`\\b${escaped}\\b`, "i") - }) + return new RegExp(`\\b${escaped}\\b`, "i"); + }); } -function buildInformationalPatterns(config: TriggerConfig, lang: string): RegExp[] { - const patterns = [...(config.informationalPatterns.en ?? [])] +function buildInformationalPatterns( + config: TriggerConfig, + lang: string, +): RegExp[] { + const patterns = [...(config.informationalPatterns.en ?? [])]; if (lang !== "en") { - patterns.push(...(config.informationalPatterns[lang] ?? [])) + patterns.push(...(config.informationalPatterns[lang] ?? [])); } return patterns.map((p) => { - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (/[^\x00-\x7F]/.test(p)) return new RegExp(escapeRegex(p), "i") - return new RegExp(`\\b${escapeRegex(p)}\\b`, "i") - }) + if (/[^\p{ASCII}]/u.test(p)) return new RegExp(escapeRegex(p), "i"); + return new RegExp(`\\b${escapeRegex(p)}\\b`, "i"); + }); } // ── Filters ─────────────────────────────────────────────────── -export function isInformationalContext(prompt: string, matchIndex: number, infoPatterns: RegExp[]): boolean { - const windowStart = Math.max(0, matchIndex - 60) - const window = prompt.slice(windowStart, matchIndex + 60) - return infoPatterns.some((p) => p.test(window)) +export function isInformationalContext( + prompt: string, + matchIndex: number, + infoPatterns: RegExp[], +): boolean { + const windowStart = Math.max(0, matchIndex - 60); + const window = prompt.slice(windowStart, matchIndex + 60); + return infoPatterns.some((p) => p.test(window)); } /** @@ -148,12 +303,16 @@ export function isInformationalContext(prompt: string, matchIndex: number, infoP * only match keywords in the first N chars of the user's prompt. * Keywords deep in the prompt are likely from pasted content, not user intent. 
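// What buildPatterns yields for ASCII versus CJK keywords, as a sketch
// (assumes cjkScripts includes "ko", matching DEFAULT_CJK_SCRIPTS in
// skill-injector.ts; import path assumes the snippet sits beside the hook):
import { buildPatterns } from "./keyword-detector.ts";

const [plan] = buildPatterns({ en: ["plan"] }, "en", ["ko", "ja", "zh"]);
console.log(plan.test("plan this")); // true
console.log(plan.test("replanning")); // false: \b blocks substring hits
const [ko] = buildPatterns({ ko: ["계획"] }, "ko", ["ko", "ja", "zh"]);
console.log(ko.test("계획 세워줘")); // true: no \b, CJK text has no word boundaries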
*/ -const PERSISTENT_MATCH_LIMIT = 200 - -export function isPastedContent(matchIndex: number, isPersistent: boolean, promptLength: number): boolean { - if (!isPersistent) return false - if (promptLength <= PERSISTENT_MATCH_LIMIT) return false - return matchIndex > PERSISTENT_MATCH_LIMIT +const PERSISTENT_MATCH_LIMIT = 200; + +export function isPastedContent( + matchIndex: number, + isPersistent: boolean, + promptLength: number, +): boolean { + if (!isPersistent) return false; + if (promptLength <= PERSISTENT_MATCH_LIMIT) return false; + return matchIndex > PERSISTENT_MATCH_LIMIT; } /** @@ -180,11 +339,11 @@ const QUESTION_PATTERNS: RegExp[] = [ /^.*\banything worth\b/i, /^.*\bwhat.*(feature|difference|reference)/i, /^.*\bcompare\b/i, -] +]; export function isAnalyticalQuestion(prompt: string): boolean { - const firstLine = prompt.split("\n")[0].trim() - return QUESTION_PATTERNS.some((p) => p.test(firstLine)) + const firstLine = prompt.split("\n")[0].trim(); + return QUESTION_PATTERNS.some((p) => p.test(firstLine)); } export function stripCodeBlocks(text: string): string { @@ -193,11 +352,11 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") // unclosed fenced blocks (strip to end) .replace(/`{3,}[^`]*`{3,}/g, "") // single-line fenced blocks (```...```) .replace(/`[^`\n]+`/g, "") // inline code (no newlines allowed) - .replace(/"[^"\n]*"/g, "") // quoted strings + .replace(/"[^"\n]*"/g, ""); // quoted strings } export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } // ── Extension Detection ────────────────────────────────────── @@ -228,62 +387,70 @@ const EXCLUDE_EXTS = new Set([ "eot", "map", "d", -]) +]); export function detectExtensions(prompt: string): string[] { - const extPattern = /\.([a-zA-Z]{1,12})\b/g - const extensions = new Set<string>() - let match: RegExpExecArray | null - // biome-ignore lint/suspicious/noAssignInExpressions: standard regex.exec loop pattern - while ((match = extPattern.exec(prompt)) !== null) { - const ext = match[1].toLowerCase() + const extPattern = /\.([a-zA-Z]{1,12})\b/g; + const extensions = new Set<string>(); + for (const match of prompt.matchAll(extPattern)) { + const ext = match[1].toLowerCase(); if (!EXCLUDE_EXTS.has(ext)) { - extensions.add(ext) + extensions.add(ext); } } - return [...extensions] + return [...extensions]; } -export function resolveAgentFromExtensions(extensions: string[], routing: Record<string, string[]>): string | null { - if (extensions.length === 0) return null +export function resolveAgentFromExtensions( + extensions: string[], + routing: Record<string, string[]>, +): string | null { + if (extensions.length === 0) return null; - const scores = new Map<string, number>() + const scores = new Map<string, number>(); for (const ext of extensions) { for (const [agent, agentExts] of Object.entries(routing)) { if (agentExts.includes(ext)) { - scores.set(agent, (scores.get(agent) ?? 0) + 1) + scores.set(agent, (scores.get(agent) ?? 
0) + 1); } } } - if (scores.size === 0) return null + if (scores.size === 0) return null; - let best: string | null = null - let bestScore = 0 + let best: string | null = null; + let bestScore = 0; for (const [agent, score] of scores) { if (score > bestScore) { - bestScore = score - best = agent + bestScore = score; + best = agent; } } - return best + return best; } // ── State Management ────────────────────────────────────────── function getStateDir(projectDir: string): string { - const dir = join(projectDir, ".agents", "state") - if (!existsSync(dir)) mkdirSync(dir, { recursive: true }) - return dir + const dir = join(projectDir, ".agents", "state"); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + return dir; } -function activateMode(projectDir: string, workflow: string, sessionId: string): void { +function activateMode( + projectDir: string, + workflow: string, + sessionId: string, +): void { const state: ModeState = { workflow, sessionId, activatedAt: new Date().toISOString(), reinforcementCount: 0, - } - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) + }; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Deactivation Detection ─────────────────────────────────── @@ -300,27 +467,33 @@ export const DEACTIVATION_PHRASES: Record<string, string[]> = { ru: ["воркфлоу завершён", "рабочий процесс завершён"], nl: ["workflow voltooid", "workflow klaar"], pl: ["workflow zakończony", "workflow ukończony"], -} +}; export function isDeactivationRequest(prompt: string, lang: string): boolean { - const phrases = [...(DEACTIVATION_PHRASES.en ?? []), ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? []) : [])] - const lower = prompt.toLowerCase() - return phrases.some((phrase) => lower.includes(phrase.toLowerCase())) + const phrases = [ + ...(DEACTIVATION_PHRASES.en ?? []), + ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? 
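// Extension routing in two steps. The routing table below is invented for
// illustration; the real mapping lives under extensionRouting in
// triggers.json:
import {
  detectExtensions,
  resolveAgentFromExtensions,
} from "./keyword-detector.ts";

const exts = detectExtensions("fix api.ts and schema.sql, skip fonts.eot");
console.log(exts); // ["ts", "sql"]: "eot" is in EXCLUDE_EXTS
const routing = { backend: ["ts", "sql"], frontend: ["tsx", "css"] };
console.log(resolveAgentFromExtensions(exts, routing)); // "backend": two hits beat zero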
[]) : []), + ]; + const lower = prompt.toLowerCase(); + return phrases.some((phrase) => lower.includes(phrase.toLowerCase())); } -export function deactivateAllPersistentModes(projectDir: string, sessionId?: string): void { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return +export function deactivateAllPersistentModes( + projectDir: string, + sessionId?: string, +): void { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return; try { - const files = readdirSync(stateDir) + const files = readdirSync(stateDir); for (const file of files) { // Match session-scoped state files: {workflow}-state-{sessionId}.json if (sessionId) { if (file.endsWith(`-state-${sessionId}.json`)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } else if (/-state-/.test(file) && file.endsWith(".json")) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { @@ -331,55 +504,69 @@ export function deactivateAllPersistentModes(projectDir: string, sessionId?: str // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? "" + // Guard 1: Only process genuine user prompts — skip agent-generated content + if (!isGenuineUserPrompt(input)) process.exit(0); + + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? 
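// isDeactivationRequest is a case-insensitive substring check over the
// English phrases plus the configured language's phrases, so it also fires
// mid-sentence; coarse by design, since the only consequence is clearing
// state. Both sample phrases appear in the stop-hook's own guidance below.
import { isDeactivationRequest } from "./keyword-detector.ts";

console.log(isDeactivationRequest("ok, workflow done, thanks", "en")); // true
console.log(isDeactivationRequest("워크플로우 완료", "ko")); // true
console.log(isDeactivationRequest("is the workflow done yet?", "en")); // true: substring match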
""; - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); - const config = loadConfig() - const lang = detectLanguage(projectDir) + const config = loadConfig(); + const lang = detectLanguage(projectDir); // Check for deactivation request before workflow detection if (isDeactivationRequest(prompt, lang)) { - deactivateAllPersistentModes(projectDir, sessionId) - process.exit(0) + deactivateAllPersistentModes(projectDir, sessionId); + process.exit(0); } - const infoPatterns = buildInformationalPatterns(config, lang) - const cleaned = stripCodeBlocks(prompt) - const excluded = new Set(config.excludedWorkflows) + const infoPatterns = buildInformationalPatterns(config, lang); + // Guard 2: Strip code blocks and inline code before scanning for keywords + const cleaned = stripCodeBlocks(prompt); + const excluded = new Set(config.excludedWorkflows); + + // Guard 3: Load reinforcement suppression state + const kwState = loadKwState(projectDir); // Skip persistent workflows entirely if the prompt is an analytical question - const analytical = isAnalyticalQuestion(cleaned) + const analytical = isAnalyticalQuestion(cleaned); for (const [workflow, def] of Object.entries(config.workflows)) { - if (excluded.has(workflow)) continue + if (excluded.has(workflow)) continue; // Analytical questions should never trigger persistent workflows - if (analytical && def.persistent) continue + if (analytical && def.persistent) continue; - const patterns = buildPatterns(def.keywords, lang, config.cjkScripts) + const patterns = buildPatterns(def.keywords, lang, config.cjkScripts); for (const pattern of patterns) { - const match = pattern.exec(cleaned) - if (!match) continue - if (isInformationalContext(cleaned, match.index, infoPatterns)) continue + const match = pattern.exec(cleaned); + if (!match) continue; + if (isInformationalContext(cleaned, match.index, infoPatterns)) continue; // Keywords deep in long prompts are likely pasted content, not user intent - if (isPastedContent(match.index, def.persistent, cleaned.length)) continue + if (isPastedContent(match.index, def.persistent, cleaned.length)) + continue; + + // Guard 3: Suppress if same workflow triggered too many times in 60s + if (isReinforcementSuppressed(kwState, workflow)) continue; if (def.persistent) { - activateMode(projectDir, workflow, sessionId) + activateMode(projectDir, workflow, sessionId); } + // Record this trigger for reinforcement tracking + const updatedState = recordKwTrigger(kwState, workflow); + saveKwState(projectDir, updatedState); const contextLines = [ `[OMA WORKFLOW: ${workflow.toUpperCase()}]`, @@ -387,26 +574,29 @@ async function main() { `Read and follow \`.agents/workflows/${workflow}.md\` step by step.`, `User request: ${prompt}`, `IMPORTANT: Start the workflow IMMEDIATELY. 
Do not ask for confirmation.`, - ] + ]; if (config.extensionRouting) { - const extensions = detectExtensions(prompt) - const agent = resolveAgentFromExtensions(extensions, config.extensionRouting) + const extensions = detectExtensions(prompt); + const agent = resolveAgentFromExtensions( + extensions, + config.extensionRouting, + ); if (agent) { - contextLines.push(`[OMA AGENT HINT: ${agent}]`) + contextLines.push(`[OMA AGENT HINT: ${agent}]`); } } - const context = contextLines.join("\n") + const context = contextLines.join("\n"); - process.stdout.write(makePromptOutput(vendor, context)) - process.exit(0) + process.stdout.write(makePromptOutput(vendor, context)); + process.exit(0); } } - process.exit(0) + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.claude/hooks/persistent-mode.ts b/.claude/hooks/persistent-mode.ts index 4936f4e..311035a 100644 --- a/.claude/hooks/persistent-mode.ts +++ b/.claude/hooks/persistent-mode.ts @@ -13,125 +13,170 @@ * exit 2 = block stop */ -import { existsSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { isDeactivationRequest } from "./keyword-detector.ts" -import { type ModeState, makeBlockOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_REINFORCEMENTS = 5 -const STALE_HOURS = 2 +import { + existsSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { isDeactivationRequest } from "./keyword-detector.ts"; +import { + type ModeState, + makeBlockOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +const MAX_REINFORCEMENTS = 5; +const STALE_HOURS = 2; function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? 
"en"; } catch { - return "en" + return "en"; } } // ── Config Loading ──────────────────────────────────────────── interface TriggerConfig { - workflows: Record<string, { persistent: boolean }> + workflows: Record<string, { persistent: boolean }>; } function loadPersistentWorkflows(): string[] { - const configPath = join(dirname(import.meta.path), "triggers.json") + const configPath = join(dirname(import.meta.path), "triggers.json"); try { - const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")) + const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")); return Object.entries(config.workflows) .filter(([, def]) => def.persistent) - .map(([name]) => name) + .map(([name]) => name); } catch { - return ["ultrawork", "orchestrate", "work"] + return ["ultrawork", "orchestrate", "work"]; } } // ── Vendor Detection ────────────────────────────────────────── +function inferVendorFromScriptPath(): Vendor | null { + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; +} + function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "AfterAgent") return "gemini" - if (event === "Stop") { - if ("session_id" in input && !("sessionId" in input)) return "codex" - } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "AfterAgent") return "gemini"; + if (event === "Stop" && "session_id" in input) return "codex"; + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── State ───────────────────────────────────────────────────── function getStateDir(projectDir: string): string { - return join(projectDir, ".agents", "state") + return join(projectDir, ".agents", "state"); } -function readModeState(projectDir: string, workflow: string, sessionId: string): ModeState | null { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (!existsSync(path)) return null +function readModeState( + projectDir: string, + workflow: string, + sessionId: string, +): ModeState | 
null { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (!existsSync(path)) return null; try { - return JSON.parse(readFileSync(path, "utf-8")) as ModeState + return JSON.parse(readFileSync(path, "utf-8")) as ModeState; } catch { - return null + return null; } } export function isStale(state: ModeState): boolean { - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - return elapsed > STALE_HOURS * 60 * 60 * 1000 + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + return elapsed > STALE_HOURS * 60 * 60 * 1000; } -export function deactivate(projectDir: string, workflow: string, sessionId: string): void { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (existsSync(path)) unlinkSync(path) +export function deactivate( + projectDir: string, + workflow: string, + sessionId: string, +): void { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (existsSync(path)) unlinkSync(path); } -function incrementReinforcement(projectDir: string, workflow: string, sessionId: string, state: ModeState): void { - state.reinforcementCount += 1 - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) +function incrementReinforcement( + projectDir: string, + workflow: string, + sessionId: string, + state: ModeState, +): void { + state.reinforcementCount += 1; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const lang = detectLanguage(projectDir) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const lang = detectLanguage(projectDir); // Check all text fields in stdin for deactivation phrases. 
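// isStale against the 2h STALE_HOURS cutoff, as a sketch (imports assume
// this snippet sits beside persistent-mode.ts; ModeState is the shape
// activateMode writes):
import { isStale } from "./persistent-mode.ts";
import type { ModeState } from "./types.ts";

const fresh: ModeState = {
  workflow: "work",
  sessionId: "s1",
  activatedAt: new Date(Date.now() - 90 * 60_000).toISOString(), // 1.5h ago
  reinforcementCount: 0,
};
console.log(isStale(fresh)); // false: still inside the 2h window
const stale = { ...fresh, activatedAt: new Date(Date.now() - 3 * 3_600_000).toISOString() };
console.log(isStale(stale)); // true: the stop-hook deactivates instead of blocking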
// The assistant may have included "workflow done" in its response, @@ -144,60 +189,60 @@ async function main() { input.transcript, ] .filter((v): v is string => typeof v === "string") - .join(" ") + .join(" "); if (textToCheck && isDeactivationRequest(textToCheck, lang)) { // Deactivate all persistent workflows for this session - const stateDir = join(projectDir, ".agents", "state") + const stateDir = join(projectDir, ".agents", "state"); if (existsSync(stateDir)) { try { - const suffix = `-state-${sessionId}.json` + const suffix = `-state-${sessionId}.json`; for (const file of readdirSync(stateDir)) { if (file.endsWith(suffix)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { /* ignore */ } } - process.exit(0) + process.exit(0); } - const persistentWorkflows = loadPersistentWorkflows() + const persistentWorkflows = loadPersistentWorkflows(); for (const workflow of persistentWorkflows) { - const state = readModeState(projectDir, workflow, sessionId) - if (!state) continue + const state = readModeState(projectDir, workflow, sessionId); + if (!state) continue; if (isStale(state) || state.reinforcementCount >= MAX_REINFORCEMENTS) { - deactivate(projectDir, workflow, sessionId) - continue + deactivate(projectDir, workflow, sessionId); + continue; } - incrementReinforcement(projectDir, workflow, sessionId, state) + incrementReinforcement(projectDir, workflow, sessionId, state); - const stateFile = `.agents/state/${workflow}-state-${sessionId}.json` + const stateFile = `.agents/state/${workflow}-state-${sessionId}.json`; const reason = [ `[OMA PERSISTENT MODE: ${workflow.toUpperCase()}]`, `The /${workflow} workflow is still active (reinforcement ${state.reinforcementCount}/${MAX_REINFORCEMENTS}).`, `Continue executing the workflow. If all tasks are genuinely complete:`, ` 1. Delete the state file: Bash \`rm ${stateFile}\``, ` 2. Or ask the user to say "워크플로우 완료" / "workflow done"`, - ].join("\n") + ].join("\n"); - writeBlockAndExit(vendor, reason) + writeBlockAndExit(vendor, reason); } - process.exit(0) + process.exit(0); } export function writeBlockAndExit(vendor: Vendor, reason: string): never { - process.stderr.write(reason) - process.stdout.write(makeBlockOutput(vendor, reason)) - process.exit(2) + process.stderr.write(reason); + process.stdout.write(makeBlockOutput(vendor, reason)); + process.exit(2); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.claude/hooks/skill-injector.ts b/.claude/hooks/skill-injector.ts index beda327..9ccce70 100644 --- a/.claude/hooks/skill-injector.ts +++ b/.claude/hooks/skill-injector.ts @@ -12,152 +12,163 @@ * persistent workflow is active (those modes own the session context). 
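// Lifecycle of one persistent workflow under the stop-hook above, read
// straight off the code:
//   stops 1-5: reinforcementCount climbs 1..5, each stop blocked (exit 2)
//   stop 6: count >= MAX_REINFORCEMENTS (5), state file deleted, stop allowed
//   any stop more than STALE_HOURS (2h) after activation: deleted, no block
//   "workflow done" / "워크플로우 완료" in the checked text: immediate cleanup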
*/ -import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs" -import { basename, dirname, join } from "node:path" -import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_SKILLS = 3 -const SESSION_TTL_MS = 60 * 60 * 1000 -const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"] +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + writeFileSync, +} from "node:fs"; +import { basename, dirname, join } from "node:path"; +import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts"; + +const MAX_SKILLS = 3; +const SESSION_TTL_MS = 60 * 60 * 1000; +const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"]; // ── Vendor Detection ────────────────────────────────────────── function inferVendorFromScriptPath(): Vendor | null { - const path = import.meta.path - if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor" - if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen" - if (path.includes(`${join(".claude", "hooks")}`)) return "claude" - if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini" - if (path.includes(`${join(".codex", "hooks")}`)) return "codex" - return null + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; } function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - const byScriptPath = inferVendorFromScriptPath() - if (byScriptPath) return byScriptPath - if (event === "BeforeAgent") return "gemini" - if (event === "beforeSubmitPrompt") return "cursor" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeAgent") return "gemini"; + if (event === "beforeSubmitPrompt") return "cursor"; if (event === "UserPromptSubmit") { - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── Config Loading ──────────────────────────────────────────── interface 
SkillsTriggerConfig { - skills?: Record<string, { keywords: Record<string, string[]> }> - cjkScripts?: string[] + skills?: Record<string, { keywords: Record<string, string[]> }>; + cjkScripts?: string[]; } function loadTriggersConfig(): SkillsTriggerConfig { - const configPath = join(dirname(import.meta.path), "triggers.json") - if (!existsSync(configPath)) return {} + const configPath = join(dirname(import.meta.path), "triggers.json"); + if (!existsSync(configPath)) return {}; try { - return JSON.parse(readFileSync(configPath, "utf-8")) + return JSON.parse(readFileSync(configPath, "utf-8")); } catch { - return {} + return {}; } } function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? "en"; } catch { - return "en" + return "en"; } } // ── Pattern Building ────────────────────────────────────────── export function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } -export function buildTriggerPatterns(triggers: string[], lang: string, cjkScripts: string[]): RegExp[] { +export function buildTriggerPatterns( + triggers: string[], + lang: string, + cjkScripts: string[], +): RegExp[] { return triggers.map((kw) => { - const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+") - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) { - return new RegExp(escaped, "i") + const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+"); + if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) { + return new RegExp(escaped, "i"); } - return new RegExp(`\\b${escaped}\\b`, "i") - }) + return new RegExp(`\\b${escaped}\\b`, "i"); + }); } // ── Skill Discovery ─────────────────────────────────────────── export interface SkillEntry { - name: string - absolutePath: string - relPath: string + name: string; + absolutePath: string; + relPath: string; } export function discoverSkills(projectDir: string): SkillEntry[] { - const skillsDir = join(projectDir, ".agents", "skills") - if (!existsSync(skillsDir)) return [] + const skillsDir = join(projectDir, ".agents", "skills"); + if (!existsSync(skillsDir)) return []; - const out: SkillEntry[] = [] - let entries: ReturnType<typeof readdirSync> + const out: SkillEntry[] = []; + let entries: ReturnType<typeof readdirSync>; try { - entries = readdirSync(skillsDir, { withFileTypes: true }) + entries = readdirSync(skillsDir, { withFileTypes: true }); } catch { - return out + return out; } for (const entry of entries) { - if (!entry.isDirectory()) continue - if (entry.name.startsWith("_")) continue + if (!entry.isDirectory()) continue; + if (entry.name.startsWith("_")) continue; - const skillPath = join(skillsDir, entry.name, "SKILL.md") - if (!existsSync(skillPath)) continue + const skillPath = join(skillsDir, entry.name, "SKILL.md"); + if (!existsSync(skillPath)) continue; out.push({ name: entry.name, absolutePath: skillPath, relPath: join(".agents", "skills", entry.name, "SKILL.md"), - }) + }); } - return out + 
return out; } // ── Matching ────────────────────────────────────────────────── export interface SkillMatch { - name: string - relPath: string - score: number - matchedTriggers: string[] + name: string; + relPath: string; + score: number; + matchedTriggers: string[]; } export function matchSkills( @@ -166,37 +177,37 @@ export function matchSkills( skills: SkillEntry[], config: SkillsTriggerConfig, ): SkillMatch[] { - const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS - const matches: SkillMatch[] = [] + const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS; + const matches: SkillMatch[] = []; for (const skill of skills) { - const jsonEntry = config.skills?.[skill.name] - if (!jsonEntry) continue + const jsonEntry = config.skills?.[skill.name]; + if (!jsonEntry) continue; const jsonTriggers = [ ...(jsonEntry.keywords["*"] ?? []), ...(jsonEntry.keywords.en ?? []), ...(lang !== "en" ? (jsonEntry.keywords[lang] ?? []) : []), - ] + ]; - const seen = new Set<string>() - const allTriggers: string[] = [] + const seen = new Set<string>(); + const allTriggers: string[] = []; for (const t of jsonTriggers) { - const key = t.toLowerCase() - if (seen.has(key)) continue - seen.add(key) - allTriggers.push(t) + const key = t.toLowerCase(); + if (seen.has(key)) continue; + seen.add(key); + allTriggers.push(t); } - if (allTriggers.length === 0) continue + if (allTriggers.length === 0) continue; - const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts) - const matched: string[] = [] - let score = 0 + const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts); + const matched: string[] = []; + let score = 0; for (let i = 0; i < patterns.length; i++) { if (patterns[i].test(prompt)) { - matched.push(allTriggers[i]) - score += 10 + matched.push(allTriggers[i]); + score += 10; } } @@ -206,43 +217,45 @@ export function matchSkills( relPath: skill.relPath, score, matchedTriggers: matched, - }) + }); } } - matches.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.name.localeCompare(b.name))) - return matches.slice(0, MAX_SKILLS) + matches.sort((a, b) => + b.score !== a.score ? 
b.score - a.score : a.name.localeCompare(b.name), + ); + return matches.slice(0, MAX_SKILLS); } // ── Session Dedup State ─────────────────────────────────────── interface SessionState { - sessions: Record<string, { injected: string[]; timestamp: number }> + sessions: Record<string, { injected: string[]; timestamp: number }>; } function getStatePath(projectDir: string): string { - return join(projectDir, ".agents", "state", "skill-sessions.json") + return join(projectDir, ".agents", "state", "skill-sessions.json"); } function readState(projectDir: string): SessionState { - const p = getStatePath(projectDir) - if (!existsSync(p)) return { sessions: {} } + const p = getStatePath(projectDir); + if (!existsSync(p)) return { sessions: {} }; try { - const parsed = JSON.parse(readFileSync(p, "utf-8")) + const parsed = JSON.parse(readFileSync(p, "utf-8")); if (parsed && typeof parsed === "object" && parsed.sessions) { - return parsed as SessionState + return parsed as SessionState; } } catch { // corrupted — reset } - return { sessions: {} } + return { sessions: {} }; } function writeState(projectDir: string, state: SessionState): void { - const p = getStatePath(projectDir) + const p = getStatePath(projectDir); try { - mkdirSync(dirname(p), { recursive: true }) - writeFileSync(p, JSON.stringify(state, null, 2)) + mkdirSync(dirname(p), { recursive: true }); + writeFileSync(p, JSON.stringify(state, null, 2)); } catch { // dedup failing open is acceptable } @@ -254,47 +267,57 @@ export function filterFreshMatches( sessionId: string, now: number = Date.now(), ): { fresh: SkillMatch[]; nextState: SessionState } { - const state = readState(projectDir) + const state = readState(projectDir); for (const [id, sess] of Object.entries(state.sessions)) { if (now - sess.timestamp > SESSION_TTL_MS) { - delete state.sessions[id] + delete state.sessions[id]; } } - const current = state.sessions[sessionId] - const alreadyInjected = new Set(current && now - current.timestamp <= SESSION_TTL_MS ? current.injected : []) + const current = state.sessions[sessionId]; + const alreadyInjected = new Set( + current && now - current.timestamp <= SESSION_TTL_MS + ? current.injected + : [], + ); - const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)) + const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)); if (fresh.length > 0) { - const existing = state.sessions[sessionId]?.injected ?? [] + const existing = state.sessions[sessionId]?.injected ?? 
[]; state.sessions[sessionId] = { injected: [...new Set([...existing, ...fresh.map((m) => m.relPath)])], timestamp: now, - } + }; } - return { fresh, nextState: state } + return { fresh, nextState: state }; } // ── Workflow Guard ──────────────────────────────────────────── -export function isPersistentWorkflowActive(projectDir: string, sessionId: string): boolean { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return false +export function isPersistentWorkflowActive( + projectDir: string, + sessionId: string, +): boolean { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return false; try { - const files = readdirSync(stateDir) - return files.some((f) => f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json") + const files = readdirSync(stateDir); + return files.some( + (f) => + f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json", + ); } catch { - return false + return false; } } // ── Prompt Sanitation ───────────────────────────────────────── export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } export function stripCodeBlocks(text: string): string { @@ -303,7 +326,7 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") .replace(/`{3,}[^`]*`{3,}/g, "") .replace(/`[^`\n]+`/g, "") - .replace(/"[^"\n]*"/g, "") + .replace(/"[^"\n]*"/g, ""); } // ── Context Formatting ──────────────────────────────────────── @@ -313,55 +336,61 @@ export function formatContext(matches: SkillMatch[]): string { `[OMA SKILLS DETECTED: ${matches.map((m) => m.name).join(", ")}]`, "User intent matches the following skills:", "", - ] + ]; for (const m of matches) { - lines.push(`- **${m.name}** — \`${m.relPath}\``) - lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`) + lines.push(`- **${m.name}** — \`${m.relPath}\``); + lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`); } - lines.push("") - lines.push("Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.") - return lines.join("\n") + lines.push(""); + lines.push( + "Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.", + ); + return lines.join("\n"); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? 
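// filterFreshMatches contract, summarized from the code above: it returns
// only the matches not already injected for this session within
// SESSION_TTL_MS (1h), plus the nextState the caller must persist; the
// dedup ledger advances only when main() goes on to call writeState, and
// a failed write simply lets dedup fail open.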
"" - - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) - if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0) - - const lang = detectLanguage(projectDir) - const config = loadTriggersConfig() - const cleaned = stripCodeBlocks(prompt) - const skills = discoverSkills(projectDir) - - const matches = matchSkills(cleaned, lang, skills, config) - if (matches.length === 0) process.exit(0) - - const { fresh, nextState } = filterFreshMatches(matches, projectDir, sessionId) - if (fresh.length === 0) process.exit(0) - - writeState(projectDir, nextState) - process.stdout.write(makePromptOutput(vendor, formatContext(fresh))) - process.exit(0) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? ""; + + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); + if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0); + + const lang = detectLanguage(projectDir); + const config = loadTriggersConfig(); + const cleaned = stripCodeBlocks(prompt); + const skills = discoverSkills(projectDir); + + const matches = matchSkills(cleaned, lang, skills, config); + if (matches.length === 0) process.exit(0); + + const { fresh, nextState } = filterFreshMatches( + matches, + projectDir, + sessionId, + ); + if (fresh.length === 0) process.exit(0); + + writeState(projectDir, nextState); + process.stdout.write(makePromptOutput(vendor, formatContext(fresh))); + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } // Avoid unused-import lint for basename when testing subsets of this module. 
-void basename +void basename; diff --git a/.claude/hooks/test-filter.ts b/.claude/hooks/test-filter.ts index a0ce2fc..a3ad992 100644 --- a/.claude/hooks/test-filter.ts +++ b/.claude/hooks/test-filter.ts @@ -1,51 +1,61 @@ // PreToolUse hook — Filter test output to show only failures // Works with: Claude Code, Codex CLI, Gemini CLI, Qwen Code -import { existsSync } from "node:fs" -import { join } from "node:path" -import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts"; // --- Vendor detection (same logic as keyword-detector.ts) --- +function inferVendorFromScriptPath(): Vendor | null { + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; +} + function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "BeforeTool") return "gemini" - if (event === "PreToolUse") { - if ("session_id" in input && !("sessionId" in input)) return "codex" - } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeTool") return "gemini"; + if (event === "PreToolUse" && "session_id" in input && !("sessionId" in input)) return "codex"; + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getHookDir(vendor: Vendor): string { switch (vendor) { case "codex": - return ".codex/hooks" + return ".codex/hooks"; case "gemini": - return ".gemini/hooks" + return ".gemini/hooks"; case "qwen": - return ".qwen/hooks" + return ".qwen/hooks"; default: - return ".claude/hooks" + return ".claude/hooks"; } } @@ -78,66 +88,70 @@ const TEST_PATTERNS = [ /\brspec\b/, /\bmix\s+test\b/, /\bphpunit\b/, -] +]; // Commands that mention test runners but aren't running tests const EXCLUDE_PATTERNS = [ /\b(install|add|remove|uninstall|init)\b/, /\b(cat|head|tail|less|more|wc)\b.*\.(test|spec)\./, -] +]; // --- Hook input --- interface PreToolUseInput { - tool_name: string + tool_name: string; tool_input: { - command?: string - [key: string]: unknown - } - hook_event_name?: string - session_id?: string - sessionId?: string - cwd?: string + command?: string; + [key: string]: unknown; + }; + hook_event_name?: string; + session_id?: string; + sessionId?:
string; + cwd?: string; } // --- Main --- -const raw = await Bun.stdin.text() -if (!raw.trim()) process.exit(0) +const raw = await Bun.stdin.text(); +if (!raw.trim()) process.exit(0); -const input: PreToolUseInput = JSON.parse(raw) +const input: PreToolUseInput = JSON.parse(raw); // Gemini uses run_shell_command; Claude-family uses Bash. if (input.tool_name !== "Bash" && input.tool_name !== "run_shell_command") { - process.exit(0) + process.exit(0); } -const command = input.tool_input?.command -if (!command) process.exit(0) +const command = input.tool_input?.command; +if (!command) process.exit(0); // Check if this is a test command -const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)) -if (!isTestCommand) process.exit(0) +const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)); +if (!isTestCommand) process.exit(0); // Skip if it's a non-test use of test tool names (install, cat, etc.) -const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)) -if (isExcluded) process.exit(0) +const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)); +if (isExcluded) process.exit(0); // Detect vendor and resolve project dir -const vendor = detectVendor(input) -const projectDir = getProjectDir(vendor, input) -const filterScript = join(projectDir, getHookDir(vendor), "filter-test-output.sh") +const vendor = detectVendor(input); +const projectDir = getProjectDir(vendor, input); +const filterScript = join( + projectDir, + getHookDir(vendor), + "filter-test-output.sh", +); // Skip filtering if the script doesn't exist (hooks not fully installed) -if (!existsSync(filterScript)) process.exit(0) +if (!existsSync(filterScript)) process.exit(0); // Rewrite command to pipe through filter -const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"` +const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"`; // Return updated input with all original fields preserved const updatedInput: Record<string, unknown> = { ...input.tool_input, command: filteredCmd, -} +}; -console.log(makePreToolOutput(vendor, updatedInput)) +console.log(makePreToolOutput(vendor, updatedInput)); diff --git a/.claude/hooks/triggers.json b/.claude/hooks/triggers.json index f404583..0a1513f 100644 --- a/.claude/hooks/triggers.json +++ b/.claude/hooks/triggers.json @@ -43,9 +43,35 @@ "全部お願い", "まとめてやって" ], - "zh": ["编排", "并行执行", "自动执行", "全部执行", "全部做", "自动处理", "一起做", "全做了", "帮我全做"], - "es": ["orquestar", "paralelo", "ejecutar todo", "hazlo todo", "ejecuta todo", "automatiza", "haz todo"], - "fr": ["orchestrer", "parallèle", "tout exécuter", "fais tout", "exécute tout", "automatise", "gère tout"], + "zh": [ + "编排", + "并行执行", + "自动执行", + "全部执行", + "全部做", + "自动处理", + "一起做", + "全做了", + "帮我全做" + ], + "es": [ + "orquestar", + "paralelo", + "ejecutar todo", + "hazlo todo", + "ejecuta todo", + "automatiza", + "haz todo" + ], + "fr": [ + "orchestrer", + "parallèle", + "tout exécuter", + "fais tout", + "exécute tout", + "automatise", + "gère tout" + ], "de": [ "orchestrieren", "parallel", @@ -55,7 +81,15 @@ "automatisieren", "alles auf einmal" ], - "pt": ["orquestrar", "paralelo", "executar tudo", "faça tudo", "execute tudo", "automatize", "resolva tudo"], + "pt": [ + "orquestrar", + "paralelo", + "executar tudo", + "faça tudo", + "execute tudo", + "automatize", + "resolva tudo" + ], "ru": [ "оркестровать", "параллельно", @@ -128,7 +162,16 @@ "トレードオフ", "品質特性" ], - "zh": ["架构", "系统设计", "软件设计", "架构评审", "模块边界", "服务边界", "权衡分析", "质量属性"], + "zh": [ + "架构", + "系统设计", + "软件设计", + "架构评审", 
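// What the PreToolUse rewrite above does to a matched test command, traced
// with rspec (one of the visible TEST_PATTERNS); the resolved path shown
// uses the Claude hook dir from getHookDir:
//   in : { tool_name: "Bash", tool_input: { command: "bundle exec rspec" } }
//   out: set -o pipefail; (bundle exec rspec) 2>&1 | bash "<projectDir>/.claude/hooks/filter-test-output.sh"
// pipefail keeps the runner's nonzero exit status visible through the pipe,
// while EXCLUDE_PATTERNS skips non-test uses such as "gem install rspec".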
+ "模块边界", + "服务边界", + "权衡分析", + "质量属性" + ], "es": [ "arquitectura", "diseño de sistemas", @@ -205,7 +248,15 @@ "persistent": false, "keywords": { "*": ["task breakdown"], - "en": ["plan", "make a plan", "create a plan", "break down", "analyze requirements", "plan this", "decompose"], + "en": [ + "plan", + "make a plan", + "create a plan", + "break down", + "analyze requirements", + "plan this", + "decompose" + ], "ko": [ "계획", "요구사항 분석", @@ -235,7 +286,16 @@ "設計して", "プランを作って" ], - "zh": ["计划", "需求分析", "任务分解", "制定计划", "做个计划", "分析一下", "拆分任务", "规划一下"], + "zh": [ + "计划", + "需求分析", + "任务分解", + "制定计划", + "做个计划", + "分析一下", + "拆分任务", + "规划一下" + ], "es": [ "plan", "planificar", @@ -286,7 +346,15 @@ "разбей на задачи", "спланируй" ], - "nl": ["plan", "plannen", "vereistenanalyse", "maak een plan", "analyseer", "splits op", "plan dit"], + "nl": [ + "plan", + "plannen", + "vereistenanalyse", + "maak een plan", + "analyseer", + "splits op", + "plan dit" + ], "pl": [ "plan", "planować", @@ -303,7 +371,15 @@ "persistent": false, "keywords": { "*": ["code review", "security audit", "security review"], - "en": ["review", "review this", "review my code", "check my code", "audit", "inspect", "code check"], + "en": [ + "review", + "review this", + "review my code", + "check my code", + "audit", + "inspect", + "code check" + ], "ko": [ "리뷰", "코드 검토", @@ -330,7 +406,17 @@ "点検して", "コード確認" ], - "zh": ["审查", "代码审查", "安全审计", "审查一下", "检查一下", "看看代码", "检查代码", "代码检查", "安全检查"], + "zh": [ + "审查", + "代码审查", + "安全审计", + "审查一下", + "检查一下", + "看看代码", + "检查代码", + "代码检查", + "安全检查" + ], "es": [ "revisión", "revisar código", @@ -613,7 +699,17 @@ "アイデアちょうだい", "一緒に考えよう" ], - "zh": ["头脑风暴", "创意", "设计探索", "想想", "出主意", "有什么想法", "想个办法", "出点子", "集思广益"], + "zh": [ + "头脑风暴", + "创意", + "设计探索", + "想想", + "出主意", + "有什么想法", + "想个办法", + "出点子", + "集思广益" + ], "es": [ "lluvia de ideas", "idear", @@ -644,7 +740,16 @@ "vorschläge", "lass uns überlegen" ], - "pt": ["brainstorming", "idear", "explorar design", "pense em", "e se", "ideias para", "sugira", "imagine"], + "pt": [ + "brainstorming", + "idear", + "explorar design", + "pense em", + "e se", + "ideias para", + "sugira", + "imagine" + ], "ru": [ "мозговой штурм", "идеи", @@ -681,7 +786,13 @@ "persistent": true, "keywords": { "*": ["work", "step by step"], - "en": ["one by one", "guide me", "walk me through", "manual mode", "one step at a time"], + "en": [ + "one by one", + "guide me", + "walk me through", + "manual mode", + "one step at a time" + ], "ko": [ "단계별", "단계별로", @@ -693,9 +804,32 @@ "차근차근 해줘", "수동으로 해줘" ], - "ja": ["ステップバイステップ", "一歩ずつ", "ガイドして", "手動で", "一つずつ", "順番にやって", "手順を教えて"], - "zh": ["逐步", "一步一步", "指导我", "手动", "一个一个", "按顺序", "带我做"], - "es": ["paso a paso", "guíame", "uno por uno", "modo manual", "de a uno", "llévame paso a paso"], + "ja": [ + "ステップバイステップ", + "一歩ずつ", + "ガイドして", + "手動で", + "一つずつ", + "順番にやって", + "手順を教えて" + ], + "zh": [ + "逐步", + "一步一步", + "指导我", + "手动", + "一个一个", + "按顺序", + "带我做" + ], + "es": [ + "paso a paso", + "guíame", + "uno por uno", + "modo manual", + "de a uno", + "llévame paso a paso" + ], "fr": [ "étape par étape", "guide-moi", @@ -712,8 +846,22 @@ "zeig mir wie", "der reihe nach" ], - "pt": ["passo a passo", "me guie", "um por um", "modo manual", "me acompanhe", "me mostre passo a passo"], - "ru": ["шаг за шагом", "направь меня", "по одному", "ручной режим", "покажи по шагам", "веди меня"], + "pt": [ + "passo a passo", + "me guie", + "um por um", + "modo manual", + "me acompanhe", + "me mostre passo a passo" + ], + "ru": [ + "шаг за шагом", + "направь 
меня", + "по одному", + "ручной режим", + "покажи по шагам", + "веди меня" + ], "nl": [ "stap voor stap", "begeleid me", @@ -736,7 +884,14 @@ "persistent": false, "keywords": { "*": ["deepinit"], - "en": ["init project", "initialize", "setup project", "new project", "scaffold", "bootstrap"], + "en": [ + "init project", + "initialize", + "setup project", + "new project", + "scaffold", + "bootstrap" + ], "ko": [ "프로젝트 초기화", "코드베이스 초기화", @@ -757,7 +912,15 @@ "プロジェクトを作って", "プロジェクト設定" ], - "zh": ["项目初始化", "新项目", "设置项目", "搭建项目", "初始化", "创建项目", "项目配置"], + "zh": [ + "项目初始化", + "新项目", + "设置项目", + "搭建项目", + "初始化", + "创建项目", + "项目配置" + ], "es": [ "inicializar proyecto", "nuevo proyecto", @@ -1381,8 +1544,20 @@ "define boundaries", "architecture tradeoffs" ], - "ko": ["아키텍처 짜줘", "시스템 구조 설계", "경계 정의해줘", "구조 검토해줘", "아키텍처 문서"], - "ja": ["アーキテクチャを設計", "システム構成を考えて", "境界を定義", "構成レビュー", "アーキ文書"], + "ko": [ + "아키텍처 짜줘", + "시스템 구조 설계", + "경계 정의해줘", + "구조 검토해줘", + "아키텍처 문서" + ], + "ja": [ + "アーキテクチャを設計", + "システム構成を考えて", + "境界を定義", + "構成レビュー", + "アーキ文書" + ], "zh": ["设计架构", "系统架构方案", "定义边界", "架构文档", "架构权衡"] } }, @@ -1403,17 +1578,45 @@ "server implementation", "clean architecture" ], - "ko": ["api 만들어줘", "엔드포인트 추가", "백엔드 구현", "마이그레이션 작성", "인증 붙여줘"], - "ja": ["apiを作って", "エンドポイント追加", "バックエンド実装", "マイグレーション書いて", "認証を実装"], + "ko": [ + "api 만들어줘", + "엔드포인트 추가", + "백엔드 구현", + "마이그레이션 작성", + "인증 붙여줘" + ], + "ja": [ + "apiを作って", + "エンドポイント追加", + "バックエンド実装", + "マイグレーション書いて", + "認証を実装" + ], "zh": ["写个接口", "加接口", "后端实现", "写迁移", "加认证"] } }, "oma-brainstorm": { "keywords": { "*": [], - "en": ["toss around ideas", "kick around options", "spitball", "some ideas please", "ideation session"], - "ko": ["아이디어 좀 뽑아줘", "같이 고민해줘", "아이디어 내보자", "방향성 고민"], - "ja": ["アイデア出して", "一緒に考えて", "方向性を探りたい", "案を出して"], + "en": [ + "toss around ideas", + "kick around options", + "spitball", + "some ideas please", + "ideation session" + ], + "ko": [ + "아이디어 좀 뽑아줘", + "같이 고민해줘", + "아이디어 내보자", + "방향성 고민" + ], + "ja": [ + "アイデア出して", + "一緒に考えて", + "方向性を探りたい", + "案を出して" + ], "zh": ["帮我想想", "一起想想办法", "给点灵感"] } }, @@ -1430,8 +1633,18 @@ "cli handoff", "manual orchestration" ], - "ko": ["에이전트 조율", "에이전트끼리 협업", "수동으로 에이전트 돌려", "에이전트 순서 잡아줘"], - "ja": ["エージェントを調整", "エージェント連携", "手動でエージェント", "エージェントの順序"], + "ko": [ + "에이전트 조율", + "에이전트끼리 협업", + "수동으로 에이전트 돌려", + "에이전트 순서 잡아줘" + ], + "ja": [ + "エージェントを調整", + "エージェント連携", + "手動でエージェント", + "エージェントの順序" + ], "zh": ["协调代理", "代理之间协作", "手动跑代理", "代理之间衔接"] } }, @@ -1454,8 +1667,20 @@ "data migration", "capacity planning" ], - "ko": ["스키마 설계", "테이블 설계", "인덱스 튜닝", "쿼리 느려", "용량 산정"], - "ja": ["スキーマ設計", "テーブル設計", "インデックス調整", "クエリが遅い", "容量見積"], + "ko": [ + "스키마 설계", + "테이블 설계", + "인덱스 튜닝", + "쿼리 느려", + "용량 산정" + ], + "ja": [ + "スキーマ設計", + "テーブル設計", + "インデックス調整", + "クエリが遅い", + "容量見積" + ], "zh": ["设计表结构", "表设计", "索引优化", "查询很慢", "容量评估"] } }, @@ -1474,8 +1699,20 @@ "crash fix", "error investigation" ], - "ko": ["버그 찾아줘", "에러 원인", "크래시 분석", "스택트레이스 봐줘", "원인 파악해줘"], - "ja": ["バグを探して", "エラー原因", "クラッシュを分析", "スタックトレースを見て", "原因を特定"], + "ko": [ + "버그 찾아줘", + "에러 원인", + "크래시 분석", + "스택트레이스 봐줘", + "원인 파악해줘" + ], + "ja": [ + "バグを探して", + "エラー原因", + "クラッシュを分析", + "スタックトレースを見て", + "原因を特定" + ], "zh": ["找出 bug", "错误原因", "分析崩溃", "看堆栈", "定位原因"] } }, @@ -1493,8 +1730,19 @@ "responsive layout", "motion design" ], - "ko": ["디자인 토큰", "랜딩 만들어줘", "컬러 팔레트 잡아줘", "타이포 스케일", "모션 가이드"], - "ja": ["デザイントークン", "ランディング作成", "カラーパレット決めて", "モーション設計"], + "ko": [ + "디자인 토큰", + "랜딩 만들어줘", + "컬러 팔레트 잡아줘", + "타이포 스케일", + "모션 가이드" + ], + "ja": [ + "デザイントークン", + "ランディング作成", 
+ "カラーパレット決めて", + "モーション設計" + ], "zh": ["设计令牌", "做个落地页", "定配色", "字体层级", "动效规范"] } }, @@ -1514,9 +1762,27 @@ "release automation", "build automation" ], - "ko": ["mise 태스크", "ci 파이프라인", "릴리즈 자동화", "깃 훅 설정", "모노레포 워크플로우"], - "ja": ["miseタスク", "ciパイプライン", "リリース自動化", "gitフック", "モノレポ作業"], - "zh": ["mise 任务", "ci 流水线", "发布自动化", "git 钩子", "monorepo 工作流"] + "ko": [ + "mise 태스크", + "ci 파이프라인", + "릴리즈 자동화", + "깃 훅 설정", + "모노레포 워크플로우" + ], + "ja": [ + "miseタスク", + "ciパイプライン", + "リリース自動化", + "gitフック", + "モノレポ作業" + ], + "zh": [ + "mise 任务", + "ci 流水线", + "发布自动化", + "git 钩子", + "monorepo 工作流" + ] } }, "oma-frontend": { @@ -1534,9 +1800,27 @@ "frontend ui", "FSD architecture" ], - "ko": ["리액트 컴포넌트", "넥스트 페이지", "tailwind로 스타일", "shadcn 붙여줘", "프론트 구현"], - "ja": ["reactコンポーネント", "nextページ", "tailwindで装飾", "shadcn導入", "フロント実装"], - "zh": ["写个 react 组件", "next 页面", "用 tailwind", "接入 shadcn", "前端实现"] + "ko": [ + "리액트 컴포넌트", + "넥스트 페이지", + "tailwind로 스타일", + "shadcn 붙여줘", + "프론트 구현" + ], + "ja": [ + "reactコンポーネント", + "nextページ", + "tailwindで装飾", + "shadcn導入", + "フロント実装" + ], + "zh": [ + "写个 react 组件", + "next 页面", + "用 tailwind", + "接入 shadcn", + "前端实现" + ] } }, "oma-hwp": { @@ -1551,7 +1835,16 @@ "hangul word processor", "hwp ingestion" ], - "ko": ["한글 파일", "한글 변환", "한글 파싱", "hwp 변환", "hwp 파싱", "hwp 마크다운", "hwpx 변환", "hwpx 파싱"], + "ko": [ + "한글 파일", + "한글 변환", + "한글 파싱", + "hwp 변환", + "hwp 파싱", + "hwp 마크다운", + "hwpx 변환", + "hwpx 파싱" + ], "ja": ["hwp変換", "hwpをマークダウン", "hwpを解析", "韓国語ワープロ"], "zh": ["hwp 转换", "hwp 解析", "hwp 转 markdown", "韩文文档"] } @@ -1571,9 +1864,233 @@ "mobile app", "android ios" ], - "ko": ["플러터 화면", "리액트 네이티브 화면", "다트 위젯", "안드로이드 아이폰 앱", "모바일 앱"], - "ja": ["flutter画面", "react native画面", "dartウィジェット", "iosアンドロイド", "モバイルアプリ"], - "zh": ["flutter 页面", "react native 页面", "dart 组件", "安卓 ios", "移动端应用"] + "ko": [ + "플러터 화면", + "리액트 네이티브 화면", + "다트 위젯", + "안드로이드 아이폰 앱", + "모바일 앱" + ], + "ja": [ + "flutter画面", + "react native画面", + "dartウィジェット", + "iosアンドロイド", + "モバイルアプリ" + ], + "zh": [ + "flutter 页面", + "react native 页面", + "dart 组件", + "安卓 ios", + "移动端应用" + ] + } + }, + "oma-observability": { + "keywords": { + "*": [ + "OpenTelemetry", + "OTel", + "OTLP", + "W3C Trace Context", + "traceparent", + "MELT", + "APM", + "RUM", + "SLO", + "SLI", + "burn-rate", + "PromQL", + "Prometheus", + "Grafana", + "Jaeger", + "Tempo", + "Loki", + "Mimir", + "Fluent Bit", + "OpenCost", + "OpenFeature", + "Flagger", + "Falco", + "Parca", + "Pyroscope", + "Honeycomb", + "Datadog", + "Sentry", + "Crashlytics", + "Core Web Vitals" + ], + "en": [ + "observability", + "traceability", + "telemetry", + "distributed tracing", + "instrument my service", + "set up OTel", + "OTel pipeline", + "collector topology", + "tail sampling", + "cardinality budget", + "clock skew", + "error budget", + "burn rate alert", + "canary analysis", + "progressive delivery", + "feature flag observability", + "incident forensics", + "6-dimension localization", + "root cause across services", + "multi-tenant telemetry", + "per-tenant sampling", + "data residency telemetry", + "redact PII in logs", + "observability as code", + "dashboard as code", + "PrometheusRule CRD", + "Grafana Jsonnet", + "Perses dashboard", + "UDP MTU telemetry", + "StatsD fragmentation", + "OTLP gRPC vs HTTP", + "propagator matrix", + "BGP observability", + "QUIC observability", + "eBPF observability", + "service mesh tracing", + "zero code instrumentation", + "mobile crash analytics", + "crash-free rate", + "symbolication pipeline", + "offline telemetry queue" + ], + "ko": [ + 
"관측성", + "관측 가능성", + "추적성", + "추적 가능성", + "텔레메트리", + "텔레메트리 수집", + "분산 트레이싱", + "OTel 도입", + "OTel 셋업", + "OTel 계측", + "OTel 파이프라인", + "컬렉터 토폴로지", + "테일 샘플링", + "카디널리티", + "카디널리티 관리", + "클록 스큐", + "시계 드리프트", + "에러 버짓", + "에러 예산", + "번레이트 알람", + "번레이트", + "카나리 분석", + "프로그레시브 딜리버리", + "점진 배포", + "피처 플래그 관측", + "사건 부검", + "장애 부검", + "장애 원인 분석", + "6차원 좁히기", + "멀티테넌트 관측", + "테넌트별 샘플링", + "데이터 거주 관측", + "로그 PII 제거", + "로그 익명화", + "로그 가명화", + "관측성 as code", + "대시보드 as code", + "대시보드 코드화", + "PrometheusRule", + "Grafana Jsonnet", + "Perses 대시보드", + "UDP MTU 튜닝", + "StatsD 단편화", + "OTLP gRPC 선택", + "전파자 매핑", + "BGP 관측", + "QUIC 관측", + "eBPF 관측", + "서비스 메시 트레이싱", + "zero-code 계측", + "모바일 크래시 분석", + "크래시 프리 레이트", + "심볼리케이션", + "오프라인 텔레메트리 큐" + ], + "ja": [ + "オブザーバビリティ", + "トレーサビリティ", + "テレメトリ", + "分散トレーシング", + "OTel導入", + "OTelパイプライン", + "コレクタ構成", + "テイルサンプリング", + "カーディナリティ予算", + "クロックスキュー", + "エラーバジェット", + "バーンレートアラート", + "カナリア分析", + "プログレッシブデリバリ", + "機能フラグ観測", + "インシデントフォレンジック", + "マルチテナント観測", + "データ居住性観測", + "ログPII除去", + "Observability as Code", + "Dashboard as Code", + "UDP MTUチューニング", + "StatsDフラグメンテーション", + "OTLP選択", + "プロパゲータマッピング", + "BGP観測", + "QUIC観測", + "eBPF観測", + "サービスメッシュトレース", + "モバイルクラッシュ分析", + "クラッシュフリーレート", + "シンボリケーション", + "オフラインテレメトリ" + ], + "zh": [ + "可观测性", + "可追溯性", + "遥测", + "分布式追踪", + "OTel 接入", + "OTel 流水线", + "采集器拓扑", + "尾采样", + "基数预算", + "时钟漂移", + "错误预算", + "燃烧率告警", + "金丝雀分析", + "渐进式发布", + "特性开关观测", + "事件取证", + "多租户观测", + "数据驻留观测", + "日志脱敏", + "可观测性即代码", + "仪表盘即代码", + "UDP MTU 调优", + "StatsD 分片", + "OTLP 选择", + "传播器映射", + "BGP 观测", + "QUIC 观测", + "eBPF 观测", + "服务网格追踪", + "零代码探针", + "移动崩溃分析", + "崩溃无事率", + "符号化", + "离线遥测队列" + ] } }, "oma-orchestrator": { @@ -1590,8 +2107,18 @@ "review loop", "mcp memory coordination" ], - "ko": ["에이전트 병렬 실행", "동시에 에이전트 돌려", "fan-out", "리뷰 루프 돌려"], - "ja": ["エージェント並列実行", "同時にエージェント", "fan-out", "レビューループ"], + "ko": [ + "에이전트 병렬 실행", + "동시에 에이전트 돌려", + "fan-out", + "리뷰 루프 돌려" + ], + "ja": [ + "エージェント並列実行", + "同時にエージェント", + "fan-out", + "レビューループ" + ], "zh": ["并行跑代理", "同时派发代理", "fan-out 任务", "评审循环"] } }, @@ -1628,8 +2155,20 @@ "scope definition", "prioritization matrix" ], - "ko": ["요구사항 정리", "스펙 문서", "우선순위 매겨줘", "스코프 정의", "제품 로드맵"], - "ja": ["要件を整理", "スペック作成", "優先度付け", "スコープ定義", "プロダクトロードマップ"], + "ko": [ + "요구사항 정리", + "스펙 문서", + "우선순위 매겨줘", + "스코프 정의", + "제품 로드맵" + ], + "ja": [ + "要件を整理", + "スペック作成", + "優先度付け", + "スコープ定義", + "プロダクトロードマップ" + ], "zh": ["梳理需求", "写规格书", "排优先级", "界定范围", "产品路线图"] } }, @@ -1647,7 +2186,12 @@ "test coverage" ], "ko": ["접근성 점검", "성능 점검", "커버리지 확인", "품질 게이트"], - "ja": ["アクセシビリティ確認", "パフォーマンス点検", "カバレッジ確認", "品質ゲート"], + "ja": [ + "アクセシビリティ確認", + "パフォーマンス点検", + "カバレッジ確認", + "品質ゲート" + ], "zh": ["无障碍检查", "性能检查", "覆盖率报告", "质量门禁"] } }, @@ -1666,8 +2210,20 @@ "transcript analysis", "multi tool recap" ], - "ko": ["오늘 한 일 정리", "하루 요약", "주간 요약", "작업 내용 정리", "대화 요약"], - "ja": ["今日の作業まとめ", "日次サマリ", "週次サマリ", "作業振り返り", "会話まとめ"], + "ko": [ + "오늘 한 일 정리", + "하루 요약", + "주간 요약", + "작업 내용 정리", + "대화 요약" + ], + "ja": [ + "今日の作業まとめ", + "日次サマリ", + "週次サマリ", + "作業振り返り", + "会話まとめ" + ], "zh": ["今天做了什么", "日报总结", "周报总结", "工作回顾", "对话总结"] } }, @@ -1685,7 +2241,12 @@ "git worktree" ], "ko": ["머지 충돌 해결", "리베이스해줘", "워크트리 써줘"], - "ja": ["マージ衝突解決", "リベースして", "リリースタグ", "worktree使って"], + "ja": [ + "マージ衝突解決", + "リベースして", + "リリースタグ", + "worktree使って" + ], "zh": ["解决合并冲突", "帮我 rebase", "打发布标签", "用 worktree"] } }, @@ -1705,8 +2266,20 @@ "library reference", "context7 docs" ], - "ko": ["검색해줘", "찾아줘", "레퍼런스 찾아", "문서 찾아줘", "라이브러리 찾아줘"], - "ja": ["検索して", "調べて", 
"ドキュメント探して", "ライブラリ調べて", "リファレンス探して"], + "ko": [ + "검색해줘", + "찾아줘", + "레퍼런스 찾아", + "문서 찾아줘", + "라이브러리 찾아줘" + ], + "ja": [ + "検索して", + "調べて", + "ドキュメント探して", + "ライブラリ調べて", + "リファレンス探して" + ], "zh": ["帮我查", "搜一下", "找找文档", "找个库", "查参考资料"] } }, @@ -1725,9 +2298,27 @@ "oidc setup", "cost optimization" ], - "ko": ["테라폼 플랜", "인프라 프로비저닝", "iac 모듈", "클라우드 리소스", "비용 최적화"], - "ja": ["terraformプラン", "インフラ構築", "iacモジュール", "クラウドリソース", "コスト最適化"], - "zh": ["terraform plan", "搭建基础设施", "iac 模块", "云资源", "成本优化"] + "ko": [ + "테라폼 플랜", + "인프라 프로비저닝", + "iac 모듈", + "클라우드 리소스", + "비용 최적화" + ], + "ja": [ + "terraformプラン", + "インフラ構築", + "iacモジュール", + "クラウドリソース", + "コスト最適化" + ], + "zh": [ + "terraform plan", + "搭建基础设施", + "iac 模块", + "云资源", + "成本优化" + ] } }, "oma-translator": { @@ -1744,10 +2335,104 @@ "multilingual content", "arb translation" ], - "ko": ["번역해줘", "번역 부탁", "다국어로", "영어로 바꿔줘", "현지화해줘"], + "ko": [ + "번역해줘", + "번역 부탁", + "다국어로", + "영어로 바꿔줘", + "현지화해줘" + ], "ja": ["翻訳して", "英訳", "多言語化", "ローカライズして", "訳して"], "zh": ["翻译一下", "帮我翻译", "多语言", "本地化", "翻成英文"] } + }, + "oma-image": { + "keywords": { + "*": [ + "nano-banana", + "nanobanana", + "gpt-image", + "pollinations", + "oma-image" + ], + "en": [ + "generate image", + "generate an image", + "create image", + "create an image", + "make a picture", + "make an image", + "render image", + "render a picture", + "draw me", + "draw a", + "ai image", + "image generation", + "generate a photo", + "create picture", + "picture of", + "image of" + ], + "ko": [ + "이미지 만들어", + "이미지 만들어줘", + "이미지 생성", + "이미지 생성해", + "이미지 생성해줘", + "사진 만들어", + "사진 만들어줘", + "그림 그려", + "그림 그려줘", + "이미지 뽑아", + "이미지 뽑아줘", + "이미지 그려줘", + "이미지 출력", + "나노바나나", + "나노 바나나", + "바나나로 뽑", + "이미지 생성기", + "ai 이미지" + ], + "ja": [ + "画像を生成", + "画像生成", + "画像を作", + "画像を作成", + "絵を描いて", + "画像出力", + "イラストを生成", + "写真を生成" + ], + "zh": [ + "生成图像", + "生成图片", + "生成一张", + "画一张", + "画一幅", + "帮我画", + "出图", + "图像生成", + "图片生成" + ], + "es": [ + "generar imagen", + "crear imagen", + "hazme una imagen", + "genera una foto" + ], + "fr": [ + "générer une image", + "créer une image", + "fais-moi une image", + "dessine-moi" + ], + "de": [ + "bild generieren", + "bild erstellen", + "erstelle ein bild", + "zeichne mir" + ] + } } }, "informationalPatterns": { @@ -1768,22 +2453,108 @@ "是什么", "とは" ], - "ko": ["뭐야", "뭐임", "무엇", "어떻게", "설명해", "알려줘", "키워드", "감지", "오탐"], - "ja": ["とは", "って何", "どうやって", "説明して", "キーワード", "検出", "誤検出"], + "ko": [ + "뭐야", + "뭐임", + "무엇", + "어떻게", + "설명해", + "알려줘", + "키워드", + "감지", + "오탐" + ], + "ja": [ + "とは", + "って何", + "どうやって", + "説明して", + "キーワード", + "検出", + "誤検出" + ], "zh": ["是什么", "什么是", "怎么", "解释", "关键词", "检测", "误报"], - "es": ["qué es", "cómo", "explica", "palabra clave", "falso positivo", "detectado"], - "fr": ["c'est quoi", "comment", "explique", "mot-clé", "faux positif", "détecté"], - "de": ["was ist", "wie", "erkläre", "schlüsselwort", "falsch positiv", "erkannt"], - "pt": ["o que é", "como", "explique", "palavra-chave", "falso positivo", "detectado"], - "ru": ["что такое", "как", "объясни", "ключевое слово", "ложное срабатывание", "обнаружено"], - "nl": ["wat is", "hoe", "leg uit", "sleutelwoord", "vals positief", "gedetecteerd"], - "pl": ["co to", "jak", "wyjaśnij", "słowo kluczowe", "fałszywy alarm", "wykryto"] + "es": [ + "qué es", + "cómo", + "explica", + "palabra clave", + "falso positivo", + "detectado" + ], + "fr": [ + "c'est quoi", + "comment", + "explique", + "mot-clé", + "faux positif", + "détecté" + ], + "de": [ + "was ist", + "wie", + "erkläre", + "schlüsselwort", + "falsch positiv", + 
"erkannt" + ], + "pt": [ + "o que é", + "como", + "explique", + "palavra-chave", + "falso positivo", + "detectado" + ], + "ru": [ + "что такое", + "как", + "объясни", + "ключевое слово", + "ложное срабатывание", + "обнаружено" + ], + "nl": [ + "wat is", + "hoe", + "leg uit", + "sleutelwoord", + "vals positief", + "gedetecteerd" + ], + "pl": [ + "co to", + "jak", + "wyjaśnij", + "słowo kluczowe", + "fałszywy alarm", + "wykryto" + ] }, "excludedWorkflows": ["tools", "stack-set", "exec-plan"], "cjkScripts": ["ko", "ja", "zh"], "extensionRouting": { - "frontend-engineer": ["tsx", "jsx", "css", "scss", "less", "vue", "svelte", "html"], - "backend-engineer": ["go", "py", "java", "rs", "rb", "php", "controller", "service", "resolver"], + "frontend-engineer": [ + "tsx", + "jsx", + "css", + "scss", + "less", + "vue", + "svelte", + "html" + ], + "backend-engineer": [ + "go", + "py", + "java", + "rs", + "rb", + "php", + "controller", + "service", + "resolver" + ], "db-engineer": ["sql", "prisma", "graphql", "migration"], "mobile-engineer": ["dart", "swift", "kt", "xib", "storyboard"], "designer": ["figma", "sketch", "svg"] diff --git a/.claude/hooks/types.ts b/.claude/hooks/types.ts index f9bf420..2b79035 100644 --- a/.claude/hooks/types.ts +++ b/.claude/hooks/types.ts @@ -1,8 +1,8 @@ // Claude Code Hook Types for oh-my-agent // Shared across Claude Code, Codex CLI, Cursor, Gemini CLI, and Qwen Code -import { existsSync } from "node:fs" -import { dirname, join } from "node:path" +import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; // --- Project Root Resolution --- @@ -12,52 +12,55 @@ import { dirname, join } from "node:path" * (e.g. packages/i18n during a build) from creating state files * in the wrong location. */ -const MAX_DEPTH = 20 +const MAX_DEPTH = 20; export function resolveGitRoot(startDir: string): string { - let dir = startDir + let dir = startDir; for (let i = 0; i < MAX_DEPTH; i++) { - if (existsSync(join(dir, ".git"))) return dir - const parent = dirname(dir) - if (parent === dir) return startDir - dir = parent + if (existsSync(join(dir, ".git"))) return dir; + const parent = dirname(dir); + if (parent === dir) return startDir; + dir = parent; } - return startDir + return startDir; } // --- Vendor Detection --- -export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen" +export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen"; // --- Hook Input (unified) --- export interface HookInput { - prompt?: string - sessionId?: string - session_id?: string - hook_event_name?: string - cwd?: string - workspace_roots?: string[] + prompt?: string; + sessionId?: string; + session_id?: string; + hook_event_name?: string; + cwd?: string; + workspace_roots?: string[]; // Gemini: AfterAgent fields - prompt_response?: string - stop_hook_active?: boolean + prompt_response?: string; + stop_hook_active?: boolean; // Claude/Qwen: Stop fields - stopReason?: string + stopReason?: string; } // --- Hook Output Builders --- -export function makePromptOutput(vendor: Vendor, additionalContext: string): string { +export function makePromptOutput( + vendor: Vendor, + additionalContext: string, +): string { switch (vendor) { case "claude": - return JSON.stringify({ additionalContext }) + return JSON.stringify({ additionalContext }); case "codex": return JSON.stringify({ hookSpecificOutput: { hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "cursor": return JSON.stringify({ additionalContext, @@ -66,14 +69,14 @@ export function 
makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "gemini": return JSON.stringify({ hookSpecificOutput: { hookEventName: "BeforeAgent", additionalContext, }, - }) + }); case "qwen": // Qwen Code fork uses hookSpecificOutput (same as Codex) return JSON.stringify({ @@ -81,7 +84,7 @@ export function makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); } } @@ -91,22 +94,25 @@ export function makeBlockOutput(vendor: Vendor, reason: string): string { case "codex": case "cursor": case "qwen": - return JSON.stringify({ decision: "block", reason }) + return JSON.stringify({ decision: "block", reason }); case "gemini": // Gemini AfterAgent uses "deny" to reject response and force retry - return JSON.stringify({ decision: "deny", reason }) + return JSON.stringify({ decision: "deny", reason }); } } // --- PreToolUse Output Builder --- -export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, unknown>): string { +export function makePreToolOutput( + vendor: Vendor, + updatedInput: Record<string, unknown>, +): string { switch (vendor) { case "gemini": return JSON.stringify({ decision: "rewrite", tool_input: updatedInput, - }) + }); case "cursor": return JSON.stringify({ updated_input: updatedInput, @@ -114,7 +120,7 @@ export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, u hookEventName: "PreToolUse", updatedInput, }, - }) + }); case "claude": case "codex": case "qwen": @@ -123,15 +129,15 @@ export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, u hookEventName: "PreToolUse", updatedInput, }, - }) + }); } } // --- Shared Types --- export interface ModeState { - workflow: string - sessionId: string - activatedAt: string - reinforcementCount: number + workflow: string; + sessionId: string; + activatedAt: string; + reinforcementCount: number; } diff --git a/.claude/settings.json b/.claude/settings.json index 2dc8d57..0b014ea 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -49,7 +49,12 @@ "command": "bun \"$CLAUDE_PROJECT_DIR/.claude/hooks/hud.ts\"" }, "permissions": { - "allow": ["Bash(bun run:*)", "Bash(bunx tsx:*)", "Bash(git add:*)", "mcp__serena__*"] + "allow": [ + "Bash(bun run:*)", + "Bash(bunx tsx:*)", + "Bash(git add:*)", + "mcp__serena__*" + ] }, "env": { "cleanupPeriodDays": 180, diff --git a/.claude/skills/oma-image b/.claude/skills/oma-image new file mode 120000 index 0000000..d5817b7 --- /dev/null +++ b/.claude/skills/oma-image @@ -0,0 +1 @@ +../../.agents/skills/oma-image \ No newline at end of file diff --git a/.claude/skills/oma-observability b/.claude/skills/oma-observability new file mode 120000 index 0000000..d3cfa7d --- /dev/null +++ b/.claude/skills/oma-observability @@ -0,0 +1 @@ +../../.agents/skills/oma-observability \ No newline at end of file diff --git a/.claude/skills/oma-scholar b/.claude/skills/oma-scholar new file mode 120000 index 0000000..528228d --- /dev/null +++ b/.claude/skills/oma-scholar @@ -0,0 +1 @@ +../../.agents/skills/oma-scholar \ No newline at end of file diff --git a/.codex/agents/architecture-reviewer.toml b/.codex/agents/architecture-reviewer.toml index 75d32b8..c495384 100644 --- a/.codex/agents/architecture-reviewer.toml +++ b/.codex/agents/architecture-reviewer.toml @@ -13,6 +13,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/codex.md`: - Write results to project root 
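
For reference, a minimal sketch of the per-vendor envelopes the builders in types.ts produce; the `"ctx"` payload is an assumed placeholder:

```ts
import { makePromptOutput } from "./types.ts";

makePromptOutput("claude", "ctx");
// {"additionalContext":"ctx"}
makePromptOutput("gemini", "ctx");
// {"hookSpecificOutput":{"hookEventName":"BeforeAgent","additionalContext":"ctx"}}
makePromptOutput("codex", "ctx"); // Qwen Code emits the same shape
// {"hookSpecificOutput":{"hookEventName":"UserPromptSubmit","additionalContext":"ctx"}}
// Cursor returns both the top-level additionalContext and the hookSpecificOutput wrapper.
```
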
`.agents/results/result-architecture.md` (orchestrated: `result-architecture-{sessionId}.md`) - Include: status, recommendation summary, tradeoffs, risks, validation steps, artifacts created +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY recommendations or structural edits, output this block: @@ -29,6 +31,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT change architecture or code +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.codex/agents/backend-engineer.toml b/.codex/agents/backend-engineer.toml index 7e0fd46..0c7995e 100644 --- a/.codex/agents/backend-engineer.toml +++ b/.codex/agents/backend-engineer.toml @@ -13,6 +13,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/codex.md`: - Write results to project root `.agents/results/result-backend.md` (orchestrated: `result-backend-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -29,6 +31,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT write code +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.codex/agents/db-engineer.toml b/.codex/agents/db-engineer.toml index d53e1ca..726f694 100644 --- a/.codex/agents/db-engineer.toml +++ b/.codex/agents/db-engineer.toml @@ -13,6 +13,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/codex.md`: - Write results to project root `.agents/results/result-db.md` (orchestrated: `result-db-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -25,6 +27,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.codex/agents/debug-investigator.toml b/.codex/agents/debug-investigator.toml index 34ee8e9..7484491 100644 --- a/.codex/agents/debug-investigator.toml +++ b/.codex/agents/debug-investigator.toml @@ -13,6 +13,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/codex.md`: - Write results to project root `.agents/results/result-debug.md` (orchestrated: `result-debug-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -29,6 +31,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT write code +<!-- CHARTER_CHECK_END --> ## Diagnosis Process diff --git a/.codex/agents/frontend-engineer.toml b/.codex/agents/frontend-engineer.toml index 46378a2..ae9a0fb 100644 --- a/.codex/agents/frontend-engineer.toml +++ b/.codex/agents/frontend-engineer.toml @@ -13,6 +13,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/codex.md`: - Write results to project root `.agents/results/result-frontend.md` (orchestrated: `result-frontend-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -25,6 +27,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - 
Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.codex/agents/mobile-engineer.toml b/.codex/agents/mobile-engineer.toml index c7bc3d0..8c35744 100644 --- a/.codex/agents/mobile-engineer.toml +++ b/.codex/agents/mobile-engineer.toml @@ -13,6 +13,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/codex.md`: - Write results to project root `.agents/results/result-mobile.md` (orchestrated: `result-mobile-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -25,6 +27,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.codex/agents/pm-planner.toml b/.codex/agents/pm-planner.toml index 0212c61..da0ec44 100644 --- a/.codex/agents/pm-planner.toml +++ b/.codex/agents/pm-planner.toml @@ -13,6 +13,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/codex.md`: - Write results to project root `.agents/results/result-pm.md` (orchestrated: `result-pm-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY planning work, output this block: @@ -29,6 +31,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT proceed +<!-- CHARTER_CHECK_END --> ## Planning Process diff --git a/.codex/agents/qa-reviewer.toml b/.codex/agents/qa-reviewer.toml index 8b57a13..02c6da3 100644 --- a/.codex/agents/qa-reviewer.toml +++ b/.codex/agents/qa-reviewer.toml @@ -13,6 +13,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/codex.md`: - Write results to project root `.agents/results/result-qa.md` (orchestrated: `result-qa-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before starting review, output this block: @@ -25,6 +27,7 @@ CHARTER_CHECK: - Must NOT do: modify source code, skip severity levels, report unverified findings - Success criteria: {all files reviewed, findings with file:line references} ``` +<!-- CHARTER_CHECK_END --> ## Review Priority Order diff --git a/.codex/agents/tf-infra-engineer.toml b/.codex/agents/tf-infra-engineer.toml index bf74059..41af828 100644 --- a/.codex/agents/tf-infra-engineer.toml +++ b/.codex/agents/tf-infra-engineer.toml @@ -13,6 +13,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/codex.md`: - Write results to project root `.agents/results/result-tf-infra.md` (orchestrated: `result-tf-infra-{sessionId}.md`) - Include: status, summary, files changed, validation results, plan/apply notes, acceptance checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY infrastructure changes, output this block: @@ -29,6 +31,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT apply destructive changes +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.codex/config.toml b/.codex/config.toml index 250e620..ff12a75 100644 --- a/.codex/config.toml +++ b/.codex/config.toml @@ -1,4 +1,10 @@ - - [features] codex_hooks = true + +[mcp_servers.serena] +command = "uvx" +args = [ "--from", "git+https://github.com/oraios/serena", "serena", 
"start-mcp-server", "--context", "codex", "--project", "." ] + +[mcp_servers.serena.env] +SERENA_LOG_LEVEL = "info" + diff --git a/.codex/hooks/hud.ts b/.codex/hooks/hud.ts index 597b95c..73f0ac4 100644 --- a/.codex/hooks/hud.ts +++ b/.codex/hooks/hud.ts @@ -9,163 +9,166 @@ * stdout: ANSI-colored status text */ -import { existsSync, readdirSync, readFileSync } from "node:fs" -import { join } from "node:path" -import type { ModeState } from "./types.ts" +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import type { ModeState } from "./types.ts"; // ── ANSI Colors ─────────────────────────────────────────────── -const dim = (s: string) => `\x1b[2m${s}\x1b[22m` -const bold = (s: string) => `\x1b[1m${s}\x1b[22m` -const green = (s: string) => `\x1b[32m${s}\x1b[39m` -const yellow = (s: string) => `\x1b[33m${s}\x1b[39m` -const red = (s: string) => `\x1b[31m${s}\x1b[39m` -const cyan = (s: string) => `\x1b[36m${s}\x1b[39m` +const dim = (s: string) => `\x1b[2m${s}\x1b[22m`; +const bold = (s: string) => `\x1b[1m${s}\x1b[22m`; +const green = (s: string) => `\x1b[32m${s}\x1b[39m`; +const yellow = (s: string) => `\x1b[33m${s}\x1b[39m`; +const red = (s: string) => `\x1b[31m${s}\x1b[39m`; +const cyan = (s: string) => `\x1b[36m${s}\x1b[39m`; function colorByThreshold(value: number, text: string): string { - if (value >= 85) return red(text) - if (value >= 70) return yellow(text) - return green(text) + if (value >= 85) return red(text); + if (value >= 70) return yellow(text); + return green(text); } // ── Stdin Parsing ───────────────────────────────────────────── interface RateLimit { - used_percentage?: number - resets_at?: string + used_percentage?: number; + resets_at?: string; } interface StatuslineStdin { - cwd?: string - model?: { id?: string; display_name?: string } + cwd?: string; + model?: { id?: string; display_name?: string }; context_window?: { - context_window_size?: number - used_percentage?: number - } + context_window_size?: number; + used_percentage?: number; + }; cost?: { - total_cost_usd?: number - total_lines_added?: number - total_lines_removed?: number - total_duration_ms?: number - } + total_cost_usd?: number; + total_lines_added?: number; + total_lines_removed?: number; + total_duration_ms?: number; + }; rate_limits?: { - five_hour?: RateLimit - seven_day?: RateLimit - } + five_hour?: RateLimit; + seven_day?: RateLimit; + }; } function readStdin(): StatuslineStdin { try { - return JSON.parse(readFileSync("/dev/stdin", "utf-8")) + return JSON.parse(readFileSync("/dev/stdin", "utf-8")); } catch { - return {} + return {}; } } // ── Active Workflow Detection ───────────────────────────────── function getActiveWorkflow(projectDir: string): ModeState | null { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return null + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return null; try { for (const file of readdirSync(stateDir)) { - if (!file.endsWith(".json") || !file.includes("-state-")) continue - const content = readFileSync(join(stateDir, file), "utf-8") - const state: ModeState = JSON.parse(content) + if (!file.endsWith(".json") || !file.includes("-state-")) continue; + const content = readFileSync(join(stateDir, file), "utf-8"); + const state: ModeState = JSON.parse(content); // Skip stale (>2h) - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - if (elapsed > 2 * 60 * 60 * 1000) continue + const elapsed = Date.now() - new 
Date(state.activatedAt).getTime(); + if (elapsed > 2 * 60 * 60 * 1000) continue; - return state + return state; } } catch { // ignore } - return null + return null; } // ── Model Name Shortener ────────────────────────────────────── function shortModel(model?: { id?: string; display_name?: string }): string { - const name = model?.display_name || model?.id || "" - if (!name) return "" + const name = model?.display_name || model?.id || ""; + if (!name) return ""; // "Claude Opus 4.6 (1M context)" → "Opus 4.6" - const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i) - if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}` - return name.split("/").pop()?.slice(0, 15) || "" + const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i); + if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}`; + return name.split("/").pop()?.slice(0, 15) || ""; } // ── Rate Limit Helpers ─────────────────────────────────────── function formatCountdown(resetsAt: string): string { - const remaining = new Date(resetsAt).getTime() - Date.now() - if (remaining <= 0) return "" - const h = Math.floor(remaining / 3_600_000) - const m = Math.floor((remaining % 3_600_000) / 60_000) - return h > 0 ? `${h}h${m}m` : `${m}m` + const remaining = new Date(resetsAt).getTime() - Date.now(); + if (remaining <= 0) return ""; + const h = Math.floor(remaining / 3_600_000); + const m = Math.floor((remaining % 3_600_000) / 60_000); + return h > 0 ? `${h}h${m}m` : `${m}m`; } function formatRateLimit(label: string, rl?: RateLimit): string | null { - if (!rl || rl.used_percentage == null) return null - const pct = Math.round(rl.used_percentage) - const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : "" - const text = countdown ? `${label}:${pct}%(${countdown})` : `${label}:${pct}%` - return colorByThreshold(pct, text) + if (!rl || rl.used_percentage == null) return null; + const pct = Math.round(rl.used_percentage); + const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : ""; + const text = countdown + ? `${label}:${pct}%(${countdown})` + : `${label}:${pct}%`; + return colorByThreshold(pct, text); } // ── Main ────────────────────────────────────────────────────── function main() { - const input = readStdin() - const projectDir = process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd() - const parts: string[] = [] + const input = readStdin(); + const projectDir = + process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd(); + const parts: string[] = []; // 1. OMA label - parts.push(bold(cyan("[OMA]"))) + parts.push(bold(cyan("[OMA]"))); // 2. Model - const model = shortModel(input.model) - if (model) parts.push(dim(model)) + const model = shortModel(input.model); + if (model) parts.push(dim(model)); // 3. Context % - const ctxPct = input.context_window?.used_percentage + const ctxPct = input.context_window?.used_percentage; if (ctxPct != null) { - parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)) + parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)); } // 4. Session cost - const cost = input.cost?.total_cost_usd + const cost = input.cost?.total_cost_usd; if (cost != null && cost > 0) { - parts.push(dim(`$${cost.toFixed(2)}`)) + parts.push(dim(`$${cost.toFixed(2)}`)); } // 5. 
Rate limits (5h / 7d) - const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour) - const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day) + const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour); + const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day); if (rl5 || rl7) { - parts.push([rl5, rl7].filter(Boolean).join(dim(" "))) + parts.push([rl5, rl7].filter(Boolean).join(dim(" "))); } // 6. Lines changed - const added = input.cost?.total_lines_added - const removed = input.cost?.total_lines_removed + const added = input.cost?.total_lines_added; + const removed = input.cost?.total_lines_removed; if (added || removed) { - const diffParts: string[] = [] - if (added) diffParts.push(green(`+${added}`)) - if (removed) diffParts.push(red(`-${removed}`)) - parts.push(diffParts.join(dim("/"))) + const diffParts: string[] = []; + if (added) diffParts.push(green(`+${added}`)); + if (removed) diffParts.push(red(`-${removed}`)); + parts.push(diffParts.join(dim("/"))); } // 7. Active workflow - const workflow = getActiveWorkflow(projectDir) + const workflow = getActiveWorkflow(projectDir); if (workflow) { - const label = `${workflow.workflow}:${workflow.reinforcementCount}` - parts.push(yellow(label)) + const label = `${workflow.workflow}:${workflow.reinforcementCount}`; + parts.push(yellow(label)); } - process.stdout.write(parts.join(dim(" │ "))) + process.stdout.write(parts.join(dim(" │ "))); } -main() +main(); diff --git a/.codex/hooks/keyword-detector.ts b/.codex/hooks/keyword-detector.ts index 0ce0d0e..e838a0a 100644 --- a/.codex/hooks/keyword-detector.ts +++ b/.codex/hooks/keyword-detector.ts @@ -12,59 +12,205 @@ * exit 0 = always (allow) */ -import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { type ModeState, makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { + type ModeState, + makePromptOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +// ── Guard 1: UserPromptSubmit-only trigger ──────────────────── +// Hook event names that represent genuine user input (not agent responses) +const VALID_USER_EVENTS = new Set([ + "UserPromptSubmit", + "beforeSubmitPrompt", // Cursor + "BeforeAgent", // Gemini (fires before agent processes user prompt) +]); + +/** + * Returns true if the hook input indicates this is a genuine user prompt, + * not an agent-generated response. Prevents re-trigger loops. 
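
A sketch of what hud.ts renders for an assumed payload (all values hypothetical); the thresholds and the `│` separator come from the code above:

```ts
// echo '<json below>' | bun .codex/hooks/hud.ts
const sample = {
  model: { display_name: "Claude Opus 4.6" },
  context_window: { used_percentage: 42 },
  cost: { total_cost_usd: 1.23, total_lines_added: 120, total_lines_removed: 30 },
  rate_limits: {
    five_hour: {
      used_percentage: 76.4,
      resets_at: new Date(Date.now() + 72 * 60_000).toISOString(),
    },
  },
};
// Rendered (ANSI colors stripped); 76% crosses the 70% threshold, so 5h shows yellow:
// [OMA] │ Opus 4.6 │ ctx:42% │ $1.23 │ 5h:76%(1h12m) │ +120/-30
```
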
+ */ +export function isGenuineUserPrompt(input: Record<string, unknown>): boolean { + const event = input.hook_event_name as string | undefined; + // If event is explicitly provided, validate it + if (event !== undefined) { + return VALID_USER_EVENTS.has(event); + } + // No event field — assume genuine (backward compat with vendors that omit it) + return true; +} + +// ── Guard 3: Reinforcement suppression ─────────────────────── + +const REINFORCEMENT_WINDOW_MS = 60_000; // 60 seconds +const REINFORCEMENT_MAX_COUNT = 2; // allow up to 2, suppress 3rd+ + +export interface KeywordDetectorState { + triggers: Record< + string, + { + lastTriggeredAt: string; // ISO timestamp + count: number; + } + >; +} + +function getKwStateFilePath(projectDir: string): string { + const dir = join(projectDir, ".agents", "state"); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + return join(dir, "keyword-detector-state.json"); +} + +/** + * Load the keyword-detector reinforcement state from disk. + * Resets gracefully if the file is missing or corrupt. + */ +export function loadKwState(projectDir: string): KeywordDetectorState { + const filePath = getKwStateFilePath(projectDir); + if (!existsSync(filePath)) return { triggers: {} }; + try { + const raw = readFileSync(filePath, "utf-8"); + const parsed = JSON.parse(raw) as unknown; + if ( + typeof parsed === "object" && + parsed !== null && + "triggers" in parsed && + typeof (parsed as Record<string, unknown>).triggers === "object" + ) { + return parsed as KeywordDetectorState; + } + return { triggers: {} }; + } catch { + // Corrupt file — reset + return { triggers: {} }; + } +} + +/** + * Save reinforcement state to disk. + */ +export function saveKwState( + projectDir: string, + state: KeywordDetectorState, +): void { + try { + const filePath = getKwStateFilePath(projectDir); + writeFileSync(filePath, JSON.stringify(state, null, 2)); + } catch { + // Non-fatal — reinforcement suppression is best-effort + } +} + +/** + * Returns true if the keyword should be suppressed due to reinforcement loop. + * A keyword is suppressed if it was triggered >= REINFORCEMENT_MAX_COUNT times + * within the last REINFORCEMENT_WINDOW_MS milliseconds. + */ +export function isReinforcementSuppressed( + state: KeywordDetectorState, + keyword: string, + nowMs?: number, +): boolean { + const now = nowMs ?? Date.now(); + const entry = state.triggers[keyword]; + if (!entry) return false; + const lastMs = new Date(entry.lastTriggeredAt).getTime(); + if (Number.isNaN(lastMs)) return false; + const withinWindow = now - lastMs < REINFORCEMENT_WINDOW_MS; + return withinWindow && entry.count >= REINFORCEMENT_MAX_COUNT; +} + +/** + * Record a keyword trigger in the reinforcement state. + * Resets count if the previous trigger was outside the window. + */ +export function recordKwTrigger( + state: KeywordDetectorState, + keyword: string, + nowMs?: number, +): KeywordDetectorState { + const now = nowMs ?? Date.now(); + const entry = state.triggers[keyword]; + let count = 1; + if (entry) { + const lastMs = new Date(entry.lastTriggeredAt).getTime(); + const withinWindow = + !Number.isNaN(lastMs) && now - lastMs < REINFORCEMENT_WINDOW_MS; + count = withinWindow ? 
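
A quick illustration of Guard 1, using the event names from the set above (inputs assumed):

```ts
import { isGenuineUserPrompt } from "./keyword-detector.ts";

isGenuineUserPrompt({ hook_event_name: "UserPromptSubmit" });   // true  — real user input
isGenuineUserPrompt({ hook_event_name: "beforeSubmitPrompt" }); // true  — Cursor
isGenuineUserPrompt({ hook_event_name: "AfterAgent" });         // false — agent response, skip
isGenuineUserPrompt({});                                        // true  — field omitted, assume genuine
```
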
entry.count + 1 : 1; + } + return { + ...state, + triggers: { + ...state.triggers, + [keyword]: { + lastTriggeredAt: new Date(now).toISOString(), + count, + }, + }, + }; +} // ── Vendor Detection ────────────────────────────────────────── function inferVendorFromScriptPath(): Vendor | null { - const path = import.meta.path - if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor" - if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen" - if (path.includes(`${join(".claude", "hooks")}`)) return "claude" - if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini" - if (path.includes(`${join(".codex", "hooks")}`)) return "codex" - return null + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; } function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - const byScriptPath = inferVendorFromScriptPath() - if (byScriptPath) return byScriptPath - if (event === "BeforeAgent") return "gemini" - if (event === "beforeSubmitPrompt") return "cursor" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeAgent") return "gemini"; + if (event === "beforeSubmitPrompt") return "cursor"; if (event === "UserPromptSubmit") { // Codex uses snake_case session_id, Claude uses camelCase sessionId - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } // Qwen Code sets QWEN_PROJECT_DIR; Claude sets CLAUDE_PROJECT_DIR - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── Config Loading ──────────────────────────────────────────── @@ -73,74 +219,83 @@ interface TriggerConfig { workflows: Record< string, { - persistent: boolean - keywords: Record<string, string[]> + persistent: boolean; + keywords: Record<string, string[]>; } - > - informationalPatterns: Record<string, string[]> - excludedWorkflows: string[] - cjkScripts: string[] - extensionRouting?: Record<string, string[]> + >; + informationalPatterns: 
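
A minimal sketch of Guard 3 end to end, with assumed timestamps; the 60s window and the allow-2/suppress-3rd behavior follow the constants above:

```ts
import {
  isReinforcementSuppressed,
  recordKwTrigger,
  type KeywordDetectorState,
} from "./keyword-detector.ts";

let state: KeywordDetectorState = { triggers: {} };
const t0 = Date.parse("2025-01-01T00:00:00Z");
state = recordKwTrigger(state, "orchestrate", t0);            // count = 1
state = recordKwTrigger(state, "orchestrate", t0 + 10_000);   // count = 2
isReinforcementSuppressed(state, "orchestrate", t0 + 20_000); // true  → third trigger suppressed
isReinforcementSuppressed(state, "orchestrate", t0 + 75_000); // false → window expired, count resets on next trigger
```
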
Record<string, string[]>; + excludedWorkflows: string[]; + cjkScripts: string[]; + extensionRouting?: Record<string, string[]>; } function loadConfig(): TriggerConfig { - const configPath = join(dirname(import.meta.path), "triggers.json") - return JSON.parse(readFileSync(configPath, "utf-8")) + const configPath = join(dirname(import.meta.path), "triggers.json"); + return JSON.parse(readFileSync(configPath, "utf-8")); } function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? "en"; } catch { - return "en" + return "en"; } } // ── Pattern Builder ─────────────────────────────────────────── export function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } -export function buildPatterns(keywords: Record<string, string[]>, lang: string, cjkScripts: string[]): RegExp[] { +export function buildPatterns( + keywords: Record<string, string[]>, + lang: string, + cjkScripts: string[], +): RegExp[] { const allKeywords = [ ...(keywords["*"] ?? []), ...(keywords.en ?? []), ...(lang !== "en" ? (keywords[lang] ?? []) : []), - ] + ]; return allKeywords.map((kw) => { - const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+") - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) { - return new RegExp(escaped, "i") + const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+"); + if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) { + return new RegExp(escaped, "i"); } - return new RegExp(`\\b${escaped}\\b`, "i") - }) + return new RegExp(`\\b${escaped}\\b`, "i"); + }); } -function buildInformationalPatterns(config: TriggerConfig, lang: string): RegExp[] { - const patterns = [...(config.informationalPatterns.en ?? [])] +function buildInformationalPatterns( + config: TriggerConfig, + lang: string, +): RegExp[] { + const patterns = [...(config.informationalPatterns.en ?? [])]; if (lang !== "en") { - patterns.push(...(config.informationalPatterns[lang] ?? [])) + patterns.push(...(config.informationalPatterns[lang] ?? 
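
A sketch of the patterns buildPatterns yields, with assumed keywords and a `de` language config: ASCII keywords get `\b` word boundaries and whitespace-flexible matching, while non-ASCII keywords (or any CJK user language) match bare, since `\b` is unreliable outside ASCII:

```ts
import { buildPatterns } from "./keyword-detector.ts";

buildPatterns({ "*": ["code review"], de: ["überprüfen"] }, "de", ["ko", "ja", "zh"]);
// → [/\bcode\s+review\b/i, /überprüfen/i]
```
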
[])); } return patterns.map((p) => { - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (/[^\x00-\x7F]/.test(p)) return new RegExp(escapeRegex(p), "i") - return new RegExp(`\\b${escapeRegex(p)}\\b`, "i") - }) + if (/[^\p{ASCII}]/u.test(p)) return new RegExp(escapeRegex(p), "i"); + return new RegExp(`\\b${escapeRegex(p)}\\b`, "i"); + }); } // ── Filters ─────────────────────────────────────────────────── -export function isInformationalContext(prompt: string, matchIndex: number, infoPatterns: RegExp[]): boolean { - const windowStart = Math.max(0, matchIndex - 60) - const window = prompt.slice(windowStart, matchIndex + 60) - return infoPatterns.some((p) => p.test(window)) +export function isInformationalContext( + prompt: string, + matchIndex: number, + infoPatterns: RegExp[], +): boolean { + const windowStart = Math.max(0, matchIndex - 60); + const window = prompt.slice(windowStart, matchIndex + 60); + return infoPatterns.some((p) => p.test(window)); } /** @@ -148,12 +303,16 @@ export function isInformationalContext(prompt: string, matchIndex: number, infoP * only match keywords in the first N chars of the user's prompt. * Keywords deep in the prompt are likely from pasted content, not user intent. */ -const PERSISTENT_MATCH_LIMIT = 200 - -export function isPastedContent(matchIndex: number, isPersistent: boolean, promptLength: number): boolean { - if (!isPersistent) return false - if (promptLength <= PERSISTENT_MATCH_LIMIT) return false - return matchIndex > PERSISTENT_MATCH_LIMIT +const PERSISTENT_MATCH_LIMIT = 200; + +export function isPastedContent( + matchIndex: number, + isPersistent: boolean, + promptLength: number, +): boolean { + if (!isPersistent) return false; + if (promptLength <= PERSISTENT_MATCH_LIMIT) return false; + return matchIndex > PERSISTENT_MATCH_LIMIT; } /** @@ -180,11 +339,11 @@ const QUESTION_PATTERNS: RegExp[] = [ /^.*\banything worth\b/i, /^.*\bwhat.*(feature|difference|reference)/i, /^.*\bcompare\b/i, -] +]; export function isAnalyticalQuestion(prompt: string): boolean { - const firstLine = prompt.split("\n")[0].trim() - return QUESTION_PATTERNS.some((p) => p.test(firstLine)) + const firstLine = prompt.split("\n")[0].trim(); + return QUESTION_PATTERNS.some((p) => p.test(firstLine)); } export function stripCodeBlocks(text: string): string { @@ -193,11 +352,11 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") // unclosed fenced blocks (strip to end) .replace(/`{3,}[^`]*`{3,}/g, "") // single-line fenced blocks (```...```) .replace(/`[^`\n]+`/g, "") // inline code (no newlines allowed) - .replace(/"[^"\n]*"/g, "") // quoted strings + .replace(/"[^"\n]*"/g, ""); // quoted strings } export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } // ── Extension Detection ────────────────────────────────────── @@ -228,62 +387,70 @@ const EXCLUDE_EXTS = new Set([ "eot", "map", "d", -]) +]); export function detectExtensions(prompt: string): string[] { - const extPattern = /\.([a-zA-Z]{1,12})\b/g - const extensions = new Set<string>() - let match: RegExpExecArray | null - // biome-ignore lint/suspicious/noAssignInExpressions: standard regex.exec loop pattern - while ((match = extPattern.exec(prompt)) !== null) { - const ext = match[1].toLowerCase() + const extPattern = /\.([a-zA-Z]{1,12})\b/g; + const extensions = new Set<string>(); + for (const match of 
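
The pasted-content guard in practice, with assumed match offsets; the 200-char limit is PERSISTENT_MATCH_LIMIT from above:

```ts
import { isPastedContent } from "./keyword-detector.ts";

// Persistent workflows only honor keywords in the first 200 chars of long prompts.
isPastedContent(150, true, 5_000);    // false — early keyword, genuine intent
isPastedContent(1_200, true, 5_000);  // true  — deep in pasted content, ignored
isPastedContent(1_200, false, 5_000); // false — non-persistent workflows unaffected
isPastedContent(150, true, 180);      // false — short prompts are always honored
```
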
prompt.matchAll(extPattern)) { + const ext = match[1].toLowerCase(); if (!EXCLUDE_EXTS.has(ext)) { - extensions.add(ext) + extensions.add(ext); } } - return [...extensions] + return [...extensions]; } -export function resolveAgentFromExtensions(extensions: string[], routing: Record<string, string[]>): string | null { - if (extensions.length === 0) return null +export function resolveAgentFromExtensions( + extensions: string[], + routing: Record<string, string[]>, +): string | null { + if (extensions.length === 0) return null; - const scores = new Map<string, number>() + const scores = new Map<string, number>(); for (const ext of extensions) { for (const [agent, agentExts] of Object.entries(routing)) { if (agentExts.includes(ext)) { - scores.set(agent, (scores.get(agent) ?? 0) + 1) + scores.set(agent, (scores.get(agent) ?? 0) + 1); } } } - if (scores.size === 0) return null + if (scores.size === 0) return null; - let best: string | null = null - let bestScore = 0 + let best: string | null = null; + let bestScore = 0; for (const [agent, score] of scores) { if (score > bestScore) { - bestScore = score - best = agent + bestScore = score; + best = agent; } } - return best + return best; } // ── State Management ────────────────────────────────────────── function getStateDir(projectDir: string): string { - const dir = join(projectDir, ".agents", "state") - if (!existsSync(dir)) mkdirSync(dir, { recursive: true }) - return dir + const dir = join(projectDir, ".agents", "state"); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + return dir; } -function activateMode(projectDir: string, workflow: string, sessionId: string): void { +function activateMode( + projectDir: string, + workflow: string, + sessionId: string, +): void { const state: ModeState = { workflow, sessionId, activatedAt: new Date().toISOString(), reinforcementCount: 0, - } - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) + }; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Deactivation Detection ─────────────────────────────────── @@ -300,27 +467,33 @@ export const DEACTIVATION_PHRASES: Record<string, string[]> = { ru: ["воркфлоу завершён", "рабочий процесс завершён"], nl: ["workflow voltooid", "workflow klaar"], pl: ["workflow zakończony", "workflow ukończony"], -} +}; export function isDeactivationRequest(prompt: string, lang: string): boolean { - const phrases = [...(DEACTIVATION_PHRASES.en ?? []), ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? []) : [])] - const lower = prompt.toLowerCase() - return phrases.some((phrase) => lower.includes(phrase.toLowerCase())) + const phrases = [ + ...(DEACTIVATION_PHRASES.en ?? []), + ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? 
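
A sketch of extension routing end to end, with an assumed prompt and routing table:

```ts
import { detectExtensions, resolveAgentFromExtensions } from "./keyword-detector.ts";

const exts = detectExtensions("fix the crash in app.tsx and api.go"); // ["tsx", "go"]
resolveAgentFromExtensions(exts, {
  "frontend-engineer": ["tsx", "jsx", "css"],
  "backend-engineer": ["go", "py"],
});
// → "frontend-engineer" — on a tie, the first agent to reach the top score wins
```
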
[]) : []), + ]; + const lower = prompt.toLowerCase(); + return phrases.some((phrase) => lower.includes(phrase.toLowerCase())); } -export function deactivateAllPersistentModes(projectDir: string, sessionId?: string): void { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return +export function deactivateAllPersistentModes( + projectDir: string, + sessionId?: string, +): void { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return; try { - const files = readdirSync(stateDir) + const files = readdirSync(stateDir); for (const file of files) { // Match session-scoped state files: {workflow}-state-{sessionId}.json if (sessionId) { if (file.endsWith(`-state-${sessionId}.json`)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } else if (/-state-/.test(file) && file.endsWith(".json")) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { @@ -331,55 +504,69 @@ export function deactivateAllPersistentModes(projectDir: string, sessionId?: str // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? "" + // Guard 1: Only process genuine user prompts — skip agent-generated content + if (!isGenuineUserPrompt(input)) process.exit(0); + + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? 
""; - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); - const config = loadConfig() - const lang = detectLanguage(projectDir) + const config = loadConfig(); + const lang = detectLanguage(projectDir); // Check for deactivation request before workflow detection if (isDeactivationRequest(prompt, lang)) { - deactivateAllPersistentModes(projectDir, sessionId) - process.exit(0) + deactivateAllPersistentModes(projectDir, sessionId); + process.exit(0); } - const infoPatterns = buildInformationalPatterns(config, lang) - const cleaned = stripCodeBlocks(prompt) - const excluded = new Set(config.excludedWorkflows) + const infoPatterns = buildInformationalPatterns(config, lang); + // Guard 2: Strip code blocks and inline code before scanning for keywords + const cleaned = stripCodeBlocks(prompt); + const excluded = new Set(config.excludedWorkflows); + + // Guard 3: Load reinforcement suppression state + const kwState = loadKwState(projectDir); // Skip persistent workflows entirely if the prompt is an analytical question - const analytical = isAnalyticalQuestion(cleaned) + const analytical = isAnalyticalQuestion(cleaned); for (const [workflow, def] of Object.entries(config.workflows)) { - if (excluded.has(workflow)) continue + if (excluded.has(workflow)) continue; // Analytical questions should never trigger persistent workflows - if (analytical && def.persistent) continue + if (analytical && def.persistent) continue; - const patterns = buildPatterns(def.keywords, lang, config.cjkScripts) + const patterns = buildPatterns(def.keywords, lang, config.cjkScripts); for (const pattern of patterns) { - const match = pattern.exec(cleaned) - if (!match) continue - if (isInformationalContext(cleaned, match.index, infoPatterns)) continue + const match = pattern.exec(cleaned); + if (!match) continue; + if (isInformationalContext(cleaned, match.index, infoPatterns)) continue; // Keywords deep in long prompts are likely pasted content, not user intent - if (isPastedContent(match.index, def.persistent, cleaned.length)) continue + if (isPastedContent(match.index, def.persistent, cleaned.length)) + continue; + + // Guard 3: Suppress if same workflow triggered too many times in 60s + if (isReinforcementSuppressed(kwState, workflow)) continue; if (def.persistent) { - activateMode(projectDir, workflow, sessionId) + activateMode(projectDir, workflow, sessionId); } + // Record this trigger for reinforcement tracking + const updatedState = recordKwTrigger(kwState, workflow); + saveKwState(projectDir, updatedState); const contextLines = [ `[OMA WORKFLOW: ${workflow.toUpperCase()}]`, @@ -387,26 +574,29 @@ async function main() { `Read and follow \`.agents/workflows/${workflow}.md\` step by step.`, `User request: ${prompt}`, `IMPORTANT: Start the workflow IMMEDIATELY. 
Do not ask for confirmation.`, - ] + ]; if (config.extensionRouting) { - const extensions = detectExtensions(prompt) - const agent = resolveAgentFromExtensions(extensions, config.extensionRouting) + const extensions = detectExtensions(prompt); + const agent = resolveAgentFromExtensions( + extensions, + config.extensionRouting, + ); if (agent) { - contextLines.push(`[OMA AGENT HINT: ${agent}]`) + contextLines.push(`[OMA AGENT HINT: ${agent}]`); } } - const context = contextLines.join("\n") + const context = contextLines.join("\n"); - process.stdout.write(makePromptOutput(vendor, context)) - process.exit(0) + process.stdout.write(makePromptOutput(vendor, context)); + process.exit(0); } } - process.exit(0) + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.codex/hooks/persistent-mode.ts b/.codex/hooks/persistent-mode.ts index 4936f4e..311035a 100644 --- a/.codex/hooks/persistent-mode.ts +++ b/.codex/hooks/persistent-mode.ts @@ -13,125 +13,170 @@ * exit 2 = block stop */ -import { existsSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { isDeactivationRequest } from "./keyword-detector.ts" -import { type ModeState, makeBlockOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_REINFORCEMENTS = 5 -const STALE_HOURS = 2 +import { + existsSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { isDeactivationRequest } from "./keyword-detector.ts"; +import { + type ModeState, + makeBlockOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +const MAX_REINFORCEMENTS = 5; +const STALE_HOURS = 2; function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? 
"en"; } catch { - return "en" + return "en"; } } // ── Config Loading ──────────────────────────────────────────── interface TriggerConfig { - workflows: Record<string, { persistent: boolean }> + workflows: Record<string, { persistent: boolean }>; } function loadPersistentWorkflows(): string[] { - const configPath = join(dirname(import.meta.path), "triggers.json") + const configPath = join(dirname(import.meta.path), "triggers.json"); try { - const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")) + const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")); return Object.entries(config.workflows) .filter(([, def]) => def.persistent) - .map(([name]) => name) + .map(([name]) => name); } catch { - return ["ultrawork", "orchestrate", "work"] + return ["ultrawork", "orchestrate", "work"]; } } // ── Vendor Detection ────────────────────────────────────────── +function inferVendorFromScriptPath(): Vendor | null { + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; +} + function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "AfterAgent") return "gemini" - if (event === "Stop") { - if ("session_id" in input && !("sessionId" in input)) return "codex" - } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "AfterAgent") return "gemini"; + if (event === "Stop" && "session_id" in input) return "codex"; + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── State ───────────────────────────────────────────────────── function getStateDir(projectDir: string): string { - return join(projectDir, ".agents", "state") + return join(projectDir, ".agents", "state"); } -function readModeState(projectDir: string, workflow: string, sessionId: string): ModeState | null { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (!existsSync(path)) return null +function readModeState( + projectDir: string, + workflow: string, + sessionId: string, +): ModeState | 
null { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (!existsSync(path)) return null; try { - return JSON.parse(readFileSync(path, "utf-8")) as ModeState + return JSON.parse(readFileSync(path, "utf-8")) as ModeState; } catch { - return null + return null; } } export function isStale(state: ModeState): boolean { - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - return elapsed > STALE_HOURS * 60 * 60 * 1000 + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + return elapsed > STALE_HOURS * 60 * 60 * 1000; } -export function deactivate(projectDir: string, workflow: string, sessionId: string): void { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (existsSync(path)) unlinkSync(path) +export function deactivate( + projectDir: string, + workflow: string, + sessionId: string, +): void { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (existsSync(path)) unlinkSync(path); } -function incrementReinforcement(projectDir: string, workflow: string, sessionId: string, state: ModeState): void { - state.reinforcementCount += 1 - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) +function incrementReinforcement( + projectDir: string, + workflow: string, + sessionId: string, + state: ModeState, +): void { + state.reinforcementCount += 1; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const lang = detectLanguage(projectDir) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const lang = detectLanguage(projectDir); // Check all text fields in stdin for deactivation phrases. 
// The assistant may have included "workflow done" in its response, @@ -144,60 +189,60 @@ async function main() { input.transcript, ] .filter((v): v is string => typeof v === "string") - .join(" ") + .join(" "); if (textToCheck && isDeactivationRequest(textToCheck, lang)) { // Deactivate all persistent workflows for this session - const stateDir = join(projectDir, ".agents", "state") + const stateDir = join(projectDir, ".agents", "state"); if (existsSync(stateDir)) { try { - const suffix = `-state-${sessionId}.json` + const suffix = `-state-${sessionId}.json`; for (const file of readdirSync(stateDir)) { if (file.endsWith(suffix)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { /* ignore */ } } - process.exit(0) + process.exit(0); } - const persistentWorkflows = loadPersistentWorkflows() + const persistentWorkflows = loadPersistentWorkflows(); for (const workflow of persistentWorkflows) { - const state = readModeState(projectDir, workflow, sessionId) - if (!state) continue + const state = readModeState(projectDir, workflow, sessionId); + if (!state) continue; if (isStale(state) || state.reinforcementCount >= MAX_REINFORCEMENTS) { - deactivate(projectDir, workflow, sessionId) - continue + deactivate(projectDir, workflow, sessionId); + continue; } - incrementReinforcement(projectDir, workflow, sessionId, state) + incrementReinforcement(projectDir, workflow, sessionId, state); - const stateFile = `.agents/state/${workflow}-state-${sessionId}.json` + const stateFile = `.agents/state/${workflow}-state-${sessionId}.json`; const reason = [ `[OMA PERSISTENT MODE: ${workflow.toUpperCase()}]`, `The /${workflow} workflow is still active (reinforcement ${state.reinforcementCount}/${MAX_REINFORCEMENTS}).`, `Continue executing the workflow. If all tasks are genuinely complete:`, ` 1. Delete the state file: Bash \`rm ${stateFile}\``, ` 2. Or ask the user to say "워크플로우 완료" / "workflow done"`, - ].join("\n") + ].join("\n"); - writeBlockAndExit(vendor, reason) + writeBlockAndExit(vendor, reason); } - process.exit(0) + process.exit(0); } export function writeBlockAndExit(vendor: Vendor, reason: string): never { - process.stderr.write(reason) - process.stdout.write(makeBlockOutput(vendor, reason)) - process.exit(2) + process.stderr.write(reason); + process.stdout.write(makeBlockOutput(vendor, reason)); + process.exit(2); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.codex/hooks/skill-injector.ts b/.codex/hooks/skill-injector.ts index beda327..9ccce70 100644 --- a/.codex/hooks/skill-injector.ts +++ b/.codex/hooks/skill-injector.ts @@ -12,152 +12,163 @@ * persistent workflow is active (those modes own the session context). 
*/ -import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs" -import { basename, dirname, join } from "node:path" -import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_SKILLS = 3 -const SESSION_TTL_MS = 60 * 60 * 1000 -const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"] +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + writeFileSync, +} from "node:fs"; +import { basename, dirname, join } from "node:path"; +import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts"; + +const MAX_SKILLS = 3; +const SESSION_TTL_MS = 60 * 60 * 1000; +const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"]; // ── Vendor Detection ────────────────────────────────────────── function inferVendorFromScriptPath(): Vendor | null { - const path = import.meta.path - if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor" - if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen" - if (path.includes(`${join(".claude", "hooks")}`)) return "claude" - if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini" - if (path.includes(`${join(".codex", "hooks")}`)) return "codex" - return null + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; } function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - const byScriptPath = inferVendorFromScriptPath() - if (byScriptPath) return byScriptPath - if (event === "BeforeAgent") return "gemini" - if (event === "beforeSubmitPrompt") return "cursor" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeAgent") return "gemini"; + if (event === "beforeSubmitPrompt") return "cursor"; if (event === "UserPromptSubmit") { - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── Config Loading ──────────────────────────────────────────── interface 
SkillsTriggerConfig { - skills?: Record<string, { keywords: Record<string, string[]> }> - cjkScripts?: string[] + skills?: Record<string, { keywords: Record<string, string[]> }>; + cjkScripts?: string[]; } function loadTriggersConfig(): SkillsTriggerConfig { - const configPath = join(dirname(import.meta.path), "triggers.json") - if (!existsSync(configPath)) return {} + const configPath = join(dirname(import.meta.path), "triggers.json"); + if (!existsSync(configPath)) return {}; try { - return JSON.parse(readFileSync(configPath, "utf-8")) + return JSON.parse(readFileSync(configPath, "utf-8")); } catch { - return {} + return {}; } } function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? "en"; } catch { - return "en" + return "en"; } } // ── Pattern Building ────────────────────────────────────────── export function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } -export function buildTriggerPatterns(triggers: string[], lang: string, cjkScripts: string[]): RegExp[] { +export function buildTriggerPatterns( + triggers: string[], + lang: string, + cjkScripts: string[], +): RegExp[] { return triggers.map((kw) => { - const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+") - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) { - return new RegExp(escaped, "i") + const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+"); + if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) { + return new RegExp(escaped, "i"); } - return new RegExp(`\\b${escaped}\\b`, "i") - }) + return new RegExp(`\\b${escaped}\\b`, "i"); + }); } // ── Skill Discovery ─────────────────────────────────────────── export interface SkillEntry { - name: string - absolutePath: string - relPath: string + name: string; + absolutePath: string; + relPath: string; } export function discoverSkills(projectDir: string): SkillEntry[] { - const skillsDir = join(projectDir, ".agents", "skills") - if (!existsSync(skillsDir)) return [] + const skillsDir = join(projectDir, ".agents", "skills"); + if (!existsSync(skillsDir)) return []; - const out: SkillEntry[] = [] - let entries: ReturnType<typeof readdirSync> + const out: SkillEntry[] = []; + let entries: ReturnType<typeof readdirSync>; try { - entries = readdirSync(skillsDir, { withFileTypes: true }) + entries = readdirSync(skillsDir, { withFileTypes: true }); } catch { - return out + return out; } for (const entry of entries) { - if (!entry.isDirectory()) continue - if (entry.name.startsWith("_")) continue + if (!entry.isDirectory()) continue; + if (entry.name.startsWith("_")) continue; - const skillPath = join(skillsDir, entry.name, "SKILL.md") - if (!existsSync(skillPath)) continue + const skillPath = join(skillsDir, entry.name, "SKILL.md"); + if (!existsSync(skillPath)) continue; out.push({ name: entry.name, absolutePath: skillPath, relPath: join(".agents", "skills", entry.name, "SKILL.md"), - }) + }); } - return out + 
return out; } // ── Matching ────────────────────────────────────────────────── export interface SkillMatch { - name: string - relPath: string - score: number - matchedTriggers: string[] + name: string; + relPath: string; + score: number; + matchedTriggers: string[]; } export function matchSkills( @@ -166,37 +177,37 @@ export function matchSkills( skills: SkillEntry[], config: SkillsTriggerConfig, ): SkillMatch[] { - const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS - const matches: SkillMatch[] = [] + const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS; + const matches: SkillMatch[] = []; for (const skill of skills) { - const jsonEntry = config.skills?.[skill.name] - if (!jsonEntry) continue + const jsonEntry = config.skills?.[skill.name]; + if (!jsonEntry) continue; const jsonTriggers = [ ...(jsonEntry.keywords["*"] ?? []), ...(jsonEntry.keywords.en ?? []), ...(lang !== "en" ? (jsonEntry.keywords[lang] ?? []) : []), - ] + ]; - const seen = new Set<string>() - const allTriggers: string[] = [] + const seen = new Set<string>(); + const allTriggers: string[] = []; for (const t of jsonTriggers) { - const key = t.toLowerCase() - if (seen.has(key)) continue - seen.add(key) - allTriggers.push(t) + const key = t.toLowerCase(); + if (seen.has(key)) continue; + seen.add(key); + allTriggers.push(t); } - if (allTriggers.length === 0) continue + if (allTriggers.length === 0) continue; - const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts) - const matched: string[] = [] - let score = 0 + const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts); + const matched: string[] = []; + let score = 0; for (let i = 0; i < patterns.length; i++) { if (patterns[i].test(prompt)) { - matched.push(allTriggers[i]) - score += 10 + matched.push(allTriggers[i]); + score += 10; } } @@ -206,43 +217,45 @@ export function matchSkills( relPath: skill.relPath, score, matchedTriggers: matched, - }) + }); } } - matches.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.name.localeCompare(b.name))) - return matches.slice(0, MAX_SKILLS) + matches.sort((a, b) => + b.score !== a.score ? 
b.score - a.score : a.name.localeCompare(b.name), + ); + return matches.slice(0, MAX_SKILLS); } // ── Session Dedup State ─────────────────────────────────────── interface SessionState { - sessions: Record<string, { injected: string[]; timestamp: number }> + sessions: Record<string, { injected: string[]; timestamp: number }>; } function getStatePath(projectDir: string): string { - return join(projectDir, ".agents", "state", "skill-sessions.json") + return join(projectDir, ".agents", "state", "skill-sessions.json"); } function readState(projectDir: string): SessionState { - const p = getStatePath(projectDir) - if (!existsSync(p)) return { sessions: {} } + const p = getStatePath(projectDir); + if (!existsSync(p)) return { sessions: {} }; try { - const parsed = JSON.parse(readFileSync(p, "utf-8")) + const parsed = JSON.parse(readFileSync(p, "utf-8")); if (parsed && typeof parsed === "object" && parsed.sessions) { - return parsed as SessionState + return parsed as SessionState; } } catch { // corrupted — reset } - return { sessions: {} } + return { sessions: {} }; } function writeState(projectDir: string, state: SessionState): void { - const p = getStatePath(projectDir) + const p = getStatePath(projectDir); try { - mkdirSync(dirname(p), { recursive: true }) - writeFileSync(p, JSON.stringify(state, null, 2)) + mkdirSync(dirname(p), { recursive: true }); + writeFileSync(p, JSON.stringify(state, null, 2)); } catch { // dedup failing open is acceptable } @@ -254,47 +267,57 @@ export function filterFreshMatches( sessionId: string, now: number = Date.now(), ): { fresh: SkillMatch[]; nextState: SessionState } { - const state = readState(projectDir) + const state = readState(projectDir); for (const [id, sess] of Object.entries(state.sessions)) { if (now - sess.timestamp > SESSION_TTL_MS) { - delete state.sessions[id] + delete state.sessions[id]; } } - const current = state.sessions[sessionId] - const alreadyInjected = new Set(current && now - current.timestamp <= SESSION_TTL_MS ? current.injected : []) + const current = state.sessions[sessionId]; + const alreadyInjected = new Set( + current && now - current.timestamp <= SESSION_TTL_MS + ? current.injected + : [], + ); - const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)) + const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)); if (fresh.length > 0) { - const existing = state.sessions[sessionId]?.injected ?? [] + const existing = state.sessions[sessionId]?.injected ?? 
[]; state.sessions[sessionId] = { injected: [...new Set([...existing, ...fresh.map((m) => m.relPath)])], timestamp: now, - } + }; } - return { fresh, nextState: state } + return { fresh, nextState: state }; } // ── Workflow Guard ──────────────────────────────────────────── -export function isPersistentWorkflowActive(projectDir: string, sessionId: string): boolean { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return false +export function isPersistentWorkflowActive( + projectDir: string, + sessionId: string, +): boolean { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return false; try { - const files = readdirSync(stateDir) - return files.some((f) => f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json") + const files = readdirSync(stateDir); + return files.some( + (f) => + f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json", + ); } catch { - return false + return false; } } // ── Prompt Sanitation ───────────────────────────────────────── export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } export function stripCodeBlocks(text: string): string { @@ -303,7 +326,7 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") .replace(/`{3,}[^`]*`{3,}/g, "") .replace(/`[^`\n]+`/g, "") - .replace(/"[^"\n]*"/g, "") + .replace(/"[^"\n]*"/g, ""); } // ── Context Formatting ──────────────────────────────────────── @@ -313,55 +336,61 @@ export function formatContext(matches: SkillMatch[]): string { `[OMA SKILLS DETECTED: ${matches.map((m) => m.name).join(", ")}]`, "User intent matches the following skills:", "", - ] + ]; for (const m of matches) { - lines.push(`- **${m.name}** — \`${m.relPath}\``) - lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`) + lines.push(`- **${m.name}** — \`${m.relPath}\``); + lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`); } - lines.push("") - lines.push("Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.") - return lines.join("\n") + lines.push(""); + lines.push( + "Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.", + ); + return lines.join("\n"); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? 
"" - - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) - if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0) - - const lang = detectLanguage(projectDir) - const config = loadTriggersConfig() - const cleaned = stripCodeBlocks(prompt) - const skills = discoverSkills(projectDir) - - const matches = matchSkills(cleaned, lang, skills, config) - if (matches.length === 0) process.exit(0) - - const { fresh, nextState } = filterFreshMatches(matches, projectDir, sessionId) - if (fresh.length === 0) process.exit(0) - - writeState(projectDir, nextState) - process.stdout.write(makePromptOutput(vendor, formatContext(fresh))) - process.exit(0) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? ""; + + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); + if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0); + + const lang = detectLanguage(projectDir); + const config = loadTriggersConfig(); + const cleaned = stripCodeBlocks(prompt); + const skills = discoverSkills(projectDir); + + const matches = matchSkills(cleaned, lang, skills, config); + if (matches.length === 0) process.exit(0); + + const { fresh, nextState } = filterFreshMatches( + matches, + projectDir, + sessionId, + ); + if (fresh.length === 0) process.exit(0); + + writeState(projectDir, nextState); + process.stdout.write(makePromptOutput(vendor, formatContext(fresh))); + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } // Avoid unused-import lint for basename when testing subsets of this module. 
-void basename +void basename; diff --git a/.codex/hooks/test-filter.ts b/.codex/hooks/test-filter.ts index a0ce2fc..a3ad992 100644 --- a/.codex/hooks/test-filter.ts +++ b/.codex/hooks/test-filter.ts @@ -1,51 +1,61 @@ // PreToolUse hook — Filter test output to show only failures // Works with: Claude Code, Codex CLI, Gemini CLI, Qwen Code -import { existsSync } from "node:fs" -import { join } from "node:path" -import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts"; // --- Vendor detection (same logic as keyword-detector.ts) --- +function inferVendorFromScriptPath(): Vendor | null { + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; +} + function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "BeforeTool") return "gemini" - if (event === "PreToolUse") { - if ("session_id" in input && !("sessionId" in input)) return "codex" - } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeTool") return "gemini"; + if (event === "PreToolUse" && "session_id" in input) return "codex"; + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getHookDir(vendor: Vendor): string { switch (vendor) { case "codex": - return ".codex/hooks" + return ".codex/hooks"; case "gemini": - return ".gemini/hooks" + return ".gemini/hooks"; case "qwen": - return ".qwen/hooks" + return ".qwen/hooks"; default: - return ".claude/hooks" + return ".claude/hooks"; } } @@ -78,66 +88,70 @@ const TEST_PATTERNS = [ /\brspec\b/, /\bmix\s+test\b/, /\bphpunit\b/, -] +]; // Commands that mention test runners but aren't running tests const EXCLUDE_PATTERNS = [ /\b(install|add|remove|uninstall|init)\b/, /\b(cat|head|tail|less|more|wc)\b.*\.(test|spec)\./, -] +]; // --- Hook input --- interface PreToolUseInput { - tool_name: string + tool_name: string; tool_input: { - command?: string - [key: string]: unknown - } - hook_event_name?: string - session_id?: string - sessionId?: string - cwd?: string + command?: string; + [key: string]: unknown; + }; + hook_event_name?: string; + session_id?: string; + sessionId?: 
string; + cwd?: string; } // --- Main --- -const raw = await Bun.stdin.text() -if (!raw.trim()) process.exit(0) +const raw = await Bun.stdin.text(); +if (!raw.trim()) process.exit(0); -const input: PreToolUseInput = JSON.parse(raw) +const input: PreToolUseInput = JSON.parse(raw); // Gemini uses run_shell_command; Claude-family uses Bash. if (input.tool_name !== "Bash" && input.tool_name !== "run_shell_command") { - process.exit(0) + process.exit(0); } -const command = input.tool_input?.command -if (!command) process.exit(0) +const command = input.tool_input?.command; +if (!command) process.exit(0); // Check if this is a test command -const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)) -if (!isTestCommand) process.exit(0) +const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)); +if (!isTestCommand) process.exit(0); // Skip if it's a non-test use of test tool names (install, cat, etc.) -const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)) -if (isExcluded) process.exit(0) +const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)); +if (isExcluded) process.exit(0); // Detect vendor and resolve project dir -const vendor = detectVendor(input) -const projectDir = getProjectDir(vendor, input) -const filterScript = join(projectDir, getHookDir(vendor), "filter-test-output.sh") +const vendor = detectVendor(input); +const projectDir = getProjectDir(vendor, input); +const filterScript = join( + projectDir, + getHookDir(vendor), + "filter-test-output.sh", +); // Skip filtering if the script doesn't exist (hooks not fully installed) -if (!existsSync(filterScript)) process.exit(0) +if (!existsSync(filterScript)) process.exit(0); // Rewrite command to pipe through filter -const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"` +const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"`; // Return updated input with all original fields preserved const updatedInput: Record<string, unknown> = { ...input.tool_input, command: filteredCmd, -} +}; -console.log(makePreToolOutput(vendor, updatedInput)) +console.log(makePreToolOutput(vendor, updatedInput)); diff --git a/.codex/hooks/triggers.json b/.codex/hooks/triggers.json index f404583..0a1513f 100644 --- a/.codex/hooks/triggers.json +++ b/.codex/hooks/triggers.json @@ -43,9 +43,35 @@ "全部お願い", "まとめてやって" ], - "zh": ["编排", "并行执行", "自动执行", "全部执行", "全部做", "自动处理", "一起做", "全做了", "帮我全做"], - "es": ["orquestar", "paralelo", "ejecutar todo", "hazlo todo", "ejecuta todo", "automatiza", "haz todo"], - "fr": ["orchestrer", "parallèle", "tout exécuter", "fais tout", "exécute tout", "automatise", "gère tout"], + "zh": [ + "编排", + "并行执行", + "自动执行", + "全部执行", + "全部做", + "自动处理", + "一起做", + "全做了", + "帮我全做" + ], + "es": [ + "orquestar", + "paralelo", + "ejecutar todo", + "hazlo todo", + "ejecuta todo", + "automatiza", + "haz todo" + ], + "fr": [ + "orchestrer", + "parallèle", + "tout exécuter", + "fais tout", + "exécute tout", + "automatise", + "gère tout" + ], "de": [ "orchestrieren", "parallel", @@ -55,7 +81,15 @@ "automatisieren", "alles auf einmal" ], - "pt": ["orquestrar", "paralelo", "executar tudo", "faça tudo", "execute tudo", "automatize", "resolva tudo"], + "pt": [ + "orquestrar", + "paralelo", + "executar tudo", + "faça tudo", + "execute tudo", + "automatize", + "resolva tudo" + ], "ru": [ "оркестровать", "параллельно", @@ -128,7 +162,16 @@ "トレードオフ", "品質特性" ], - "zh": ["架构", "系统设计", "软件设计", "架构评审", "模块边界", "服务边界", "权衡分析", "质量属性"], + "zh": [ + "架构", + "系统设计", + "软件设计", + "架构评审", + 
"模块边界", + "服务边界", + "权衡分析", + "质量属性" + ], "es": [ "arquitectura", "diseño de sistemas", @@ -205,7 +248,15 @@ "persistent": false, "keywords": { "*": ["task breakdown"], - "en": ["plan", "make a plan", "create a plan", "break down", "analyze requirements", "plan this", "decompose"], + "en": [ + "plan", + "make a plan", + "create a plan", + "break down", + "analyze requirements", + "plan this", + "decompose" + ], "ko": [ "계획", "요구사항 분석", @@ -235,7 +286,16 @@ "設計して", "プランを作って" ], - "zh": ["计划", "需求分析", "任务分解", "制定计划", "做个计划", "分析一下", "拆分任务", "规划一下"], + "zh": [ + "计划", + "需求分析", + "任务分解", + "制定计划", + "做个计划", + "分析一下", + "拆分任务", + "规划一下" + ], "es": [ "plan", "planificar", @@ -286,7 +346,15 @@ "разбей на задачи", "спланируй" ], - "nl": ["plan", "plannen", "vereistenanalyse", "maak een plan", "analyseer", "splits op", "plan dit"], + "nl": [ + "plan", + "plannen", + "vereistenanalyse", + "maak een plan", + "analyseer", + "splits op", + "plan dit" + ], "pl": [ "plan", "planować", @@ -303,7 +371,15 @@ "persistent": false, "keywords": { "*": ["code review", "security audit", "security review"], - "en": ["review", "review this", "review my code", "check my code", "audit", "inspect", "code check"], + "en": [ + "review", + "review this", + "review my code", + "check my code", + "audit", + "inspect", + "code check" + ], "ko": [ "리뷰", "코드 검토", @@ -330,7 +406,17 @@ "点検して", "コード確認" ], - "zh": ["审查", "代码审查", "安全审计", "审查一下", "检查一下", "看看代码", "检查代码", "代码检查", "安全检查"], + "zh": [ + "审查", + "代码审查", + "安全审计", + "审查一下", + "检查一下", + "看看代码", + "检查代码", + "代码检查", + "安全检查" + ], "es": [ "revisión", "revisar código", @@ -613,7 +699,17 @@ "アイデアちょうだい", "一緒に考えよう" ], - "zh": ["头脑风暴", "创意", "设计探索", "想想", "出主意", "有什么想法", "想个办法", "出点子", "集思广益"], + "zh": [ + "头脑风暴", + "创意", + "设计探索", + "想想", + "出主意", + "有什么想法", + "想个办法", + "出点子", + "集思广益" + ], "es": [ "lluvia de ideas", "idear", @@ -644,7 +740,16 @@ "vorschläge", "lass uns überlegen" ], - "pt": ["brainstorming", "idear", "explorar design", "pense em", "e se", "ideias para", "sugira", "imagine"], + "pt": [ + "brainstorming", + "idear", + "explorar design", + "pense em", + "e se", + "ideias para", + "sugira", + "imagine" + ], "ru": [ "мозговой штурм", "идеи", @@ -681,7 +786,13 @@ "persistent": true, "keywords": { "*": ["work", "step by step"], - "en": ["one by one", "guide me", "walk me through", "manual mode", "one step at a time"], + "en": [ + "one by one", + "guide me", + "walk me through", + "manual mode", + "one step at a time" + ], "ko": [ "단계별", "단계별로", @@ -693,9 +804,32 @@ "차근차근 해줘", "수동으로 해줘" ], - "ja": ["ステップバイステップ", "一歩ずつ", "ガイドして", "手動で", "一つずつ", "順番にやって", "手順を教えて"], - "zh": ["逐步", "一步一步", "指导我", "手动", "一个一个", "按顺序", "带我做"], - "es": ["paso a paso", "guíame", "uno por uno", "modo manual", "de a uno", "llévame paso a paso"], + "ja": [ + "ステップバイステップ", + "一歩ずつ", + "ガイドして", + "手動で", + "一つずつ", + "順番にやって", + "手順を教えて" + ], + "zh": [ + "逐步", + "一步一步", + "指导我", + "手动", + "一个一个", + "按顺序", + "带我做" + ], + "es": [ + "paso a paso", + "guíame", + "uno por uno", + "modo manual", + "de a uno", + "llévame paso a paso" + ], "fr": [ "étape par étape", "guide-moi", @@ -712,8 +846,22 @@ "zeig mir wie", "der reihe nach" ], - "pt": ["passo a passo", "me guie", "um por um", "modo manual", "me acompanhe", "me mostre passo a passo"], - "ru": ["шаг за шагом", "направь меня", "по одному", "ручной режим", "покажи по шагам", "веди меня"], + "pt": [ + "passo a passo", + "me guie", + "um por um", + "modo manual", + "me acompanhe", + "me mostre passo a passo" + ], + "ru": [ + "шаг за шагом", + "направь 
меня", + "по одному", + "ручной режим", + "покажи по шагам", + "веди меня" + ], "nl": [ "stap voor stap", "begeleid me", @@ -736,7 +884,14 @@ "persistent": false, "keywords": { "*": ["deepinit"], - "en": ["init project", "initialize", "setup project", "new project", "scaffold", "bootstrap"], + "en": [ + "init project", + "initialize", + "setup project", + "new project", + "scaffold", + "bootstrap" + ], "ko": [ "프로젝트 초기화", "코드베이스 초기화", @@ -757,7 +912,15 @@ "プロジェクトを作って", "プロジェクト設定" ], - "zh": ["项目初始化", "新项目", "设置项目", "搭建项目", "初始化", "创建项目", "项目配置"], + "zh": [ + "项目初始化", + "新项目", + "设置项目", + "搭建项目", + "初始化", + "创建项目", + "项目配置" + ], "es": [ "inicializar proyecto", "nuevo proyecto", @@ -1381,8 +1544,20 @@ "define boundaries", "architecture tradeoffs" ], - "ko": ["아키텍처 짜줘", "시스템 구조 설계", "경계 정의해줘", "구조 검토해줘", "아키텍처 문서"], - "ja": ["アーキテクチャを設計", "システム構成を考えて", "境界を定義", "構成レビュー", "アーキ文書"], + "ko": [ + "아키텍처 짜줘", + "시스템 구조 설계", + "경계 정의해줘", + "구조 검토해줘", + "아키텍처 문서" + ], + "ja": [ + "アーキテクチャを設計", + "システム構成を考えて", + "境界を定義", + "構成レビュー", + "アーキ文書" + ], "zh": ["设计架构", "系统架构方案", "定义边界", "架构文档", "架构权衡"] } }, @@ -1403,17 +1578,45 @@ "server implementation", "clean architecture" ], - "ko": ["api 만들어줘", "엔드포인트 추가", "백엔드 구현", "마이그레이션 작성", "인증 붙여줘"], - "ja": ["apiを作って", "エンドポイント追加", "バックエンド実装", "マイグレーション書いて", "認証を実装"], + "ko": [ + "api 만들어줘", + "엔드포인트 추가", + "백엔드 구현", + "마이그레이션 작성", + "인증 붙여줘" + ], + "ja": [ + "apiを作って", + "エンドポイント追加", + "バックエンド実装", + "マイグレーション書いて", + "認証を実装" + ], "zh": ["写个接口", "加接口", "后端实现", "写迁移", "加认证"] } }, "oma-brainstorm": { "keywords": { "*": [], - "en": ["toss around ideas", "kick around options", "spitball", "some ideas please", "ideation session"], - "ko": ["아이디어 좀 뽑아줘", "같이 고민해줘", "아이디어 내보자", "방향성 고민"], - "ja": ["アイデア出して", "一緒に考えて", "方向性を探りたい", "案を出して"], + "en": [ + "toss around ideas", + "kick around options", + "spitball", + "some ideas please", + "ideation session" + ], + "ko": [ + "아이디어 좀 뽑아줘", + "같이 고민해줘", + "아이디어 내보자", + "방향성 고민" + ], + "ja": [ + "アイデア出して", + "一緒に考えて", + "方向性を探りたい", + "案を出して" + ], "zh": ["帮我想想", "一起想想办法", "给点灵感"] } }, @@ -1430,8 +1633,18 @@ "cli handoff", "manual orchestration" ], - "ko": ["에이전트 조율", "에이전트끼리 협업", "수동으로 에이전트 돌려", "에이전트 순서 잡아줘"], - "ja": ["エージェントを調整", "エージェント連携", "手動でエージェント", "エージェントの順序"], + "ko": [ + "에이전트 조율", + "에이전트끼리 협업", + "수동으로 에이전트 돌려", + "에이전트 순서 잡아줘" + ], + "ja": [ + "エージェントを調整", + "エージェント連携", + "手動でエージェント", + "エージェントの順序" + ], "zh": ["协调代理", "代理之间协作", "手动跑代理", "代理之间衔接"] } }, @@ -1454,8 +1667,20 @@ "data migration", "capacity planning" ], - "ko": ["스키마 설계", "테이블 설계", "인덱스 튜닝", "쿼리 느려", "용량 산정"], - "ja": ["スキーマ設計", "テーブル設計", "インデックス調整", "クエリが遅い", "容量見積"], + "ko": [ + "스키마 설계", + "테이블 설계", + "인덱스 튜닝", + "쿼리 느려", + "용량 산정" + ], + "ja": [ + "スキーマ設計", + "テーブル設計", + "インデックス調整", + "クエリが遅い", + "容量見積" + ], "zh": ["设计表结构", "表设计", "索引优化", "查询很慢", "容量评估"] } }, @@ -1474,8 +1699,20 @@ "crash fix", "error investigation" ], - "ko": ["버그 찾아줘", "에러 원인", "크래시 분석", "스택트레이스 봐줘", "원인 파악해줘"], - "ja": ["バグを探して", "エラー原因", "クラッシュを分析", "スタックトレースを見て", "原因を特定"], + "ko": [ + "버그 찾아줘", + "에러 원인", + "크래시 분석", + "스택트레이스 봐줘", + "원인 파악해줘" + ], + "ja": [ + "バグを探して", + "エラー原因", + "クラッシュを分析", + "スタックトレースを見て", + "原因を特定" + ], "zh": ["找出 bug", "错误原因", "分析崩溃", "看堆栈", "定位原因"] } }, @@ -1493,8 +1730,19 @@ "responsive layout", "motion design" ], - "ko": ["디자인 토큰", "랜딩 만들어줘", "컬러 팔레트 잡아줘", "타이포 스케일", "모션 가이드"], - "ja": ["デザイントークン", "ランディング作成", "カラーパレット決めて", "モーション設計"], + "ko": [ + "디자인 토큰", + "랜딩 만들어줘", + "컬러 팔레트 잡아줘", + "타이포 스케일", + "모션 가이드" + ], + "ja": [ + "デザイントークン", + "ランディング作成", 
+ "カラーパレット決めて", + "モーション設計" + ], "zh": ["设计令牌", "做个落地页", "定配色", "字体层级", "动效规范"] } }, @@ -1514,9 +1762,27 @@ "release automation", "build automation" ], - "ko": ["mise 태스크", "ci 파이프라인", "릴리즈 자동화", "깃 훅 설정", "모노레포 워크플로우"], - "ja": ["miseタスク", "ciパイプライン", "リリース自動化", "gitフック", "モノレポ作業"], - "zh": ["mise 任务", "ci 流水线", "发布自动化", "git 钩子", "monorepo 工作流"] + "ko": [ + "mise 태스크", + "ci 파이프라인", + "릴리즈 자동화", + "깃 훅 설정", + "모노레포 워크플로우" + ], + "ja": [ + "miseタスク", + "ciパイプライン", + "リリース自動化", + "gitフック", + "モノレポ作業" + ], + "zh": [ + "mise 任务", + "ci 流水线", + "发布自动化", + "git 钩子", + "monorepo 工作流" + ] } }, "oma-frontend": { @@ -1534,9 +1800,27 @@ "frontend ui", "FSD architecture" ], - "ko": ["리액트 컴포넌트", "넥스트 페이지", "tailwind로 스타일", "shadcn 붙여줘", "프론트 구현"], - "ja": ["reactコンポーネント", "nextページ", "tailwindで装飾", "shadcn導入", "フロント実装"], - "zh": ["写个 react 组件", "next 页面", "用 tailwind", "接入 shadcn", "前端实现"] + "ko": [ + "리액트 컴포넌트", + "넥스트 페이지", + "tailwind로 스타일", + "shadcn 붙여줘", + "프론트 구현" + ], + "ja": [ + "reactコンポーネント", + "nextページ", + "tailwindで装飾", + "shadcn導入", + "フロント実装" + ], + "zh": [ + "写个 react 组件", + "next 页面", + "用 tailwind", + "接入 shadcn", + "前端实现" + ] } }, "oma-hwp": { @@ -1551,7 +1835,16 @@ "hangul word processor", "hwp ingestion" ], - "ko": ["한글 파일", "한글 변환", "한글 파싱", "hwp 변환", "hwp 파싱", "hwp 마크다운", "hwpx 변환", "hwpx 파싱"], + "ko": [ + "한글 파일", + "한글 변환", + "한글 파싱", + "hwp 변환", + "hwp 파싱", + "hwp 마크다운", + "hwpx 변환", + "hwpx 파싱" + ], "ja": ["hwp変換", "hwpをマークダウン", "hwpを解析", "韓国語ワープロ"], "zh": ["hwp 转换", "hwp 解析", "hwp 转 markdown", "韩文文档"] } @@ -1571,9 +1864,233 @@ "mobile app", "android ios" ], - "ko": ["플러터 화면", "리액트 네이티브 화면", "다트 위젯", "안드로이드 아이폰 앱", "모바일 앱"], - "ja": ["flutter画面", "react native画面", "dartウィジェット", "iosアンドロイド", "モバイルアプリ"], - "zh": ["flutter 页面", "react native 页面", "dart 组件", "安卓 ios", "移动端应用"] + "ko": [ + "플러터 화면", + "리액트 네이티브 화면", + "다트 위젯", + "안드로이드 아이폰 앱", + "모바일 앱" + ], + "ja": [ + "flutter画面", + "react native画面", + "dartウィジェット", + "iosアンドロイド", + "モバイルアプリ" + ], + "zh": [ + "flutter 页面", + "react native 页面", + "dart 组件", + "安卓 ios", + "移动端应用" + ] + } + }, + "oma-observability": { + "keywords": { + "*": [ + "OpenTelemetry", + "OTel", + "OTLP", + "W3C Trace Context", + "traceparent", + "MELT", + "APM", + "RUM", + "SLO", + "SLI", + "burn-rate", + "PromQL", + "Prometheus", + "Grafana", + "Jaeger", + "Tempo", + "Loki", + "Mimir", + "Fluent Bit", + "OpenCost", + "OpenFeature", + "Flagger", + "Falco", + "Parca", + "Pyroscope", + "Honeycomb", + "Datadog", + "Sentry", + "Crashlytics", + "Core Web Vitals" + ], + "en": [ + "observability", + "traceability", + "telemetry", + "distributed tracing", + "instrument my service", + "set up OTel", + "OTel pipeline", + "collector topology", + "tail sampling", + "cardinality budget", + "clock skew", + "error budget", + "burn rate alert", + "canary analysis", + "progressive delivery", + "feature flag observability", + "incident forensics", + "6-dimension localization", + "root cause across services", + "multi-tenant telemetry", + "per-tenant sampling", + "data residency telemetry", + "redact PII in logs", + "observability as code", + "dashboard as code", + "PrometheusRule CRD", + "Grafana Jsonnet", + "Perses dashboard", + "UDP MTU telemetry", + "StatsD fragmentation", + "OTLP gRPC vs HTTP", + "propagator matrix", + "BGP observability", + "QUIC observability", + "eBPF observability", + "service mesh tracing", + "zero code instrumentation", + "mobile crash analytics", + "crash-free rate", + "symbolication pipeline", + "offline telemetry queue" + ], + "ko": [ + 
"관측성", + "관측 가능성", + "추적성", + "추적 가능성", + "텔레메트리", + "텔레메트리 수집", + "분산 트레이싱", + "OTel 도입", + "OTel 셋업", + "OTel 계측", + "OTel 파이프라인", + "컬렉터 토폴로지", + "테일 샘플링", + "카디널리티", + "카디널리티 관리", + "클록 스큐", + "시계 드리프트", + "에러 버짓", + "에러 예산", + "번레이트 알람", + "번레이트", + "카나리 분석", + "프로그레시브 딜리버리", + "점진 배포", + "피처 플래그 관측", + "사건 부검", + "장애 부검", + "장애 원인 분석", + "6차원 좁히기", + "멀티테넌트 관측", + "테넌트별 샘플링", + "데이터 거주 관측", + "로그 PII 제거", + "로그 익명화", + "로그 가명화", + "관측성 as code", + "대시보드 as code", + "대시보드 코드화", + "PrometheusRule", + "Grafana Jsonnet", + "Perses 대시보드", + "UDP MTU 튜닝", + "StatsD 단편화", + "OTLP gRPC 선택", + "전파자 매핑", + "BGP 관측", + "QUIC 관측", + "eBPF 관측", + "서비스 메시 트레이싱", + "zero-code 계측", + "모바일 크래시 분석", + "크래시 프리 레이트", + "심볼리케이션", + "오프라인 텔레메트리 큐" + ], + "ja": [ + "オブザーバビリティ", + "トレーサビリティ", + "テレメトリ", + "分散トレーシング", + "OTel導入", + "OTelパイプライン", + "コレクタ構成", + "テイルサンプリング", + "カーディナリティ予算", + "クロックスキュー", + "エラーバジェット", + "バーンレートアラート", + "カナリア分析", + "プログレッシブデリバリ", + "機能フラグ観測", + "インシデントフォレンジック", + "マルチテナント観測", + "データ居住性観測", + "ログPII除去", + "Observability as Code", + "Dashboard as Code", + "UDP MTUチューニング", + "StatsDフラグメンテーション", + "OTLP選択", + "プロパゲータマッピング", + "BGP観測", + "QUIC観測", + "eBPF観測", + "サービスメッシュトレース", + "モバイルクラッシュ分析", + "クラッシュフリーレート", + "シンボリケーション", + "オフラインテレメトリ" + ], + "zh": [ + "可观测性", + "可追溯性", + "遥测", + "分布式追踪", + "OTel 接入", + "OTel 流水线", + "采集器拓扑", + "尾采样", + "基数预算", + "时钟漂移", + "错误预算", + "燃烧率告警", + "金丝雀分析", + "渐进式发布", + "特性开关观测", + "事件取证", + "多租户观测", + "数据驻留观测", + "日志脱敏", + "可观测性即代码", + "仪表盘即代码", + "UDP MTU 调优", + "StatsD 分片", + "OTLP 选择", + "传播器映射", + "BGP 观测", + "QUIC 观测", + "eBPF 观测", + "服务网格追踪", + "零代码探针", + "移动崩溃分析", + "崩溃无事率", + "符号化", + "离线遥测队列" + ] } }, "oma-orchestrator": { @@ -1590,8 +2107,18 @@ "review loop", "mcp memory coordination" ], - "ko": ["에이전트 병렬 실행", "동시에 에이전트 돌려", "fan-out", "리뷰 루프 돌려"], - "ja": ["エージェント並列実行", "同時にエージェント", "fan-out", "レビューループ"], + "ko": [ + "에이전트 병렬 실행", + "동시에 에이전트 돌려", + "fan-out", + "리뷰 루프 돌려" + ], + "ja": [ + "エージェント並列実行", + "同時にエージェント", + "fan-out", + "レビューループ" + ], "zh": ["并行跑代理", "同时派发代理", "fan-out 任务", "评审循环"] } }, @@ -1628,8 +2155,20 @@ "scope definition", "prioritization matrix" ], - "ko": ["요구사항 정리", "스펙 문서", "우선순위 매겨줘", "스코프 정의", "제품 로드맵"], - "ja": ["要件を整理", "スペック作成", "優先度付け", "スコープ定義", "プロダクトロードマップ"], + "ko": [ + "요구사항 정리", + "스펙 문서", + "우선순위 매겨줘", + "스코프 정의", + "제품 로드맵" + ], + "ja": [ + "要件を整理", + "スペック作成", + "優先度付け", + "スコープ定義", + "プロダクトロードマップ" + ], "zh": ["梳理需求", "写规格书", "排优先级", "界定范围", "产品路线图"] } }, @@ -1647,7 +2186,12 @@ "test coverage" ], "ko": ["접근성 점검", "성능 점검", "커버리지 확인", "품질 게이트"], - "ja": ["アクセシビリティ確認", "パフォーマンス点検", "カバレッジ確認", "品質ゲート"], + "ja": [ + "アクセシビリティ確認", + "パフォーマンス点検", + "カバレッジ確認", + "品質ゲート" + ], "zh": ["无障碍检查", "性能检查", "覆盖率报告", "质量门禁"] } }, @@ -1666,8 +2210,20 @@ "transcript analysis", "multi tool recap" ], - "ko": ["오늘 한 일 정리", "하루 요약", "주간 요약", "작업 내용 정리", "대화 요약"], - "ja": ["今日の作業まとめ", "日次サマリ", "週次サマリ", "作業振り返り", "会話まとめ"], + "ko": [ + "오늘 한 일 정리", + "하루 요약", + "주간 요약", + "작업 내용 정리", + "대화 요약" + ], + "ja": [ + "今日の作業まとめ", + "日次サマリ", + "週次サマリ", + "作業振り返り", + "会話まとめ" + ], "zh": ["今天做了什么", "日报总结", "周报总结", "工作回顾", "对话总结"] } }, @@ -1685,7 +2241,12 @@ "git worktree" ], "ko": ["머지 충돌 해결", "리베이스해줘", "워크트리 써줘"], - "ja": ["マージ衝突解決", "リベースして", "リリースタグ", "worktree使って"], + "ja": [ + "マージ衝突解決", + "リベースして", + "リリースタグ", + "worktree使って" + ], "zh": ["解决合并冲突", "帮我 rebase", "打发布标签", "用 worktree"] } }, @@ -1705,8 +2266,20 @@ "library reference", "context7 docs" ], - "ko": ["검색해줘", "찾아줘", "레퍼런스 찾아", "문서 찾아줘", "라이브러리 찾아줘"], - "ja": ["検索して", "調べて", 
"ドキュメント探して", "ライブラリ調べて", "リファレンス探して"], + "ko": [ + "검색해줘", + "찾아줘", + "레퍼런스 찾아", + "문서 찾아줘", + "라이브러리 찾아줘" + ], + "ja": [ + "検索して", + "調べて", + "ドキュメント探して", + "ライブラリ調べて", + "リファレンス探して" + ], "zh": ["帮我查", "搜一下", "找找文档", "找个库", "查参考资料"] } }, @@ -1725,9 +2298,27 @@ "oidc setup", "cost optimization" ], - "ko": ["테라폼 플랜", "인프라 프로비저닝", "iac 모듈", "클라우드 리소스", "비용 최적화"], - "ja": ["terraformプラン", "インフラ構築", "iacモジュール", "クラウドリソース", "コスト最適化"], - "zh": ["terraform plan", "搭建基础设施", "iac 模块", "云资源", "成本优化"] + "ko": [ + "테라폼 플랜", + "인프라 프로비저닝", + "iac 모듈", + "클라우드 리소스", + "비용 최적화" + ], + "ja": [ + "terraformプラン", + "インフラ構築", + "iacモジュール", + "クラウドリソース", + "コスト最適化" + ], + "zh": [ + "terraform plan", + "搭建基础设施", + "iac 模块", + "云资源", + "成本优化" + ] } }, "oma-translator": { @@ -1744,10 +2335,104 @@ "multilingual content", "arb translation" ], - "ko": ["번역해줘", "번역 부탁", "다국어로", "영어로 바꿔줘", "현지화해줘"], + "ko": [ + "번역해줘", + "번역 부탁", + "다국어로", + "영어로 바꿔줘", + "현지화해줘" + ], "ja": ["翻訳して", "英訳", "多言語化", "ローカライズして", "訳して"], "zh": ["翻译一下", "帮我翻译", "多语言", "本地化", "翻成英文"] } + }, + "oma-image": { + "keywords": { + "*": [ + "nano-banana", + "nanobanana", + "gpt-image", + "pollinations", + "oma-image" + ], + "en": [ + "generate image", + "generate an image", + "create image", + "create an image", + "make a picture", + "make an image", + "render image", + "render a picture", + "draw me", + "draw a", + "ai image", + "image generation", + "generate a photo", + "create picture", + "picture of", + "image of" + ], + "ko": [ + "이미지 만들어", + "이미지 만들어줘", + "이미지 생성", + "이미지 생성해", + "이미지 생성해줘", + "사진 만들어", + "사진 만들어줘", + "그림 그려", + "그림 그려줘", + "이미지 뽑아", + "이미지 뽑아줘", + "이미지 그려줘", + "이미지 출력", + "나노바나나", + "나노 바나나", + "바나나로 뽑", + "이미지 생성기", + "ai 이미지" + ], + "ja": [ + "画像を生成", + "画像生成", + "画像を作", + "画像を作成", + "絵を描いて", + "画像出力", + "イラストを生成", + "写真を生成" + ], + "zh": [ + "生成图像", + "生成图片", + "生成一张", + "画一张", + "画一幅", + "帮我画", + "出图", + "图像生成", + "图片生成" + ], + "es": [ + "generar imagen", + "crear imagen", + "hazme una imagen", + "genera una foto" + ], + "fr": [ + "générer une image", + "créer une image", + "fais-moi une image", + "dessine-moi" + ], + "de": [ + "bild generieren", + "bild erstellen", + "erstelle ein bild", + "zeichne mir" + ] + } } }, "informationalPatterns": { @@ -1768,22 +2453,108 @@ "是什么", "とは" ], - "ko": ["뭐야", "뭐임", "무엇", "어떻게", "설명해", "알려줘", "키워드", "감지", "오탐"], - "ja": ["とは", "って何", "どうやって", "説明して", "キーワード", "検出", "誤検出"], + "ko": [ + "뭐야", + "뭐임", + "무엇", + "어떻게", + "설명해", + "알려줘", + "키워드", + "감지", + "오탐" + ], + "ja": [ + "とは", + "って何", + "どうやって", + "説明して", + "キーワード", + "検出", + "誤検出" + ], "zh": ["是什么", "什么是", "怎么", "解释", "关键词", "检测", "误报"], - "es": ["qué es", "cómo", "explica", "palabra clave", "falso positivo", "detectado"], - "fr": ["c'est quoi", "comment", "explique", "mot-clé", "faux positif", "détecté"], - "de": ["was ist", "wie", "erkläre", "schlüsselwort", "falsch positiv", "erkannt"], - "pt": ["o que é", "como", "explique", "palavra-chave", "falso positivo", "detectado"], - "ru": ["что такое", "как", "объясни", "ключевое слово", "ложное срабатывание", "обнаружено"], - "nl": ["wat is", "hoe", "leg uit", "sleutelwoord", "vals positief", "gedetecteerd"], - "pl": ["co to", "jak", "wyjaśnij", "słowo kluczowe", "fałszywy alarm", "wykryto"] + "es": [ + "qué es", + "cómo", + "explica", + "palabra clave", + "falso positivo", + "detectado" + ], + "fr": [ + "c'est quoi", + "comment", + "explique", + "mot-clé", + "faux positif", + "détecté" + ], + "de": [ + "was ist", + "wie", + "erkläre", + "schlüsselwort", + "falsch positiv", + 
"erkannt" + ], + "pt": [ + "o que é", + "como", + "explique", + "palavra-chave", + "falso positivo", + "detectado" + ], + "ru": [ + "что такое", + "как", + "объясни", + "ключевое слово", + "ложное срабатывание", + "обнаружено" + ], + "nl": [ + "wat is", + "hoe", + "leg uit", + "sleutelwoord", + "vals positief", + "gedetecteerd" + ], + "pl": [ + "co to", + "jak", + "wyjaśnij", + "słowo kluczowe", + "fałszywy alarm", + "wykryto" + ] }, "excludedWorkflows": ["tools", "stack-set", "exec-plan"], "cjkScripts": ["ko", "ja", "zh"], "extensionRouting": { - "frontend-engineer": ["tsx", "jsx", "css", "scss", "less", "vue", "svelte", "html"], - "backend-engineer": ["go", "py", "java", "rs", "rb", "php", "controller", "service", "resolver"], + "frontend-engineer": [ + "tsx", + "jsx", + "css", + "scss", + "less", + "vue", + "svelte", + "html" + ], + "backend-engineer": [ + "go", + "py", + "java", + "rs", + "rb", + "php", + "controller", + "service", + "resolver" + ], "db-engineer": ["sql", "prisma", "graphql", "migration"], "mobile-engineer": ["dart", "swift", "kt", "xib", "storyboard"], "designer": ["figma", "sketch", "svg"] diff --git a/.codex/hooks/types.ts b/.codex/hooks/types.ts index f9bf420..fd54f3e 100644 --- a/.codex/hooks/types.ts +++ b/.codex/hooks/types.ts @@ -1,8 +1,8 @@ // Claude Code Hook Types for oh-my-agent // Shared across Claude Code, Codex CLI, Cursor, Gemini CLI, and Qwen Code -import { existsSync } from "node:fs" -import { dirname, join } from "node:path" +import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; // --- Project Root Resolution --- @@ -12,52 +12,55 @@ import { dirname, join } from "node:path" * (e.g. packages/i18n during a build) from creating state files * in the wrong location. */ -const MAX_DEPTH = 20 +const MAX_DEPTH = 20; export function resolveGitRoot(startDir: string): string { - let dir = startDir + let dir = startDir; for (let i = 0; i < MAX_DEPTH; i++) { - if (existsSync(join(dir, ".git"))) return dir - const parent = dirname(dir) - if (parent === dir) return startDir - dir = parent + if (existsSync(join(dir, ".git"))) return dir; + const parent = dirname(dir); + if (parent === dir) return startDir; + dir = parent; } - return startDir + return startDir; } // --- Vendor Detection --- -export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen" +export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen"; // --- Hook Input (unified) --- export interface HookInput { - prompt?: string - sessionId?: string - session_id?: string - hook_event_name?: string - cwd?: string - workspace_roots?: string[] + prompt?: string; + sessionId?: string; + session_id?: string; + hook_event_name?: string; + cwd?: string; + workspace_roots?: string[]; // Gemini: AfterAgent fields - prompt_response?: string - stop_hook_active?: boolean + prompt_response?: string; + stop_hook_active?: boolean; // Claude/Qwen: Stop fields - stopReason?: string + stopReason?: string; } // --- Hook Output Builders --- -export function makePromptOutput(vendor: Vendor, additionalContext: string): string { +export function makePromptOutput( + vendor: Vendor, + additionalContext: string, +): string { switch (vendor) { case "claude": - return JSON.stringify({ additionalContext }) + return JSON.stringify({ additionalContext }); case "codex": return JSON.stringify({ hookSpecificOutput: { hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "cursor": return JSON.stringify({ additionalContext, @@ -66,14 +69,14 @@ export function 
makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "gemini": return JSON.stringify({ hookSpecificOutput: { hookEventName: "BeforeAgent", additionalContext, }, - }) + }); case "qwen": // Qwen Code fork uses hookSpecificOutput (same as Codex) return JSON.stringify({ @@ -81,7 +84,7 @@ export function makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); } } @@ -91,22 +94,25 @@ export function makeBlockOutput(vendor: Vendor, reason: string): string { case "codex": case "cursor": case "qwen": - return JSON.stringify({ decision: "block", reason }) + return JSON.stringify({ decision: "block", reason }); case "gemini": // Gemini AfterAgent uses "deny" to reject response and force retry - return JSON.stringify({ decision: "deny", reason }) + return JSON.stringify({ decision: "deny", reason }); } } // --- PreToolUse Output Builder --- -export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, unknown>): string { +export function makePreToolOutput( + vendor: Vendor, + updatedInput: Record<string, unknown>, +): string { switch (vendor) { case "gemini": return JSON.stringify({ decision: "rewrite", tool_input: updatedInput, - }) + }); case "cursor": return JSON.stringify({ updated_input: updatedInput, @@ -114,24 +120,27 @@ export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, u hookEventName: "PreToolUse", updatedInput, }, - }) + }); case "claude": - case "codex": - case "qwen": return JSON.stringify({ hookSpecificOutput: { hookEventName: "PreToolUse", updatedInput, }, - }) + }); + case "codex": + case "qwen": + return JSON.stringify({ + updated_input: updatedInput, + }); } } // --- Shared Types --- export interface ModeState { - workflow: string - sessionId: string - activatedAt: string - reinforcementCount: number + workflow: string; + sessionId: string; + activatedAt: string; + reinforcementCount: number; } diff --git a/.codex/skills/architecture/SKILL.md b/.codex/skills/architecture/SKILL.md new file mode 100644 index 0000000..91bd999 --- /dev/null +++ b/.codex/skills/architecture/SKILL.md @@ -0,0 +1,7 @@ +--- +name: architecture +description: Software architecture workflow — diagnose architecture problems, select the right analysis method, compare options, synthesize stakeholder input, and produce a recommendation, review, or ADR +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/architecture.md` step by step. diff --git a/.codex/skills/brainstorm/SKILL.md b/.codex/skills/brainstorm/SKILL.md new file mode 100644 index 0000000..232e137 --- /dev/null +++ b/.codex/skills/brainstorm/SKILL.md @@ -0,0 +1,7 @@ +--- +name: brainstorm +description: Design-first ideation workflow — explore user intent, clarify constraints, propose approaches, and produce an approved design document before planning +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/brainstorm.md` step by step. diff --git a/.codex/skills/debug/SKILL.md b/.codex/skills/debug/SKILL.md new file mode 100644 index 0000000..05de919 --- /dev/null +++ b/.codex/skills/debug/SKILL.md @@ -0,0 +1,7 @@ +--- +name: debug +description: Structured bug diagnosis and fixing workflow — reproduce, diagnose root cause, apply minimal fix, write regression test, and scan for similar patterns +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/debug.md` step by step. 
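Each vendor expects a different JSON envelope for the same context string, so a quick way to sanity-check the output builders in types.ts above is to print every shape side by side. A minimal sketch, assuming Bun and the types.ts exports shown in this diff; the loop and log format are illustrative, not part of the change:

import { makeBlockOutput, makePromptOutput, type Vendor } from "./types.ts";

const vendors: Vendor[] = ["claude", "codex", "cursor", "gemini", "qwen"];
for (const v of vendors) {
  // claude returns a bare { additionalContext }; the others wrap it in
  // hookSpecificOutput (cursor emits both fields for compatibility).
  console.log(v, makePromptOutput(v, "[OMA WORKFLOW: REVIEW]"));
  // Block decisions use "block" everywhere except gemini, which uses "deny".
  console.log(v, makeBlockOutput(v, "workflow still active"));
}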
diff --git a/.codex/skills/deepinit/SKILL.md b/.codex/skills/deepinit/SKILL.md new file mode 100644 index 0000000..0168396 --- /dev/null +++ b/.codex/skills/deepinit/SKILL.md @@ -0,0 +1,7 @@ +--- +name: deepinit +description: Initialize project harness — AGENTS.md as table of contents, ARCHITECTURE.md as domain map, structured docs/ knowledge base +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/deepinit.md` step by step. diff --git a/.codex/skills/design/SKILL.md b/.codex/skills/design/SKILL.md new file mode 100644 index 0000000..5c56283 --- /dev/null +++ b/.codex/skills/design/SKILL.md @@ -0,0 +1,7 @@ +--- +name: design +description: Design workflow — create design systems, DESIGN.md, and design tokens with anti-pattern enforcement and accessibility checks +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/design.md` step by step. diff --git a/.codex/skills/exec-plan/SKILL.md b/.codex/skills/exec-plan/SKILL.md new file mode 100644 index 0000000..2ba4b4a --- /dev/null +++ b/.codex/skills/exec-plan/SKILL.md @@ -0,0 +1,7 @@ +--- +name: exec-plan +description: Create, manage, and track execution plans as first-class repository artifacts in docs/exec-plans/ +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/exec-plan.md` step by step. diff --git a/.codex/skills/orchestrate/SKILL.md b/.codex/skills/orchestrate/SKILL.md new file mode 100644 index 0000000..c3ce3e1 --- /dev/null +++ b/.codex/skills/orchestrate/SKILL.md @@ -0,0 +1,7 @@ +--- +name: orchestrate +description: Automated CLI-based parallel agent execution — spawn subagents via Gemini CLI, coordinate through MCP Memory, monitor progress, and run verification +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/orchestrate.md` step by step. diff --git a/.codex/skills/pdf/SKILL.md b/.codex/skills/pdf/SKILL.md new file mode 100644 index 0000000..32fd0d3 --- /dev/null +++ b/.codex/skills/pdf/SKILL.md @@ -0,0 +1,7 @@ +--- +name: pdf +description: Convert PDF to Markdown using opendataloader-pdf — extracts text, tables, headings, and images with correct reading order +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/pdf.md` step by step. diff --git a/.codex/skills/plan/SKILL.md b/.codex/skills/plan/SKILL.md new file mode 100644 index 0000000..4f30276 --- /dev/null +++ b/.codex/skills/plan/SKILL.md @@ -0,0 +1,7 @@ +--- +name: plan +description: PM planning workflow — analyze requirements, select tech stack, decompose into prioritized tasks with dependencies, and define API contracts +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/plan.md` step by step. diff --git a/.codex/skills/ralph/SKILL.md b/.codex/skills/ralph/SKILL.md new file mode 100644 index 0000000..00eb19d --- /dev/null +++ b/.codex/skills/ralph/SKILL.md @@ -0,0 +1,7 @@ +--- +name: ralph +description: Ralph - persistent self-referential execution loop that wraps ultrawork and adds verification by an independent verifier +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/ralph.md` step by step. diff --git a/.codex/skills/review/SKILL.md b/.codex/skills/review/SKILL.md new file mode 100644 index 0000000..9a91770 --- /dev/null +++ b/.codex/skills/review/SKILL.md @@ -0,0 +1,7 @@ +--- +name: review +description: Full QA review pipeline — security audit (OWASP Top 10), performance analysis, accessibility check (WCAG 2.1 AA), and code quality review +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/review.md` step by step.
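Every generated skill above is the same seven-line stub: YAML frontmatter, the `<!-- oma:generated -->` marker, and a pointer into `.agents/workflows/`. A hedged sketch of a generator that would emit such a stub; `writeSkillStub` is hypothetical and not part of this diff, but the file shape matches the generated files exactly:

```ts
// Hypothetical generator sketch for the SKILL.md stubs above.
import { mkdirSync, writeFileSync } from "node:fs";
import { join } from "node:path";

function writeSkillStub(root: string, name: string, description: string): void {
  const dir = join(root, ".codex", "skills", name);
  mkdirSync(dir, { recursive: true });
  const body = [
    "---",
    `name: ${name}`,
    `description: ${description}`,
    "---",
    "<!-- oma:generated -->",
    "",
    `Read and follow \`.agents/workflows/${name}.md\` step by step.`,
    "",
  ].join("\n");
  writeFileSync(join(dir, "SKILL.md"), body);
}

// Example (hypothetical invocation):
// writeSkillStub(process.cwd(), "plan", "PM planning workflow — ...");
```

Keeping each stub as a thin pointer means the real workflow text lives once in `.agents/workflows/` and can be regenerated per CLI vendor without drift.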
diff --git a/.codex/skills/scm/SKILL.md b/.codex/skills/scm/SKILL.md new file mode 100644 index 0000000..28ccbcc --- /dev/null +++ b/.codex/skills/scm/SKILL.md @@ -0,0 +1,7 @@ +--- +name: scm +description: SCM workflow for Git operations (branching/merge/conflict/worktree) plus Conventional Commit execution +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/scm.md` step by step. diff --git a/.codex/skills/stack-set/SKILL.md b/.codex/skills/stack-set/SKILL.md new file mode 100644 index 0000000..95438e1 --- /dev/null +++ b/.codex/skills/stack-set/SKILL.md @@ -0,0 +1,7 @@ +--- +name: stack-set +description: Auto-detect project tech stack and generate stack-specific references for domain skills +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/stack-set.md` step by step. diff --git a/.codex/skills/tools/SKILL.md b/.codex/skills/tools/SKILL.md new file mode 100644 index 0000000..76ee848 --- /dev/null +++ b/.codex/skills/tools/SKILL.md @@ -0,0 +1,7 @@ +--- +name: tools +description: Manage MCP tools with natural language commands — list, enable, disable tools and tool groups +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/tools.md` step by step. diff --git a/.codex/skills/ultrawork/SKILL.md b/.codex/skills/ultrawork/SKILL.md new file mode 100644 index 0000000..0fcf7ec --- /dev/null +++ b/.codex/skills/ultrawork/SKILL.md @@ -0,0 +1,7 @@ +--- +name: ultrawork +description: Ultrawork - high-quality 5-phase development workflow in which 11 of the 17 steps are reviews +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/ultrawork.md` step by step. diff --git a/.codex/skills/work/SKILL.md b/.codex/skills/work/SKILL.md new file mode 100644 index 0000000..c8c07b5 --- /dev/null +++ b/.codex/skills/work/SKILL.md @@ -0,0 +1,7 @@ +--- +name: work +description: Coordinate multiple agents for a complex multi-domain project using PM planning, parallel agent spawning, and QA review +--- +<!-- oma:generated --> + +Read and follow `.agents/workflows/work.md` step by step.
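The `.cursor/agents` diffs that follow wrap the shared Charter Preflight section in `<!-- CHARTER_CHECK_BEGIN -->` / `<!-- CHARTER_CHECK_END -->` markers, which makes that block addressable for later regeneration. A minimal sketch of marker-based in-place replacement; `replaceCharterBlock` is illustrative, not code from this repository:

```ts
// Sketch: rewrite only the region between CHARTER_CHECK markers,
// leaving the rest of an agent markdown file untouched.
const BEGIN = "<!-- CHARTER_CHECK_BEGIN -->";
const END = "<!-- CHARTER_CHECK_END -->";

function replaceCharterBlock(doc: string, block: string): string {
  const start = doc.indexOf(BEGIN);
  const end = doc.indexOf(END);
  // Markers absent or malformed: return the document unchanged.
  if (start === -1 || end === -1 || end < start) return doc;
  return doc.slice(0, start + BEGIN.length) + "\n" + block + "\n" + doc.slice(end);
}
```

With the markers in place, updating the charter text across all agent files becomes a repeatable transform instead of a hand edit per file.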
diff --git a/.cursor/agents/architecture-reviewer.md b/.cursor/agents/architecture-reviewer.md index 419ed97..241b6fe 100644 --- a/.cursor/agents/architecture-reviewer.md +++ b/.cursor/agents/architecture-reviewer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/core/quality-principles.md`: - Write results to project root `.agents/results/result-architecture.md` (orchestrated: `result-architecture-{sessionId}.md`) - Include: status, recommendation summary, tradeoffs, risks, validation steps, artifacts created +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY recommendations or structural edits, output this block: @@ -36,6 +38,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT change architecture or code +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.cursor/agents/backend-engineer.md b/.cursor/agents/backend-engineer.md index 2f0b380..ebf01fb 100644 --- a/.cursor/agents/backend-engineer.md +++ b/.cursor/agents/backend-engineer.md @@ -19,6 +19,8 @@ Follow `.agents/skills/_shared/core/quality-principles.md`: - Write results to project root `.agents/results/result-backend.md` (orchestrated: `result-backend-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -35,6 +37,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT write code +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.cursor/agents/db-engineer.md b/.cursor/agents/db-engineer.md index aaf5591..9256682 100644 --- a/.cursor/agents/db-engineer.md +++ b/.cursor/agents/db-engineer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/core/quality-principles.md`: - Write results to project root `.agents/results/result-db.md` (orchestrated: `result-db-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -32,6 +34,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.cursor/agents/debug-investigator.md b/.cursor/agents/debug-investigator.md index 7397170..60436c0 100644 --- a/.cursor/agents/debug-investigator.md +++ b/.cursor/agents/debug-investigator.md @@ -19,6 +19,8 @@ Follow `.agents/skills/_shared/core/quality-principles.md`: - Write results to project root `.agents/results/result-debug.md` (orchestrated: `result-debug-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -35,6 +37,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT write code +<!-- CHARTER_CHECK_END --> ## Diagnosis Process diff --git a/.cursor/agents/frontend-engineer.md b/.cursor/agents/frontend-engineer.md index 255a71b..052e40f 100644 --- a/.cursor/agents/frontend-engineer.md +++ b/.cursor/agents/frontend-engineer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/core/quality-principles.md`: - Write results to project root `.agents/results/result-frontend.md` (orchestrated: 
`result-frontend-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -32,6 +34,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.cursor/agents/mobile-engineer.md b/.cursor/agents/mobile-engineer.md index 7ce4018..d71d1e5 100644 --- a/.cursor/agents/mobile-engineer.md +++ b/.cursor/agents/mobile-engineer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/core/quality-principles.md`: - Write results to project root `.agents/results/result-mobile.md` (orchestrated: `result-mobile-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -32,6 +34,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.cursor/agents/pm-planner.md b/.cursor/agents/pm-planner.md index 7dba6ab..d63b230 100644 --- a/.cursor/agents/pm-planner.md +++ b/.cursor/agents/pm-planner.md @@ -19,6 +19,8 @@ Follow `.agents/skills/_shared/core/quality-principles.md`: - Write results to project root `.agents/results/result-pm.md` (orchestrated: `result-pm-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY planning work, output this block: @@ -35,6 +37,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT proceed +<!-- CHARTER_CHECK_END --> ## Planning Process diff --git a/.cursor/agents/qa-reviewer.md b/.cursor/agents/qa-reviewer.md index 2877429..6ae8015 100644 --- a/.cursor/agents/qa-reviewer.md +++ b/.cursor/agents/qa-reviewer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/core/quality-principles.md`: - Write results to project root `.agents/results/result-qa.md` (orchestrated: `result-qa-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before starting review, output this block: @@ -32,6 +34,7 @@ CHARTER_CHECK: - Must NOT do: modify source code, skip severity levels, report unverified findings - Success criteria: {all files reviewed, findings with file:line references} ``` +<!-- CHARTER_CHECK_END --> ## Review Priority Order diff --git a/.cursor/agents/tf-infra-engineer.md b/.cursor/agents/tf-infra-engineer.md index 79cdcc5..0857b2f 100644 --- a/.cursor/agents/tf-infra-engineer.md +++ b/.cursor/agents/tf-infra-engineer.md @@ -20,6 +20,8 @@ Follow `.agents/skills/_shared/core/quality-principles.md`: - Write results to project root `.agents/results/result-tf-infra.md` (orchestrated: `result-tf-infra-{sessionId}.md`) - Include: status, summary, files changed, validation results, plan/apply notes, acceptance checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY infrastructure changes, output this block: @@ -36,6 +38,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT apply destructive changes +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.cursor/hooks/hud.ts b/.cursor/hooks/hud.ts 
index 597b95c..73f0ac4 100644 --- a/.cursor/hooks/hud.ts +++ b/.cursor/hooks/hud.ts @@ -9,163 +9,166 @@ * stdout: ANSI-colored status text */ -import { existsSync, readdirSync, readFileSync } from "node:fs" -import { join } from "node:path" -import type { ModeState } from "./types.ts" +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import type { ModeState } from "./types.ts"; // ── ANSI Colors ─────────────────────────────────────────────── -const dim = (s: string) => `\x1b[2m${s}\x1b[22m` -const bold = (s: string) => `\x1b[1m${s}\x1b[22m` -const green = (s: string) => `\x1b[32m${s}\x1b[39m` -const yellow = (s: string) => `\x1b[33m${s}\x1b[39m` -const red = (s: string) => `\x1b[31m${s}\x1b[39m` -const cyan = (s: string) => `\x1b[36m${s}\x1b[39m` +const dim = (s: string) => `\x1b[2m${s}\x1b[22m`; +const bold = (s: string) => `\x1b[1m${s}\x1b[22m`; +const green = (s: string) => `\x1b[32m${s}\x1b[39m`; +const yellow = (s: string) => `\x1b[33m${s}\x1b[39m`; +const red = (s: string) => `\x1b[31m${s}\x1b[39m`; +const cyan = (s: string) => `\x1b[36m${s}\x1b[39m`; function colorByThreshold(value: number, text: string): string { - if (value >= 85) return red(text) - if (value >= 70) return yellow(text) - return green(text) + if (value >= 85) return red(text); + if (value >= 70) return yellow(text); + return green(text); } // ── Stdin Parsing ───────────────────────────────────────────── interface RateLimit { - used_percentage?: number - resets_at?: string + used_percentage?: number; + resets_at?: string; } interface StatuslineStdin { - cwd?: string - model?: { id?: string; display_name?: string } + cwd?: string; + model?: { id?: string; display_name?: string }; context_window?: { - context_window_size?: number - used_percentage?: number - } + context_window_size?: number; + used_percentage?: number; + }; cost?: { - total_cost_usd?: number - total_lines_added?: number - total_lines_removed?: number - total_duration_ms?: number - } + total_cost_usd?: number; + total_lines_added?: number; + total_lines_removed?: number; + total_duration_ms?: number; + }; rate_limits?: { - five_hour?: RateLimit - seven_day?: RateLimit - } + five_hour?: RateLimit; + seven_day?: RateLimit; + }; } function readStdin(): StatuslineStdin { try { - return JSON.parse(readFileSync("/dev/stdin", "utf-8")) + return JSON.parse(readFileSync("/dev/stdin", "utf-8")); } catch { - return {} + return {}; } } // ── Active Workflow Detection ───────────────────────────────── function getActiveWorkflow(projectDir: string): ModeState | null { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return null + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return null; try { for (const file of readdirSync(stateDir)) { - if (!file.endsWith(".json") || !file.includes("-state-")) continue - const content = readFileSync(join(stateDir, file), "utf-8") - const state: ModeState = JSON.parse(content) + if (!file.endsWith(".json") || !file.includes("-state-")) continue; + const content = readFileSync(join(stateDir, file), "utf-8"); + const state: ModeState = JSON.parse(content); // Skip stale (>2h) - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - if (elapsed > 2 * 60 * 60 * 1000) continue + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + if (elapsed > 2 * 60 * 60 * 1000) continue; - return state + return state; } } catch { // ignore } - return null + return null; } // ── Model Name 
Shortener ────────────────────────────────────── function shortModel(model?: { id?: string; display_name?: string }): string { - const name = model?.display_name || model?.id || "" - if (!name) return "" + const name = model?.display_name || model?.id || ""; + if (!name) return ""; // "Claude Opus 4.6 (1M context)" → "Opus 4.6" - const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i) - if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}` - return name.split("/").pop()?.slice(0, 15) || "" + const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i); + if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}`; + return name.split("/").pop()?.slice(0, 15) || ""; } // ── Rate Limit Helpers ─────────────────────────────────────── function formatCountdown(resetsAt: string): string { - const remaining = new Date(resetsAt).getTime() - Date.now() - if (remaining <= 0) return "" - const h = Math.floor(remaining / 3_600_000) - const m = Math.floor((remaining % 3_600_000) / 60_000) - return h > 0 ? `${h}h${m}m` : `${m}m` + const remaining = new Date(resetsAt).getTime() - Date.now(); + if (remaining <= 0) return ""; + const h = Math.floor(remaining / 3_600_000); + const m = Math.floor((remaining % 3_600_000) / 60_000); + return h > 0 ? `${h}h${m}m` : `${m}m`; } function formatRateLimit(label: string, rl?: RateLimit): string | null { - if (!rl || rl.used_percentage == null) return null - const pct = Math.round(rl.used_percentage) - const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : "" - const text = countdown ? `${label}:${pct}%(${countdown})` : `${label}:${pct}%` - return colorByThreshold(pct, text) + if (!rl || rl.used_percentage == null) return null; + const pct = Math.round(rl.used_percentage); + const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : ""; + const text = countdown + ? `${label}:${pct}%(${countdown})` + : `${label}:${pct}%`; + return colorByThreshold(pct, text); } // ── Main ────────────────────────────────────────────────────── function main() { - const input = readStdin() - const projectDir = process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd() - const parts: string[] = [] + const input = readStdin(); + const projectDir = + process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd(); + const parts: string[] = []; // 1. OMA label - parts.push(bold(cyan("[OMA]"))) + parts.push(bold(cyan("[OMA]"))); // 2. Model - const model = shortModel(input.model) - if (model) parts.push(dim(model)) + const model = shortModel(input.model); + if (model) parts.push(dim(model)); // 3. Context % - const ctxPct = input.context_window?.used_percentage + const ctxPct = input.context_window?.used_percentage; if (ctxPct != null) { - parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)) + parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)); } // 4. Session cost - const cost = input.cost?.total_cost_usd + const cost = input.cost?.total_cost_usd; if (cost != null && cost > 0) { - parts.push(dim(`$${cost.toFixed(2)}`)) + parts.push(dim(`$${cost.toFixed(2)}`)); } // 5. Rate limits (5h / 7d) - const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour) - const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day) + const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour); + const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day); if (rl5 || rl7) { - parts.push([rl5, rl7].filter(Boolean).join(dim(" "))) + parts.push([rl5, rl7].filter(Boolean).join(dim(" "))); } // 6. 
Lines changed - const added = input.cost?.total_lines_added - const removed = input.cost?.total_lines_removed + const added = input.cost?.total_lines_added; + const removed = input.cost?.total_lines_removed; if (added || removed) { - const diffParts: string[] = [] - if (added) diffParts.push(green(`+${added}`)) - if (removed) diffParts.push(red(`-${removed}`)) - parts.push(diffParts.join(dim("/"))) + const diffParts: string[] = []; + if (added) diffParts.push(green(`+${added}`)); + if (removed) diffParts.push(red(`-${removed}`)); + parts.push(diffParts.join(dim("/"))); } // 7. Active workflow - const workflow = getActiveWorkflow(projectDir) + const workflow = getActiveWorkflow(projectDir); if (workflow) { - const label = `${workflow.workflow}:${workflow.reinforcementCount}` - parts.push(yellow(label)) + const label = `${workflow.workflow}:${workflow.reinforcementCount}`; + parts.push(yellow(label)); } - process.stdout.write(parts.join(dim(" │ "))) + process.stdout.write(parts.join(dim(" │ "))); } -main() +main(); diff --git a/.cursor/hooks/keyword-detector.ts b/.cursor/hooks/keyword-detector.ts index 0ce0d0e..e838a0a 100644 --- a/.cursor/hooks/keyword-detector.ts +++ b/.cursor/hooks/keyword-detector.ts @@ -12,59 +12,205 @@ * exit 0 = always (allow) */ -import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { type ModeState, makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { + type ModeState, + makePromptOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +// ── Guard 1: UserPromptSubmit-only trigger ──────────────────── +// Hook event names that represent genuine user input (not agent responses) +const VALID_USER_EVENTS = new Set([ + "UserPromptSubmit", + "beforeSubmitPrompt", // Cursor + "BeforeAgent", // Gemini (fires before agent processes user prompt) +]); + +/** + * Returns true if the hook input indicates this is a genuine user prompt, + * not an agent-generated response. Prevents re-trigger loops. + */ +export function isGenuineUserPrompt(input: Record<string, unknown>): boolean { + const event = input.hook_event_name as string | undefined; + // If event is explicitly provided, validate it + if (event !== undefined) { + return VALID_USER_EVENTS.has(event); + } + // No event field — assume genuine (backward compat with vendors that omit it) + return true; +} + +// ── Guard 3: Reinforcement suppression ─────────────────────── + +const REINFORCEMENT_WINDOW_MS = 60_000; // 60 seconds +const REINFORCEMENT_MAX_COUNT = 2; // allow up to 2, suppress 3rd+ + +export interface KeywordDetectorState { + triggers: Record< + string, + { + lastTriggeredAt: string; // ISO timestamp + count: number; + } + >; +} + +function getKwStateFilePath(projectDir: string): string { + const dir = join(projectDir, ".agents", "state"); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + return join(dir, "keyword-detector-state.json"); +} + +/** + * Load the keyword-detector reinforcement state from disk. + * Resets gracefully if the file is missing or corrupt. 
+ */ +export function loadKwState(projectDir: string): KeywordDetectorState { + const filePath = getKwStateFilePath(projectDir); + if (!existsSync(filePath)) return { triggers: {} }; + try { + const raw = readFileSync(filePath, "utf-8"); + const parsed = JSON.parse(raw) as unknown; + if ( + typeof parsed === "object" && + parsed !== null && + "triggers" in parsed && + typeof (parsed as Record<string, unknown>).triggers === "object" + ) { + return parsed as KeywordDetectorState; + } + return { triggers: {} }; + } catch { + // Corrupt file — reset + return { triggers: {} }; + } +} + +/** + * Save reinforcement state to disk. + */ +export function saveKwState( + projectDir: string, + state: KeywordDetectorState, +): void { + try { + const filePath = getKwStateFilePath(projectDir); + writeFileSync(filePath, JSON.stringify(state, null, 2)); + } catch { + // Non-fatal — reinforcement suppression is best-effort + } +} + +/** + * Returns true if the keyword should be suppressed due to reinforcement loop. + * A keyword is suppressed if it was triggered >= REINFORCEMENT_MAX_COUNT times + * within the last REINFORCEMENT_WINDOW_MS milliseconds. + */ +export function isReinforcementSuppressed( + state: KeywordDetectorState, + keyword: string, + nowMs?: number, +): boolean { + const now = nowMs ?? Date.now(); + const entry = state.triggers[keyword]; + if (!entry) return false; + const lastMs = new Date(entry.lastTriggeredAt).getTime(); + if (Number.isNaN(lastMs)) return false; + const withinWindow = now - lastMs < REINFORCEMENT_WINDOW_MS; + return withinWindow && entry.count >= REINFORCEMENT_MAX_COUNT; +} + +/** + * Record a keyword trigger in the reinforcement state. + * Resets count if the previous trigger was outside the window. + */ +export function recordKwTrigger( + state: KeywordDetectorState, + keyword: string, + nowMs?: number, +): KeywordDetectorState { + const now = nowMs ?? Date.now(); + const entry = state.triggers[keyword]; + let count = 1; + if (entry) { + const lastMs = new Date(entry.lastTriggeredAt).getTime(); + const withinWindow = + !Number.isNaN(lastMs) && now - lastMs < REINFORCEMENT_WINDOW_MS; + count = withinWindow ? 
entry.count + 1 : 1; + } + return { + ...state, + triggers: { + ...state.triggers, + [keyword]: { + lastTriggeredAt: new Date(now).toISOString(), + count, + }, + }, + }; +} // ── Vendor Detection ────────────────────────────────────────── function inferVendorFromScriptPath(): Vendor | null { - const path = import.meta.path - if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor" - if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen" - if (path.includes(`${join(".claude", "hooks")}`)) return "claude" - if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini" - if (path.includes(`${join(".codex", "hooks")}`)) return "codex" - return null + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; } function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - const byScriptPath = inferVendorFromScriptPath() - if (byScriptPath) return byScriptPath - if (event === "BeforeAgent") return "gemini" - if (event === "beforeSubmitPrompt") return "cursor" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeAgent") return "gemini"; + if (event === "beforeSubmitPrompt") return "cursor"; if (event === "UserPromptSubmit") { // Codex uses snake_case session_id, Claude uses camelCase sessionId - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } // Qwen Code sets QWEN_PROJECT_DIR; Claude sets CLAUDE_PROJECT_DIR - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── Config Loading ──────────────────────────────────────────── @@ -73,74 +219,83 @@ interface TriggerConfig { workflows: Record< string, { - persistent: boolean - keywords: Record<string, string[]> + persistent: boolean; + keywords: Record<string, string[]>; } - > - informationalPatterns: Record<string, string[]> - excludedWorkflows: string[] - cjkScripts: string[] - extensionRouting?: Record<string, string[]> + >; + informationalPatterns: 
Record<string, string[]>; + excludedWorkflows: string[]; + cjkScripts: string[]; + extensionRouting?: Record<string, string[]>; } function loadConfig(): TriggerConfig { - const configPath = join(dirname(import.meta.path), "triggers.json") - return JSON.parse(readFileSync(configPath, "utf-8")) + const configPath = join(dirname(import.meta.path), "triggers.json"); + return JSON.parse(readFileSync(configPath, "utf-8")); } function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? "en"; } catch { - return "en" + return "en"; } } // ── Pattern Builder ─────────────────────────────────────────── export function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } -export function buildPatterns(keywords: Record<string, string[]>, lang: string, cjkScripts: string[]): RegExp[] { +export function buildPatterns( + keywords: Record<string, string[]>, + lang: string, + cjkScripts: string[], +): RegExp[] { const allKeywords = [ ...(keywords["*"] ?? []), ...(keywords.en ?? []), ...(lang !== "en" ? (keywords[lang] ?? []) : []), - ] + ]; return allKeywords.map((kw) => { - const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+") - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) { - return new RegExp(escaped, "i") + const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+"); + if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) { + return new RegExp(escaped, "i"); } - return new RegExp(`\\b${escaped}\\b`, "i") - }) + return new RegExp(`\\b${escaped}\\b`, "i"); + }); } -function buildInformationalPatterns(config: TriggerConfig, lang: string): RegExp[] { - const patterns = [...(config.informationalPatterns.en ?? [])] +function buildInformationalPatterns( + config: TriggerConfig, + lang: string, +): RegExp[] { + const patterns = [...(config.informationalPatterns.en ?? [])]; if (lang !== "en") { - patterns.push(...(config.informationalPatterns[lang] ?? [])) + patterns.push(...(config.informationalPatterns[lang] ?? 
[])); } return patterns.map((p) => { - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (/[^\x00-\x7F]/.test(p)) return new RegExp(escapeRegex(p), "i") - return new RegExp(`\\b${escapeRegex(p)}\\b`, "i") - }) + if (/[^\p{ASCII}]/u.test(p)) return new RegExp(escapeRegex(p), "i"); + return new RegExp(`\\b${escapeRegex(p)}\\b`, "i"); + }); } // ── Filters ─────────────────────────────────────────────────── -export function isInformationalContext(prompt: string, matchIndex: number, infoPatterns: RegExp[]): boolean { - const windowStart = Math.max(0, matchIndex - 60) - const window = prompt.slice(windowStart, matchIndex + 60) - return infoPatterns.some((p) => p.test(window)) +export function isInformationalContext( + prompt: string, + matchIndex: number, + infoPatterns: RegExp[], +): boolean { + const windowStart = Math.max(0, matchIndex - 60); + const window = prompt.slice(windowStart, matchIndex + 60); + return infoPatterns.some((p) => p.test(window)); } /** @@ -148,12 +303,16 @@ export function isInformationalContext(prompt: string, matchIndex: number, infoP * only match keywords in the first N chars of the user's prompt. * Keywords deep in the prompt are likely from pasted content, not user intent. */ -const PERSISTENT_MATCH_LIMIT = 200 - -export function isPastedContent(matchIndex: number, isPersistent: boolean, promptLength: number): boolean { - if (!isPersistent) return false - if (promptLength <= PERSISTENT_MATCH_LIMIT) return false - return matchIndex > PERSISTENT_MATCH_LIMIT +const PERSISTENT_MATCH_LIMIT = 200; + +export function isPastedContent( + matchIndex: number, + isPersistent: boolean, + promptLength: number, +): boolean { + if (!isPersistent) return false; + if (promptLength <= PERSISTENT_MATCH_LIMIT) return false; + return matchIndex > PERSISTENT_MATCH_LIMIT; } /** @@ -180,11 +339,11 @@ const QUESTION_PATTERNS: RegExp[] = [ /^.*\banything worth\b/i, /^.*\bwhat.*(feature|difference|reference)/i, /^.*\bcompare\b/i, -] +]; export function isAnalyticalQuestion(prompt: string): boolean { - const firstLine = prompt.split("\n")[0].trim() - return QUESTION_PATTERNS.some((p) => p.test(firstLine)) + const firstLine = prompt.split("\n")[0].trim(); + return QUESTION_PATTERNS.some((p) => p.test(firstLine)); } export function stripCodeBlocks(text: string): string { @@ -193,11 +352,11 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") // unclosed fenced blocks (strip to end) .replace(/`{3,}[^`]*`{3,}/g, "") // single-line fenced blocks (```...```) .replace(/`[^`\n]+`/g, "") // inline code (no newlines allowed) - .replace(/"[^"\n]*"/g, "") // quoted strings + .replace(/"[^"\n]*"/g, ""); // quoted strings } export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } // ── Extension Detection ────────────────────────────────────── @@ -228,62 +387,70 @@ const EXCLUDE_EXTS = new Set([ "eot", "map", "d", -]) +]); export function detectExtensions(prompt: string): string[] { - const extPattern = /\.([a-zA-Z]{1,12})\b/g - const extensions = new Set<string>() - let match: RegExpExecArray | null - // biome-ignore lint/suspicious/noAssignInExpressions: standard regex.exec loop pattern - while ((match = extPattern.exec(prompt)) !== null) { - const ext = match[1].toLowerCase() + const extPattern = /\.([a-zA-Z]{1,12})\b/g; + const extensions = new Set<string>(); + for (const match of 
prompt.matchAll(extPattern)) { + const ext = match[1].toLowerCase(); if (!EXCLUDE_EXTS.has(ext)) { - extensions.add(ext) + extensions.add(ext); } } - return [...extensions] + return [...extensions]; } -export function resolveAgentFromExtensions(extensions: string[], routing: Record<string, string[]>): string | null { - if (extensions.length === 0) return null +export function resolveAgentFromExtensions( + extensions: string[], + routing: Record<string, string[]>, +): string | null { + if (extensions.length === 0) return null; - const scores = new Map<string, number>() + const scores = new Map<string, number>(); for (const ext of extensions) { for (const [agent, agentExts] of Object.entries(routing)) { if (agentExts.includes(ext)) { - scores.set(agent, (scores.get(agent) ?? 0) + 1) + scores.set(agent, (scores.get(agent) ?? 0) + 1); } } } - if (scores.size === 0) return null + if (scores.size === 0) return null; - let best: string | null = null - let bestScore = 0 + let best: string | null = null; + let bestScore = 0; for (const [agent, score] of scores) { if (score > bestScore) { - bestScore = score - best = agent + bestScore = score; + best = agent; } } - return best + return best; } // ── State Management ────────────────────────────────────────── function getStateDir(projectDir: string): string { - const dir = join(projectDir, ".agents", "state") - if (!existsSync(dir)) mkdirSync(dir, { recursive: true }) - return dir + const dir = join(projectDir, ".agents", "state"); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + return dir; } -function activateMode(projectDir: string, workflow: string, sessionId: string): void { +function activateMode( + projectDir: string, + workflow: string, + sessionId: string, +): void { const state: ModeState = { workflow, sessionId, activatedAt: new Date().toISOString(), reinforcementCount: 0, - } - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) + }; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Deactivation Detection ─────────────────────────────────── @@ -300,27 +467,33 @@ export const DEACTIVATION_PHRASES: Record<string, string[]> = { ru: ["воркфлоу завершён", "рабочий процесс завершён"], nl: ["workflow voltooid", "workflow klaar"], pl: ["workflow zakończony", "workflow ukończony"], -} +}; export function isDeactivationRequest(prompt: string, lang: string): boolean { - const phrases = [...(DEACTIVATION_PHRASES.en ?? []), ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? []) : [])] - const lower = prompt.toLowerCase() - return phrases.some((phrase) => lower.includes(phrase.toLowerCase())) + const phrases = [ + ...(DEACTIVATION_PHRASES.en ?? []), + ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? 
[]) : []), + ]; + const lower = prompt.toLowerCase(); + return phrases.some((phrase) => lower.includes(phrase.toLowerCase())); } -export function deactivateAllPersistentModes(projectDir: string, sessionId?: string): void { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return +export function deactivateAllPersistentModes( + projectDir: string, + sessionId?: string, +): void { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return; try { - const files = readdirSync(stateDir) + const files = readdirSync(stateDir); for (const file of files) { // Match session-scoped state files: {workflow}-state-{sessionId}.json if (sessionId) { if (file.endsWith(`-state-${sessionId}.json`)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } else if (/-state-/.test(file) && file.endsWith(".json")) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { @@ -331,55 +504,69 @@ export function deactivateAllPersistentModes(projectDir: string, sessionId?: str // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? "" + // Guard 1: Only process genuine user prompts — skip agent-generated content + if (!isGenuineUserPrompt(input)) process.exit(0); + + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? 
""; - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); - const config = loadConfig() - const lang = detectLanguage(projectDir) + const config = loadConfig(); + const lang = detectLanguage(projectDir); // Check for deactivation request before workflow detection if (isDeactivationRequest(prompt, lang)) { - deactivateAllPersistentModes(projectDir, sessionId) - process.exit(0) + deactivateAllPersistentModes(projectDir, sessionId); + process.exit(0); } - const infoPatterns = buildInformationalPatterns(config, lang) - const cleaned = stripCodeBlocks(prompt) - const excluded = new Set(config.excludedWorkflows) + const infoPatterns = buildInformationalPatterns(config, lang); + // Guard 2: Strip code blocks and inline code before scanning for keywords + const cleaned = stripCodeBlocks(prompt); + const excluded = new Set(config.excludedWorkflows); + + // Guard 3: Load reinforcement suppression state + const kwState = loadKwState(projectDir); // Skip persistent workflows entirely if the prompt is an analytical question - const analytical = isAnalyticalQuestion(cleaned) + const analytical = isAnalyticalQuestion(cleaned); for (const [workflow, def] of Object.entries(config.workflows)) { - if (excluded.has(workflow)) continue + if (excluded.has(workflow)) continue; // Analytical questions should never trigger persistent workflows - if (analytical && def.persistent) continue + if (analytical && def.persistent) continue; - const patterns = buildPatterns(def.keywords, lang, config.cjkScripts) + const patterns = buildPatterns(def.keywords, lang, config.cjkScripts); for (const pattern of patterns) { - const match = pattern.exec(cleaned) - if (!match) continue - if (isInformationalContext(cleaned, match.index, infoPatterns)) continue + const match = pattern.exec(cleaned); + if (!match) continue; + if (isInformationalContext(cleaned, match.index, infoPatterns)) continue; // Keywords deep in long prompts are likely pasted content, not user intent - if (isPastedContent(match.index, def.persistent, cleaned.length)) continue + if (isPastedContent(match.index, def.persistent, cleaned.length)) + continue; + + // Guard 3: Suppress if same workflow triggered too many times in 60s + if (isReinforcementSuppressed(kwState, workflow)) continue; if (def.persistent) { - activateMode(projectDir, workflow, sessionId) + activateMode(projectDir, workflow, sessionId); } + // Record this trigger for reinforcement tracking + const updatedState = recordKwTrigger(kwState, workflow); + saveKwState(projectDir, updatedState); const contextLines = [ `[OMA WORKFLOW: ${workflow.toUpperCase()}]`, @@ -387,26 +574,29 @@ async function main() { `Read and follow \`.agents/workflows/${workflow}.md\` step by step.`, `User request: ${prompt}`, `IMPORTANT: Start the workflow IMMEDIATELY. 
Do not ask for confirmation.`, - ] + ]; if (config.extensionRouting) { - const extensions = detectExtensions(prompt) - const agent = resolveAgentFromExtensions(extensions, config.extensionRouting) + const extensions = detectExtensions(prompt); + const agent = resolveAgentFromExtensions( + extensions, + config.extensionRouting, + ); if (agent) { - contextLines.push(`[OMA AGENT HINT: ${agent}]`) + contextLines.push(`[OMA AGENT HINT: ${agent}]`); } } - const context = contextLines.join("\n") + const context = contextLines.join("\n"); - process.stdout.write(makePromptOutput(vendor, context)) - process.exit(0) + process.stdout.write(makePromptOutput(vendor, context)); + process.exit(0); } } - process.exit(0) + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.cursor/hooks/persistent-mode.ts b/.cursor/hooks/persistent-mode.ts index 4936f4e..311035a 100644 --- a/.cursor/hooks/persistent-mode.ts +++ b/.cursor/hooks/persistent-mode.ts @@ -13,125 +13,170 @@ * exit 2 = block stop */ -import { existsSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { isDeactivationRequest } from "./keyword-detector.ts" -import { type ModeState, makeBlockOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_REINFORCEMENTS = 5 -const STALE_HOURS = 2 +import { + existsSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { isDeactivationRequest } from "./keyword-detector.ts"; +import { + type ModeState, + makeBlockOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +const MAX_REINFORCEMENTS = 5; +const STALE_HOURS = 2; function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? 
"en"; } catch { - return "en" + return "en"; } } // ── Config Loading ──────────────────────────────────────────── interface TriggerConfig { - workflows: Record<string, { persistent: boolean }> + workflows: Record<string, { persistent: boolean }>; } function loadPersistentWorkflows(): string[] { - const configPath = join(dirname(import.meta.path), "triggers.json") + const configPath = join(dirname(import.meta.path), "triggers.json"); try { - const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")) + const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")); return Object.entries(config.workflows) .filter(([, def]) => def.persistent) - .map(([name]) => name) + .map(([name]) => name); } catch { - return ["ultrawork", "orchestrate", "work"] + return ["ultrawork", "orchestrate", "work"]; } } // ── Vendor Detection ────────────────────────────────────────── +function inferVendorFromScriptPath(): Vendor | null { + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; +} + function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "AfterAgent") return "gemini" - if (event === "Stop") { - if ("session_id" in input && !("sessionId" in input)) return "codex" - } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "AfterAgent") return "gemini"; + if (event === "Stop" && "session_id" in input) return "codex"; + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── State ───────────────────────────────────────────────────── function getStateDir(projectDir: string): string { - return join(projectDir, ".agents", "state") + return join(projectDir, ".agents", "state"); } -function readModeState(projectDir: string, workflow: string, sessionId: string): ModeState | null { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (!existsSync(path)) return null +function readModeState( + projectDir: string, + workflow: string, + sessionId: string, +): ModeState | 
null { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (!existsSync(path)) return null; try { - return JSON.parse(readFileSync(path, "utf-8")) as ModeState + return JSON.parse(readFileSync(path, "utf-8")) as ModeState; } catch { - return null + return null; } } export function isStale(state: ModeState): boolean { - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - return elapsed > STALE_HOURS * 60 * 60 * 1000 + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + return elapsed > STALE_HOURS * 60 * 60 * 1000; } -export function deactivate(projectDir: string, workflow: string, sessionId: string): void { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (existsSync(path)) unlinkSync(path) +export function deactivate( + projectDir: string, + workflow: string, + sessionId: string, +): void { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (existsSync(path)) unlinkSync(path); } -function incrementReinforcement(projectDir: string, workflow: string, sessionId: string, state: ModeState): void { - state.reinforcementCount += 1 - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) +function incrementReinforcement( + projectDir: string, + workflow: string, + sessionId: string, + state: ModeState, +): void { + state.reinforcementCount += 1; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const lang = detectLanguage(projectDir) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const lang = detectLanguage(projectDir); // Check all text fields in stdin for deactivation phrases. 
// The assistant may have included "workflow done" in its response, @@ -144,60 +189,60 @@ async function main() { input.transcript, ] .filter((v): v is string => typeof v === "string") - .join(" ") + .join(" "); if (textToCheck && isDeactivationRequest(textToCheck, lang)) { // Deactivate all persistent workflows for this session - const stateDir = join(projectDir, ".agents", "state") + const stateDir = join(projectDir, ".agents", "state"); if (existsSync(stateDir)) { try { - const suffix = `-state-${sessionId}.json` + const suffix = `-state-${sessionId}.json`; for (const file of readdirSync(stateDir)) { if (file.endsWith(suffix)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { /* ignore */ } } - process.exit(0) + process.exit(0); } - const persistentWorkflows = loadPersistentWorkflows() + const persistentWorkflows = loadPersistentWorkflows(); for (const workflow of persistentWorkflows) { - const state = readModeState(projectDir, workflow, sessionId) - if (!state) continue + const state = readModeState(projectDir, workflow, sessionId); + if (!state) continue; if (isStale(state) || state.reinforcementCount >= MAX_REINFORCEMENTS) { - deactivate(projectDir, workflow, sessionId) - continue + deactivate(projectDir, workflow, sessionId); + continue; } - incrementReinforcement(projectDir, workflow, sessionId, state) + incrementReinforcement(projectDir, workflow, sessionId, state); - const stateFile = `.agents/state/${workflow}-state-${sessionId}.json` + const stateFile = `.agents/state/${workflow}-state-${sessionId}.json`; const reason = [ `[OMA PERSISTENT MODE: ${workflow.toUpperCase()}]`, `The /${workflow} workflow is still active (reinforcement ${state.reinforcementCount}/${MAX_REINFORCEMENTS}).`, `Continue executing the workflow. If all tasks are genuinely complete:`, ` 1. Delete the state file: Bash \`rm ${stateFile}\``, ` 2. Or ask the user to say "워크플로우 완료" / "workflow done"`, - ].join("\n") + ].join("\n"); - writeBlockAndExit(vendor, reason) + writeBlockAndExit(vendor, reason); } - process.exit(0) + process.exit(0); } export function writeBlockAndExit(vendor: Vendor, reason: string): never { - process.stderr.write(reason) - process.stdout.write(makeBlockOutput(vendor, reason)) - process.exit(2) + process.stderr.write(reason); + process.stdout.write(makeBlockOutput(vendor, reason)); + process.exit(2); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.cursor/hooks/skill-injector.ts b/.cursor/hooks/skill-injector.ts index beda327..9ccce70 100644 --- a/.cursor/hooks/skill-injector.ts +++ b/.cursor/hooks/skill-injector.ts @@ -12,152 +12,163 @@ * persistent workflow is active (those modes own the session context). 
*/ -import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs" -import { basename, dirname, join } from "node:path" -import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_SKILLS = 3 -const SESSION_TTL_MS = 60 * 60 * 1000 -const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"] +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + writeFileSync, +} from "node:fs"; +import { basename, dirname, join } from "node:path"; +import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts"; + +const MAX_SKILLS = 3; +const SESSION_TTL_MS = 60 * 60 * 1000; +const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"]; // ── Vendor Detection ────────────────────────────────────────── function inferVendorFromScriptPath(): Vendor | null { - const path = import.meta.path - if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor" - if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen" - if (path.includes(`${join(".claude", "hooks")}`)) return "claude" - if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini" - if (path.includes(`${join(".codex", "hooks")}`)) return "codex" - return null + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; } function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - const byScriptPath = inferVendorFromScriptPath() - if (byScriptPath) return byScriptPath - if (event === "BeforeAgent") return "gemini" - if (event === "beforeSubmitPrompt") return "cursor" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeAgent") return "gemini"; + if (event === "beforeSubmitPrompt") return "cursor"; if (event === "UserPromptSubmit") { - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── Config Loading ──────────────────────────────────────────── interface 
SkillsTriggerConfig { - skills?: Record<string, { keywords: Record<string, string[]> }> - cjkScripts?: string[] + skills?: Record<string, { keywords: Record<string, string[]> }>; + cjkScripts?: string[]; } function loadTriggersConfig(): SkillsTriggerConfig { - const configPath = join(dirname(import.meta.path), "triggers.json") - if (!existsSync(configPath)) return {} + const configPath = join(dirname(import.meta.path), "triggers.json"); + if (!existsSync(configPath)) return {}; try { - return JSON.parse(readFileSync(configPath, "utf-8")) + return JSON.parse(readFileSync(configPath, "utf-8")); } catch { - return {} + return {}; } } function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? "en"; } catch { - return "en" + return "en"; } } // ── Pattern Building ────────────────────────────────────────── export function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } -export function buildTriggerPatterns(triggers: string[], lang: string, cjkScripts: string[]): RegExp[] { +export function buildTriggerPatterns( + triggers: string[], + lang: string, + cjkScripts: string[], +): RegExp[] { return triggers.map((kw) => { - const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+") - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) { - return new RegExp(escaped, "i") + const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+"); + if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) { + return new RegExp(escaped, "i"); } - return new RegExp(`\\b${escaped}\\b`, "i") - }) + return new RegExp(`\\b${escaped}\\b`, "i"); + }); } // ── Skill Discovery ─────────────────────────────────────────── export interface SkillEntry { - name: string - absolutePath: string - relPath: string + name: string; + absolutePath: string; + relPath: string; } export function discoverSkills(projectDir: string): SkillEntry[] { - const skillsDir = join(projectDir, ".agents", "skills") - if (!existsSync(skillsDir)) return [] + const skillsDir = join(projectDir, ".agents", "skills"); + if (!existsSync(skillsDir)) return []; - const out: SkillEntry[] = [] - let entries: ReturnType<typeof readdirSync> + const out: SkillEntry[] = []; + let entries: ReturnType<typeof readdirSync>; try { - entries = readdirSync(skillsDir, { withFileTypes: true }) + entries = readdirSync(skillsDir, { withFileTypes: true }); } catch { - return out + return out; } for (const entry of entries) { - if (!entry.isDirectory()) continue - if (entry.name.startsWith("_")) continue + if (!entry.isDirectory()) continue; + if (entry.name.startsWith("_")) continue; - const skillPath = join(skillsDir, entry.name, "SKILL.md") - if (!existsSync(skillPath)) continue + const skillPath = join(skillsDir, entry.name, "SKILL.md"); + if (!existsSync(skillPath)) continue; out.push({ name: entry.name, absolutePath: skillPath, relPath: join(".agents", "skills", entry.name, "SKILL.md"), - }) + }); } - return out + 
return out; } // ── Matching ────────────────────────────────────────────────── export interface SkillMatch { - name: string - relPath: string - score: number - matchedTriggers: string[] + name: string; + relPath: string; + score: number; + matchedTriggers: string[]; } export function matchSkills( @@ -166,37 +177,37 @@ export function matchSkills( skills: SkillEntry[], config: SkillsTriggerConfig, ): SkillMatch[] { - const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS - const matches: SkillMatch[] = [] + const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS; + const matches: SkillMatch[] = []; for (const skill of skills) { - const jsonEntry = config.skills?.[skill.name] - if (!jsonEntry) continue + const jsonEntry = config.skills?.[skill.name]; + if (!jsonEntry) continue; const jsonTriggers = [ ...(jsonEntry.keywords["*"] ?? []), ...(jsonEntry.keywords.en ?? []), ...(lang !== "en" ? (jsonEntry.keywords[lang] ?? []) : []), - ] + ]; - const seen = new Set<string>() - const allTriggers: string[] = [] + const seen = new Set<string>(); + const allTriggers: string[] = []; for (const t of jsonTriggers) { - const key = t.toLowerCase() - if (seen.has(key)) continue - seen.add(key) - allTriggers.push(t) + const key = t.toLowerCase(); + if (seen.has(key)) continue; + seen.add(key); + allTriggers.push(t); } - if (allTriggers.length === 0) continue + if (allTriggers.length === 0) continue; - const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts) - const matched: string[] = [] - let score = 0 + const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts); + const matched: string[] = []; + let score = 0; for (let i = 0; i < patterns.length; i++) { if (patterns[i].test(prompt)) { - matched.push(allTriggers[i]) - score += 10 + matched.push(allTriggers[i]); + score += 10; } } @@ -206,43 +217,45 @@ export function matchSkills( relPath: skill.relPath, score, matchedTriggers: matched, - }) + }); } } - matches.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.name.localeCompare(b.name))) - return matches.slice(0, MAX_SKILLS) + matches.sort((a, b) => + b.score !== a.score ? 
b.score - a.score : a.name.localeCompare(b.name), + ); + return matches.slice(0, MAX_SKILLS); } // ── Session Dedup State ─────────────────────────────────────── interface SessionState { - sessions: Record<string, { injected: string[]; timestamp: number }> + sessions: Record<string, { injected: string[]; timestamp: number }>; } function getStatePath(projectDir: string): string { - return join(projectDir, ".agents", "state", "skill-sessions.json") + return join(projectDir, ".agents", "state", "skill-sessions.json"); } function readState(projectDir: string): SessionState { - const p = getStatePath(projectDir) - if (!existsSync(p)) return { sessions: {} } + const p = getStatePath(projectDir); + if (!existsSync(p)) return { sessions: {} }; try { - const parsed = JSON.parse(readFileSync(p, "utf-8")) + const parsed = JSON.parse(readFileSync(p, "utf-8")); if (parsed && typeof parsed === "object" && parsed.sessions) { - return parsed as SessionState + return parsed as SessionState; } } catch { // corrupted — reset } - return { sessions: {} } + return { sessions: {} }; } function writeState(projectDir: string, state: SessionState): void { - const p = getStatePath(projectDir) + const p = getStatePath(projectDir); try { - mkdirSync(dirname(p), { recursive: true }) - writeFileSync(p, JSON.stringify(state, null, 2)) + mkdirSync(dirname(p), { recursive: true }); + writeFileSync(p, JSON.stringify(state, null, 2)); } catch { // dedup failing open is acceptable } @@ -254,47 +267,57 @@ export function filterFreshMatches( sessionId: string, now: number = Date.now(), ): { fresh: SkillMatch[]; nextState: SessionState } { - const state = readState(projectDir) + const state = readState(projectDir); for (const [id, sess] of Object.entries(state.sessions)) { if (now - sess.timestamp > SESSION_TTL_MS) { - delete state.sessions[id] + delete state.sessions[id]; } } - const current = state.sessions[sessionId] - const alreadyInjected = new Set(current && now - current.timestamp <= SESSION_TTL_MS ? current.injected : []) + const current = state.sessions[sessionId]; + const alreadyInjected = new Set( + current && now - current.timestamp <= SESSION_TTL_MS + ? current.injected + : [], + ); - const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)) + const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)); if (fresh.length > 0) { - const existing = state.sessions[sessionId]?.injected ?? [] + const existing = state.sessions[sessionId]?.injected ?? 
[]; state.sessions[sessionId] = { injected: [...new Set([...existing, ...fresh.map((m) => m.relPath)])], timestamp: now, - } + }; } - return { fresh, nextState: state } + return { fresh, nextState: state }; } // ── Workflow Guard ──────────────────────────────────────────── -export function isPersistentWorkflowActive(projectDir: string, sessionId: string): boolean { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return false +export function isPersistentWorkflowActive( + projectDir: string, + sessionId: string, +): boolean { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return false; try { - const files = readdirSync(stateDir) - return files.some((f) => f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json") + const files = readdirSync(stateDir); + return files.some( + (f) => + f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json", + ); } catch { - return false + return false; } } // ── Prompt Sanitation ───────────────────────────────────────── export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } export function stripCodeBlocks(text: string): string { @@ -303,7 +326,7 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") .replace(/`{3,}[^`]*`{3,}/g, "") .replace(/`[^`\n]+`/g, "") - .replace(/"[^"\n]*"/g, "") + .replace(/"[^"\n]*"/g, ""); } // ── Context Formatting ──────────────────────────────────────── @@ -313,55 +336,61 @@ export function formatContext(matches: SkillMatch[]): string { `[OMA SKILLS DETECTED: ${matches.map((m) => m.name).join(", ")}]`, "User intent matches the following skills:", "", - ] + ]; for (const m of matches) { - lines.push(`- **${m.name}** — \`${m.relPath}\``) - lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`) + lines.push(`- **${m.name}** — \`${m.relPath}\``); + lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`); } - lines.push("") - lines.push("Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.") - return lines.join("\n") + lines.push(""); + lines.push( + "Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.", + ); + return lines.join("\n"); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? 
"" - - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) - if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0) - - const lang = detectLanguage(projectDir) - const config = loadTriggersConfig() - const cleaned = stripCodeBlocks(prompt) - const skills = discoverSkills(projectDir) - - const matches = matchSkills(cleaned, lang, skills, config) - if (matches.length === 0) process.exit(0) - - const { fresh, nextState } = filterFreshMatches(matches, projectDir, sessionId) - if (fresh.length === 0) process.exit(0) - - writeState(projectDir, nextState) - process.stdout.write(makePromptOutput(vendor, formatContext(fresh))) - process.exit(0) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? ""; + + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); + if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0); + + const lang = detectLanguage(projectDir); + const config = loadTriggersConfig(); + const cleaned = stripCodeBlocks(prompt); + const skills = discoverSkills(projectDir); + + const matches = matchSkills(cleaned, lang, skills, config); + if (matches.length === 0) process.exit(0); + + const { fresh, nextState } = filterFreshMatches( + matches, + projectDir, + sessionId, + ); + if (fresh.length === 0) process.exit(0); + + writeState(projectDir, nextState); + process.stdout.write(makePromptOutput(vendor, formatContext(fresh))); + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } // Avoid unused-import lint for basename when testing subsets of this module. 
-void basename +void basename; diff --git a/.cursor/hooks/test-filter.ts b/.cursor/hooks/test-filter.ts index a0ce2fc..a3ad992 100644 --- a/.cursor/hooks/test-filter.ts +++ b/.cursor/hooks/test-filter.ts @@ -1,51 +1,61 @@ // PreToolUse hook — Filter test output to show only failures // Works with: Claude Code, Codex CLI, Gemini CLI, Qwen Code -import { existsSync } from "node:fs" -import { join } from "node:path" -import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts"; // --- Vendor detection (same logic as keyword-detector.ts) --- +function inferVendorFromScriptPath(): Vendor | null { + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; +} + function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "BeforeTool") return "gemini" - if (event === "PreToolUse") { - if ("session_id" in input && !("sessionId" in input)) return "codex" - } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeTool") return "gemini"; + // Keep parity with keyword-detector.ts: codex sends snake_case session_id only + if (event === "PreToolUse" && "session_id" in input && !("sessionId" in input)) return "codex"; + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getHookDir(vendor: Vendor): string { switch (vendor) { case "codex": - return ".codex/hooks" + return ".codex/hooks"; + case "cursor": + return ".cursor/hooks"; case "gemini": - return ".gemini/hooks" + return ".gemini/hooks"; case "qwen": - return ".qwen/hooks" + return ".qwen/hooks"; default: - return ".claude/hooks" + return ".claude/hooks"; } } @@ -78,66 +88,70 @@ const TEST_PATTERNS = [ /\brspec\b/, /\bmix\s+test\b/, /\bphpunit\b/, -] +]; // Commands that mention test runners but aren't running tests const EXCLUDE_PATTERNS = [ /\b(install|add|remove|uninstall|init)\b/, /\b(cat|head|tail|less|more|wc)\b.*\.(test|spec)\./, -] +]; // --- Hook input --- interface PreToolUseInput { - tool_name: string + tool_name: string; tool_input: { - command?: string - [key: string]: unknown - } - hook_event_name?: string - session_id?: string - sessionId?: string - cwd?: string + tool_input: { + command?: string; + [key: string]: unknown; + }; + hook_event_name?: string; + session_id?: string; + sessionId?: 
string; + cwd?: string; } // --- Main --- -const raw = await Bun.stdin.text() -if (!raw.trim()) process.exit(0) +const raw = await Bun.stdin.text(); +if (!raw.trim()) process.exit(0); -const input: PreToolUseInput = JSON.parse(raw) +const input: PreToolUseInput = JSON.parse(raw); // Gemini uses run_shell_command; Claude-family uses Bash. if (input.tool_name !== "Bash" && input.tool_name !== "run_shell_command") { - process.exit(0) + process.exit(0); } -const command = input.tool_input?.command -if (!command) process.exit(0) +const command = input.tool_input?.command; +if (!command) process.exit(0); // Check if this is a test command -const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)) -if (!isTestCommand) process.exit(0) +const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)); +if (!isTestCommand) process.exit(0); // Skip if it's a non-test use of test tool names (install, cat, etc.) -const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)) -if (isExcluded) process.exit(0) +const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)); +if (isExcluded) process.exit(0); // Detect vendor and resolve project dir -const vendor = detectVendor(input) -const projectDir = getProjectDir(vendor, input) -const filterScript = join(projectDir, getHookDir(vendor), "filter-test-output.sh") +const vendor = detectVendor(input); +const projectDir = getProjectDir(vendor, input); +const filterScript = join( + projectDir, + getHookDir(vendor), + "filter-test-output.sh", +); // Skip filtering if the script doesn't exist (hooks not fully installed) -if (!existsSync(filterScript)) process.exit(0) +if (!existsSync(filterScript)) process.exit(0); // Rewrite command to pipe through filter -const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"` +const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"`; // Return updated input with all original fields preserved const updatedInput: Record<string, unknown> = { ...input.tool_input, command: filteredCmd, -} +}; -console.log(makePreToolOutput(vendor, updatedInput)) +console.log(makePreToolOutput(vendor, updatedInput)); diff --git a/.cursor/hooks/triggers.json b/.cursor/hooks/triggers.json index f404583..0a1513f 100644 --- a/.cursor/hooks/triggers.json +++ b/.cursor/hooks/triggers.json @@ -43,9 +43,35 @@ "全部お願い", "まとめてやって" ], - "zh": ["编排", "并行执行", "自动执行", "全部执行", "全部做", "自动处理", "一起做", "全做了", "帮我全做"], - "es": ["orquestar", "paralelo", "ejecutar todo", "hazlo todo", "ejecuta todo", "automatiza", "haz todo"], - "fr": ["orchestrer", "parallèle", "tout exécuter", "fais tout", "exécute tout", "automatise", "gère tout"], + "zh": [ + "编排", + "并行执行", + "自动执行", + "全部执行", + "全部做", + "自动处理", + "一起做", + "全做了", + "帮我全做" + ], + "es": [ + "orquestar", + "paralelo", + "ejecutar todo", + "hazlo todo", + "ejecuta todo", + "automatiza", + "haz todo" + ], + "fr": [ + "orchestrer", + "parallèle", + "tout exécuter", + "fais tout", + "exécute tout", + "automatise", + "gère tout" + ], "de": [ "orchestrieren", "parallel", @@ -55,7 +81,15 @@ "automatisieren", "alles auf einmal" ], - "pt": ["orquestrar", "paralelo", "executar tudo", "faça tudo", "execute tudo", "automatize", "resolva tudo"], + "pt": [ + "orquestrar", + "paralelo", + "executar tudo", + "faça tudo", + "execute tudo", + "automatize", + "resolva tudo" + ], "ru": [ "оркестровать", "параллельно", @@ -128,7 +162,16 @@ "トレードオフ", "品質特性" ], - "zh": ["架构", "系统设计", "软件设计", "架构评审", "模块边界", "服务边界", "权衡分析", "质量属性"], + "zh": [ + "架构", + "系统设计", + "软件设计", + "架构评审", 
+ "模块边界", + "服务边界", + "权衡分析", + "质量属性" + ], "es": [ "arquitectura", "diseño de sistemas", @@ -205,7 +248,15 @@ "persistent": false, "keywords": { "*": ["task breakdown"], - "en": ["plan", "make a plan", "create a plan", "break down", "analyze requirements", "plan this", "decompose"], + "en": [ + "plan", + "make a plan", + "create a plan", + "break down", + "analyze requirements", + "plan this", + "decompose" + ], "ko": [ "계획", "요구사항 분석", @@ -235,7 +286,16 @@ "設計して", "プランを作って" ], - "zh": ["计划", "需求分析", "任务分解", "制定计划", "做个计划", "分析一下", "拆分任务", "规划一下"], + "zh": [ + "计划", + "需求分析", + "任务分解", + "制定计划", + "做个计划", + "分析一下", + "拆分任务", + "规划一下" + ], "es": [ "plan", "planificar", @@ -286,7 +346,15 @@ "разбей на задачи", "спланируй" ], - "nl": ["plan", "plannen", "vereistenanalyse", "maak een plan", "analyseer", "splits op", "plan dit"], + "nl": [ + "plan", + "plannen", + "vereistenanalyse", + "maak een plan", + "analyseer", + "splits op", + "plan dit" + ], "pl": [ "plan", "planować", @@ -303,7 +371,15 @@ "persistent": false, "keywords": { "*": ["code review", "security audit", "security review"], - "en": ["review", "review this", "review my code", "check my code", "audit", "inspect", "code check"], + "en": [ + "review", + "review this", + "review my code", + "check my code", + "audit", + "inspect", + "code check" + ], "ko": [ "리뷰", "코드 검토", @@ -330,7 +406,17 @@ "点検して", "コード確認" ], - "zh": ["审查", "代码审查", "安全审计", "审查一下", "检查一下", "看看代码", "检查代码", "代码检查", "安全检查"], + "zh": [ + "审查", + "代码审查", + "安全审计", + "审查一下", + "检查一下", + "看看代码", + "检查代码", + "代码检查", + "安全检查" + ], "es": [ "revisión", "revisar código", @@ -613,7 +699,17 @@ "アイデアちょうだい", "一緒に考えよう" ], - "zh": ["头脑风暴", "创意", "设计探索", "想想", "出主意", "有什么想法", "想个办法", "出点子", "集思广益"], + "zh": [ + "头脑风暴", + "创意", + "设计探索", + "想想", + "出主意", + "有什么想法", + "想个办法", + "出点子", + "集思广益" + ], "es": [ "lluvia de ideas", "idear", @@ -644,7 +740,16 @@ "vorschläge", "lass uns überlegen" ], - "pt": ["brainstorming", "idear", "explorar design", "pense em", "e se", "ideias para", "sugira", "imagine"], + "pt": [ + "brainstorming", + "idear", + "explorar design", + "pense em", + "e se", + "ideias para", + "sugira", + "imagine" + ], "ru": [ "мозговой штурм", "идеи", @@ -681,7 +786,13 @@ "persistent": true, "keywords": { "*": ["work", "step by step"], - "en": ["one by one", "guide me", "walk me through", "manual mode", "one step at a time"], + "en": [ + "one by one", + "guide me", + "walk me through", + "manual mode", + "one step at a time" + ], "ko": [ "단계별", "단계별로", @@ -693,9 +804,32 @@ "차근차근 해줘", "수동으로 해줘" ], - "ja": ["ステップバイステップ", "一歩ずつ", "ガイドして", "手動で", "一つずつ", "順番にやって", "手順を教えて"], - "zh": ["逐步", "一步一步", "指导我", "手动", "一个一个", "按顺序", "带我做"], - "es": ["paso a paso", "guíame", "uno por uno", "modo manual", "de a uno", "llévame paso a paso"], + "ja": [ + "ステップバイステップ", + "一歩ずつ", + "ガイドして", + "手動で", + "一つずつ", + "順番にやって", + "手順を教えて" + ], + "zh": [ + "逐步", + "一步一步", + "指导我", + "手动", + "一个一个", + "按顺序", + "带我做" + ], + "es": [ + "paso a paso", + "guíame", + "uno por uno", + "modo manual", + "de a uno", + "llévame paso a paso" + ], "fr": [ "étape par étape", "guide-moi", @@ -712,8 +846,22 @@ "zeig mir wie", "der reihe nach" ], - "pt": ["passo a passo", "me guie", "um por um", "modo manual", "me acompanhe", "me mostre passo a passo"], - "ru": ["шаг за шагом", "направь меня", "по одному", "ручной режим", "покажи по шагам", "веди меня"], + "pt": [ + "passo a passo", + "me guie", + "um por um", + "modo manual", + "me acompanhe", + "me mostre passo a passo" + ], + "ru": [ + "шаг за шагом", + "направь 
меня", + "по одному", + "ручной режим", + "покажи по шагам", + "веди меня" + ], "nl": [ "stap voor stap", "begeleid me", @@ -736,7 +884,14 @@ "persistent": false, "keywords": { "*": ["deepinit"], - "en": ["init project", "initialize", "setup project", "new project", "scaffold", "bootstrap"], + "en": [ + "init project", + "initialize", + "setup project", + "new project", + "scaffold", + "bootstrap" + ], "ko": [ "프로젝트 초기화", "코드베이스 초기화", @@ -757,7 +912,15 @@ "プロジェクトを作って", "プロジェクト設定" ], - "zh": ["项目初始化", "新项目", "设置项目", "搭建项目", "初始化", "创建项目", "项目配置"], + "zh": [ + "项目初始化", + "新项目", + "设置项目", + "搭建项目", + "初始化", + "创建项目", + "项目配置" + ], "es": [ "inicializar proyecto", "nuevo proyecto", @@ -1381,8 +1544,20 @@ "define boundaries", "architecture tradeoffs" ], - "ko": ["아키텍처 짜줘", "시스템 구조 설계", "경계 정의해줘", "구조 검토해줘", "아키텍처 문서"], - "ja": ["アーキテクチャを設計", "システム構成を考えて", "境界を定義", "構成レビュー", "アーキ文書"], + "ko": [ + "아키텍처 짜줘", + "시스템 구조 설계", + "경계 정의해줘", + "구조 검토해줘", + "아키텍처 문서" + ], + "ja": [ + "アーキテクチャを設計", + "システム構成を考えて", + "境界を定義", + "構成レビュー", + "アーキ文書" + ], "zh": ["设计架构", "系统架构方案", "定义边界", "架构文档", "架构权衡"] } }, @@ -1403,17 +1578,45 @@ "server implementation", "clean architecture" ], - "ko": ["api 만들어줘", "엔드포인트 추가", "백엔드 구현", "마이그레이션 작성", "인증 붙여줘"], - "ja": ["apiを作って", "エンドポイント追加", "バックエンド実装", "マイグレーション書いて", "認証を実装"], + "ko": [ + "api 만들어줘", + "엔드포인트 추가", + "백엔드 구현", + "마이그레이션 작성", + "인증 붙여줘" + ], + "ja": [ + "apiを作って", + "エンドポイント追加", + "バックエンド実装", + "マイグレーション書いて", + "認証を実装" + ], "zh": ["写个接口", "加接口", "后端实现", "写迁移", "加认证"] } }, "oma-brainstorm": { "keywords": { "*": [], - "en": ["toss around ideas", "kick around options", "spitball", "some ideas please", "ideation session"], - "ko": ["아이디어 좀 뽑아줘", "같이 고민해줘", "아이디어 내보자", "방향성 고민"], - "ja": ["アイデア出して", "一緒に考えて", "方向性を探りたい", "案を出して"], + "en": [ + "toss around ideas", + "kick around options", + "spitball", + "some ideas please", + "ideation session" + ], + "ko": [ + "아이디어 좀 뽑아줘", + "같이 고민해줘", + "아이디어 내보자", + "방향성 고민" + ], + "ja": [ + "アイデア出して", + "一緒に考えて", + "方向性を探りたい", + "案を出して" + ], "zh": ["帮我想想", "一起想想办法", "给点灵感"] } }, @@ -1430,8 +1633,18 @@ "cli handoff", "manual orchestration" ], - "ko": ["에이전트 조율", "에이전트끼리 협업", "수동으로 에이전트 돌려", "에이전트 순서 잡아줘"], - "ja": ["エージェントを調整", "エージェント連携", "手動でエージェント", "エージェントの順序"], + "ko": [ + "에이전트 조율", + "에이전트끼리 협업", + "수동으로 에이전트 돌려", + "에이전트 순서 잡아줘" + ], + "ja": [ + "エージェントを調整", + "エージェント連携", + "手動でエージェント", + "エージェントの順序" + ], "zh": ["协调代理", "代理之间协作", "手动跑代理", "代理之间衔接"] } }, @@ -1454,8 +1667,20 @@ "data migration", "capacity planning" ], - "ko": ["스키마 설계", "테이블 설계", "인덱스 튜닝", "쿼리 느려", "용량 산정"], - "ja": ["スキーマ設計", "テーブル設計", "インデックス調整", "クエリが遅い", "容量見積"], + "ko": [ + "스키마 설계", + "테이블 설계", + "인덱스 튜닝", + "쿼리 느려", + "용량 산정" + ], + "ja": [ + "スキーマ設計", + "テーブル設計", + "インデックス調整", + "クエリが遅い", + "容量見積" + ], "zh": ["设计表结构", "表设计", "索引优化", "查询很慢", "容量评估"] } }, @@ -1474,8 +1699,20 @@ "crash fix", "error investigation" ], - "ko": ["버그 찾아줘", "에러 원인", "크래시 분석", "스택트레이스 봐줘", "원인 파악해줘"], - "ja": ["バグを探して", "エラー原因", "クラッシュを分析", "スタックトレースを見て", "原因を特定"], + "ko": [ + "버그 찾아줘", + "에러 원인", + "크래시 분석", + "스택트레이스 봐줘", + "원인 파악해줘" + ], + "ja": [ + "バグを探して", + "エラー原因", + "クラッシュを分析", + "スタックトレースを見て", + "原因を特定" + ], "zh": ["找出 bug", "错误原因", "分析崩溃", "看堆栈", "定位原因"] } }, @@ -1493,8 +1730,19 @@ "responsive layout", "motion design" ], - "ko": ["디자인 토큰", "랜딩 만들어줘", "컬러 팔레트 잡아줘", "타이포 스케일", "모션 가이드"], - "ja": ["デザイントークン", "ランディング作成", "カラーパレット決めて", "モーション設計"], + "ko": [ + "디자인 토큰", + "랜딩 만들어줘", + "컬러 팔레트 잡아줘", + "타이포 스케일", + "모션 가이드" + ], + "ja": [ + "デザイントークン", + "ランディング作成", 
+ "カラーパレット決めて", + "モーション設計" + ], "zh": ["设计令牌", "做个落地页", "定配色", "字体层级", "动效规范"] } }, @@ -1514,9 +1762,27 @@ "release automation", "build automation" ], - "ko": ["mise 태스크", "ci 파이프라인", "릴리즈 자동화", "깃 훅 설정", "모노레포 워크플로우"], - "ja": ["miseタスク", "ciパイプライン", "リリース自動化", "gitフック", "モノレポ作業"], - "zh": ["mise 任务", "ci 流水线", "发布自动化", "git 钩子", "monorepo 工作流"] + "ko": [ + "mise 태스크", + "ci 파이프라인", + "릴리즈 자동화", + "깃 훅 설정", + "모노레포 워크플로우" + ], + "ja": [ + "miseタスク", + "ciパイプライン", + "リリース自動化", + "gitフック", + "モノレポ作業" + ], + "zh": [ + "mise 任务", + "ci 流水线", + "发布自动化", + "git 钩子", + "monorepo 工作流" + ] } }, "oma-frontend": { @@ -1534,9 +1800,27 @@ "frontend ui", "FSD architecture" ], - "ko": ["리액트 컴포넌트", "넥스트 페이지", "tailwind로 스타일", "shadcn 붙여줘", "프론트 구현"], - "ja": ["reactコンポーネント", "nextページ", "tailwindで装飾", "shadcn導入", "フロント実装"], - "zh": ["写个 react 组件", "next 页面", "用 tailwind", "接入 shadcn", "前端实现"] + "ko": [ + "리액트 컴포넌트", + "넥스트 페이지", + "tailwind로 스타일", + "shadcn 붙여줘", + "프론트 구현" + ], + "ja": [ + "reactコンポーネント", + "nextページ", + "tailwindで装飾", + "shadcn導入", + "フロント実装" + ], + "zh": [ + "写个 react 组件", + "next 页面", + "用 tailwind", + "接入 shadcn", + "前端实现" + ] } }, "oma-hwp": { @@ -1551,7 +1835,16 @@ "hangul word processor", "hwp ingestion" ], - "ko": ["한글 파일", "한글 변환", "한글 파싱", "hwp 변환", "hwp 파싱", "hwp 마크다운", "hwpx 변환", "hwpx 파싱"], + "ko": [ + "한글 파일", + "한글 변환", + "한글 파싱", + "hwp 변환", + "hwp 파싱", + "hwp 마크다운", + "hwpx 변환", + "hwpx 파싱" + ], "ja": ["hwp変換", "hwpをマークダウン", "hwpを解析", "韓国語ワープロ"], "zh": ["hwp 转换", "hwp 解析", "hwp 转 markdown", "韩文文档"] } @@ -1571,9 +1864,233 @@ "mobile app", "android ios" ], - "ko": ["플러터 화면", "리액트 네이티브 화면", "다트 위젯", "안드로이드 아이폰 앱", "모바일 앱"], - "ja": ["flutter画面", "react native画面", "dartウィジェット", "iosアンドロイド", "モバイルアプリ"], - "zh": ["flutter 页面", "react native 页面", "dart 组件", "安卓 ios", "移动端应用"] + "ko": [ + "플러터 화면", + "리액트 네이티브 화면", + "다트 위젯", + "안드로이드 아이폰 앱", + "모바일 앱" + ], + "ja": [ + "flutter画面", + "react native画面", + "dartウィジェット", + "iosアンドロイド", + "モバイルアプリ" + ], + "zh": [ + "flutter 页面", + "react native 页面", + "dart 组件", + "安卓 ios", + "移动端应用" + ] + } + }, + "oma-observability": { + "keywords": { + "*": [ + "OpenTelemetry", + "OTel", + "OTLP", + "W3C Trace Context", + "traceparent", + "MELT", + "APM", + "RUM", + "SLO", + "SLI", + "burn-rate", + "PromQL", + "Prometheus", + "Grafana", + "Jaeger", + "Tempo", + "Loki", + "Mimir", + "Fluent Bit", + "OpenCost", + "OpenFeature", + "Flagger", + "Falco", + "Parca", + "Pyroscope", + "Honeycomb", + "Datadog", + "Sentry", + "Crashlytics", + "Core Web Vitals" + ], + "en": [ + "observability", + "traceability", + "telemetry", + "distributed tracing", + "instrument my service", + "set up OTel", + "OTel pipeline", + "collector topology", + "tail sampling", + "cardinality budget", + "clock skew", + "error budget", + "burn rate alert", + "canary analysis", + "progressive delivery", + "feature flag observability", + "incident forensics", + "6-dimension localization", + "root cause across services", + "multi-tenant telemetry", + "per-tenant sampling", + "data residency telemetry", + "redact PII in logs", + "observability as code", + "dashboard as code", + "PrometheusRule CRD", + "Grafana Jsonnet", + "Perses dashboard", + "UDP MTU telemetry", + "StatsD fragmentation", + "OTLP gRPC vs HTTP", + "propagator matrix", + "BGP observability", + "QUIC observability", + "eBPF observability", + "service mesh tracing", + "zero code instrumentation", + "mobile crash analytics", + "crash-free rate", + "symbolication pipeline", + "offline telemetry queue" + ], + "ko": [ + 
"관측성", + "관측 가능성", + "추적성", + "추적 가능성", + "텔레메트리", + "텔레메트리 수집", + "분산 트레이싱", + "OTel 도입", + "OTel 셋업", + "OTel 계측", + "OTel 파이프라인", + "컬렉터 토폴로지", + "테일 샘플링", + "카디널리티", + "카디널리티 관리", + "클록 스큐", + "시계 드리프트", + "에러 버짓", + "에러 예산", + "번레이트 알람", + "번레이트", + "카나리 분석", + "프로그레시브 딜리버리", + "점진 배포", + "피처 플래그 관측", + "사건 부검", + "장애 부검", + "장애 원인 분석", + "6차원 좁히기", + "멀티테넌트 관측", + "테넌트별 샘플링", + "데이터 거주 관측", + "로그 PII 제거", + "로그 익명화", + "로그 가명화", + "관측성 as code", + "대시보드 as code", + "대시보드 코드화", + "PrometheusRule", + "Grafana Jsonnet", + "Perses 대시보드", + "UDP MTU 튜닝", + "StatsD 단편화", + "OTLP gRPC 선택", + "전파자 매핑", + "BGP 관측", + "QUIC 관측", + "eBPF 관측", + "서비스 메시 트레이싱", + "zero-code 계측", + "모바일 크래시 분석", + "크래시 프리 레이트", + "심볼리케이션", + "오프라인 텔레메트리 큐" + ], + "ja": [ + "オブザーバビリティ", + "トレーサビリティ", + "テレメトリ", + "分散トレーシング", + "OTel導入", + "OTelパイプライン", + "コレクタ構成", + "テイルサンプリング", + "カーディナリティ予算", + "クロックスキュー", + "エラーバジェット", + "バーンレートアラート", + "カナリア分析", + "プログレッシブデリバリ", + "機能フラグ観測", + "インシデントフォレンジック", + "マルチテナント観測", + "データ居住性観測", + "ログPII除去", + "Observability as Code", + "Dashboard as Code", + "UDP MTUチューニング", + "StatsDフラグメンテーション", + "OTLP選択", + "プロパゲータマッピング", + "BGP観測", + "QUIC観測", + "eBPF観測", + "サービスメッシュトレース", + "モバイルクラッシュ分析", + "クラッシュフリーレート", + "シンボリケーション", + "オフラインテレメトリ" + ], + "zh": [ + "可观测性", + "可追溯性", + "遥测", + "分布式追踪", + "OTel 接入", + "OTel 流水线", + "采集器拓扑", + "尾采样", + "基数预算", + "时钟漂移", + "错误预算", + "燃烧率告警", + "金丝雀分析", + "渐进式发布", + "特性开关观测", + "事件取证", + "多租户观测", + "数据驻留观测", + "日志脱敏", + "可观测性即代码", + "仪表盘即代码", + "UDP MTU 调优", + "StatsD 分片", + "OTLP 选择", + "传播器映射", + "BGP 观测", + "QUIC 观测", + "eBPF 观测", + "服务网格追踪", + "零代码探针", + "移动崩溃分析", + "崩溃无事率", + "符号化", + "离线遥测队列" + ] } }, "oma-orchestrator": { @@ -1590,8 +2107,18 @@ "review loop", "mcp memory coordination" ], - "ko": ["에이전트 병렬 실행", "동시에 에이전트 돌려", "fan-out", "리뷰 루프 돌려"], - "ja": ["エージェント並列実行", "同時にエージェント", "fan-out", "レビューループ"], + "ko": [ + "에이전트 병렬 실행", + "동시에 에이전트 돌려", + "fan-out", + "리뷰 루프 돌려" + ], + "ja": [ + "エージェント並列実行", + "同時にエージェント", + "fan-out", + "レビューループ" + ], "zh": ["并行跑代理", "同时派发代理", "fan-out 任务", "评审循环"] } }, @@ -1628,8 +2155,20 @@ "scope definition", "prioritization matrix" ], - "ko": ["요구사항 정리", "스펙 문서", "우선순위 매겨줘", "스코프 정의", "제품 로드맵"], - "ja": ["要件を整理", "スペック作成", "優先度付け", "スコープ定義", "プロダクトロードマップ"], + "ko": [ + "요구사항 정리", + "스펙 문서", + "우선순위 매겨줘", + "스코프 정의", + "제품 로드맵" + ], + "ja": [ + "要件を整理", + "スペック作成", + "優先度付け", + "スコープ定義", + "プロダクトロードマップ" + ], "zh": ["梳理需求", "写规格书", "排优先级", "界定范围", "产品路线图"] } }, @@ -1647,7 +2186,12 @@ "test coverage" ], "ko": ["접근성 점검", "성능 점검", "커버리지 확인", "품질 게이트"], - "ja": ["アクセシビリティ確認", "パフォーマンス点検", "カバレッジ確認", "品質ゲート"], + "ja": [ + "アクセシビリティ確認", + "パフォーマンス点検", + "カバレッジ確認", + "品質ゲート" + ], "zh": ["无障碍检查", "性能检查", "覆盖率报告", "质量门禁"] } }, @@ -1666,8 +2210,20 @@ "transcript analysis", "multi tool recap" ], - "ko": ["오늘 한 일 정리", "하루 요약", "주간 요약", "작업 내용 정리", "대화 요약"], - "ja": ["今日の作業まとめ", "日次サマリ", "週次サマリ", "作業振り返り", "会話まとめ"], + "ko": [ + "오늘 한 일 정리", + "하루 요약", + "주간 요약", + "작업 내용 정리", + "대화 요약" + ], + "ja": [ + "今日の作業まとめ", + "日次サマリ", + "週次サマリ", + "作業振り返り", + "会話まとめ" + ], "zh": ["今天做了什么", "日报总结", "周报总结", "工作回顾", "对话总结"] } }, @@ -1685,7 +2241,12 @@ "git worktree" ], "ko": ["머지 충돌 해결", "리베이스해줘", "워크트리 써줘"], - "ja": ["マージ衝突解決", "リベースして", "リリースタグ", "worktree使って"], + "ja": [ + "マージ衝突解決", + "リベースして", + "リリースタグ", + "worktree使って" + ], "zh": ["解决合并冲突", "帮我 rebase", "打发布标签", "用 worktree"] } }, @@ -1705,8 +2266,20 @@ "library reference", "context7 docs" ], - "ko": ["검색해줘", "찾아줘", "레퍼런스 찾아", "문서 찾아줘", "라이브러리 찾아줘"], - "ja": ["検索して", "調べて", 
"ドキュメント探して", "ライブラリ調べて", "リファレンス探して"], + "ko": [ + "검색해줘", + "찾아줘", + "레퍼런스 찾아", + "문서 찾아줘", + "라이브러리 찾아줘" + ], + "ja": [ + "検索して", + "調べて", + "ドキュメント探して", + "ライブラリ調べて", + "リファレンス探して" + ], "zh": ["帮我查", "搜一下", "找找文档", "找个库", "查参考资料"] } }, @@ -1725,9 +2298,27 @@ "oidc setup", "cost optimization" ], - "ko": ["테라폼 플랜", "인프라 프로비저닝", "iac 모듈", "클라우드 리소스", "비용 최적화"], - "ja": ["terraformプラン", "インフラ構築", "iacモジュール", "クラウドリソース", "コスト最適化"], - "zh": ["terraform plan", "搭建基础设施", "iac 模块", "云资源", "成本优化"] + "ko": [ + "테라폼 플랜", + "인프라 프로비저닝", + "iac 모듈", + "클라우드 리소스", + "비용 최적화" + ], + "ja": [ + "terraformプラン", + "インフラ構築", + "iacモジュール", + "クラウドリソース", + "コスト最適化" + ], + "zh": [ + "terraform plan", + "搭建基础设施", + "iac 模块", + "云资源", + "成本优化" + ] } }, "oma-translator": { @@ -1744,10 +2335,104 @@ "multilingual content", "arb translation" ], - "ko": ["번역해줘", "번역 부탁", "다국어로", "영어로 바꿔줘", "현지화해줘"], + "ko": [ + "번역해줘", + "번역 부탁", + "다국어로", + "영어로 바꿔줘", + "현지화해줘" + ], "ja": ["翻訳して", "英訳", "多言語化", "ローカライズして", "訳して"], "zh": ["翻译一下", "帮我翻译", "多语言", "本地化", "翻成英文"] } + }, + "oma-image": { + "keywords": { + "*": [ + "nano-banana", + "nanobanana", + "gpt-image", + "pollinations", + "oma-image" + ], + "en": [ + "generate image", + "generate an image", + "create image", + "create an image", + "make a picture", + "make an image", + "render image", + "render a picture", + "draw me", + "draw a", + "ai image", + "image generation", + "generate a photo", + "create picture", + "picture of", + "image of" + ], + "ko": [ + "이미지 만들어", + "이미지 만들어줘", + "이미지 생성", + "이미지 생성해", + "이미지 생성해줘", + "사진 만들어", + "사진 만들어줘", + "그림 그려", + "그림 그려줘", + "이미지 뽑아", + "이미지 뽑아줘", + "이미지 그려줘", + "이미지 출력", + "나노바나나", + "나노 바나나", + "바나나로 뽑", + "이미지 생성기", + "ai 이미지" + ], + "ja": [ + "画像を生成", + "画像生成", + "画像を作", + "画像を作成", + "絵を描いて", + "画像出力", + "イラストを生成", + "写真を生成" + ], + "zh": [ + "生成图像", + "生成图片", + "生成一张", + "画一张", + "画一幅", + "帮我画", + "出图", + "图像生成", + "图片生成" + ], + "es": [ + "generar imagen", + "crear imagen", + "hazme una imagen", + "genera una foto" + ], + "fr": [ + "générer une image", + "créer une image", + "fais-moi une image", + "dessine-moi" + ], + "de": [ + "bild generieren", + "bild erstellen", + "erstelle ein bild", + "zeichne mir" + ] + } } }, "informationalPatterns": { @@ -1768,22 +2453,108 @@ "是什么", "とは" ], - "ko": ["뭐야", "뭐임", "무엇", "어떻게", "설명해", "알려줘", "키워드", "감지", "오탐"], - "ja": ["とは", "って何", "どうやって", "説明して", "キーワード", "検出", "誤検出"], + "ko": [ + "뭐야", + "뭐임", + "무엇", + "어떻게", + "설명해", + "알려줘", + "키워드", + "감지", + "오탐" + ], + "ja": [ + "とは", + "って何", + "どうやって", + "説明して", + "キーワード", + "検出", + "誤検出" + ], "zh": ["是什么", "什么是", "怎么", "解释", "关键词", "检测", "误报"], - "es": ["qué es", "cómo", "explica", "palabra clave", "falso positivo", "detectado"], - "fr": ["c'est quoi", "comment", "explique", "mot-clé", "faux positif", "détecté"], - "de": ["was ist", "wie", "erkläre", "schlüsselwort", "falsch positiv", "erkannt"], - "pt": ["o que é", "como", "explique", "palavra-chave", "falso positivo", "detectado"], - "ru": ["что такое", "как", "объясни", "ключевое слово", "ложное срабатывание", "обнаружено"], - "nl": ["wat is", "hoe", "leg uit", "sleutelwoord", "vals positief", "gedetecteerd"], - "pl": ["co to", "jak", "wyjaśnij", "słowo kluczowe", "fałszywy alarm", "wykryto"] + "es": [ + "qué es", + "cómo", + "explica", + "palabra clave", + "falso positivo", + "detectado" + ], + "fr": [ + "c'est quoi", + "comment", + "explique", + "mot-clé", + "faux positif", + "détecté" + ], + "de": [ + "was ist", + "wie", + "erkläre", + "schlüsselwort", + "falsch positiv", + 
"erkannt" + ], + "pt": [ + "o que é", + "como", + "explique", + "palavra-chave", + "falso positivo", + "detectado" + ], + "ru": [ + "что такое", + "как", + "объясни", + "ключевое слово", + "ложное срабатывание", + "обнаружено" + ], + "nl": [ + "wat is", + "hoe", + "leg uit", + "sleutelwoord", + "vals positief", + "gedetecteerd" + ], + "pl": [ + "co to", + "jak", + "wyjaśnij", + "słowo kluczowe", + "fałszywy alarm", + "wykryto" + ] }, "excludedWorkflows": ["tools", "stack-set", "exec-plan"], "cjkScripts": ["ko", "ja", "zh"], "extensionRouting": { - "frontend-engineer": ["tsx", "jsx", "css", "scss", "less", "vue", "svelte", "html"], - "backend-engineer": ["go", "py", "java", "rs", "rb", "php", "controller", "service", "resolver"], + "frontend-engineer": [ + "tsx", + "jsx", + "css", + "scss", + "less", + "vue", + "svelte", + "html" + ], + "backend-engineer": [ + "go", + "py", + "java", + "rs", + "rb", + "php", + "controller", + "service", + "resolver" + ], "db-engineer": ["sql", "prisma", "graphql", "migration"], "mobile-engineer": ["dart", "swift", "kt", "xib", "storyboard"], "designer": ["figma", "sketch", "svg"] diff --git a/.cursor/hooks/types.ts b/.cursor/hooks/types.ts index f9bf420..2b79035 100644 --- a/.cursor/hooks/types.ts +++ b/.cursor/hooks/types.ts @@ -1,8 +1,8 @@ // Claude Code Hook Types for oh-my-agent // Shared across Claude Code, Codex CLI, Cursor, Gemini CLI, and Qwen Code -import { existsSync } from "node:fs" -import { dirname, join } from "node:path" +import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; // --- Project Root Resolution --- @@ -12,52 +12,55 @@ import { dirname, join } from "node:path" * (e.g. packages/i18n during a build) from creating state files * in the wrong location. */ -const MAX_DEPTH = 20 +const MAX_DEPTH = 20; export function resolveGitRoot(startDir: string): string { - let dir = startDir + let dir = startDir; for (let i = 0; i < MAX_DEPTH; i++) { - if (existsSync(join(dir, ".git"))) return dir - const parent = dirname(dir) - if (parent === dir) return startDir - dir = parent + if (existsSync(join(dir, ".git"))) return dir; + const parent = dirname(dir); + if (parent === dir) return startDir; + dir = parent; } - return startDir + return startDir; } // --- Vendor Detection --- -export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen" +export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen"; // --- Hook Input (unified) --- export interface HookInput { - prompt?: string - sessionId?: string - session_id?: string - hook_event_name?: string - cwd?: string - workspace_roots?: string[] + prompt?: string; + sessionId?: string; + session_id?: string; + hook_event_name?: string; + cwd?: string; + workspace_roots?: string[]; // Gemini: AfterAgent fields - prompt_response?: string - stop_hook_active?: boolean + prompt_response?: string; + stop_hook_active?: boolean; // Claude/Qwen: Stop fields - stopReason?: string + stopReason?: string; } // --- Hook Output Builders --- -export function makePromptOutput(vendor: Vendor, additionalContext: string): string { +export function makePromptOutput( + vendor: Vendor, + additionalContext: string, +): string { switch (vendor) { case "claude": - return JSON.stringify({ additionalContext }) + return JSON.stringify({ additionalContext }); case "codex": return JSON.stringify({ hookSpecificOutput: { hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "cursor": return JSON.stringify({ additionalContext, @@ -66,14 +69,14 @@ export function 
makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "gemini": return JSON.stringify({ hookSpecificOutput: { hookEventName: "BeforeAgent", additionalContext, }, - }) + }); case "qwen": // Qwen Code fork uses hookSpecificOutput (same as Codex) return JSON.stringify({ @@ -81,7 +84,7 @@ export function makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); } } @@ -91,22 +94,25 @@ export function makeBlockOutput(vendor: Vendor, reason: string): string { case "codex": case "cursor": case "qwen": - return JSON.stringify({ decision: "block", reason }) + return JSON.stringify({ decision: "block", reason }); case "gemini": // Gemini AfterAgent uses "deny" to reject response and force retry - return JSON.stringify({ decision: "deny", reason }) + return JSON.stringify({ decision: "deny", reason }); } } // --- PreToolUse Output Builder --- -export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, unknown>): string { +export function makePreToolOutput( + vendor: Vendor, + updatedInput: Record<string, unknown>, +): string { switch (vendor) { case "gemini": return JSON.stringify({ decision: "rewrite", tool_input: updatedInput, - }) + }); case "cursor": return JSON.stringify({ updated_input: updatedInput, @@ -114,7 +120,7 @@ export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, u hookEventName: "PreToolUse", updatedInput, }, - }) + }); case "claude": case "codex": case "qwen": @@ -123,15 +129,15 @@ export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, u hookEventName: "PreToolUse", updatedInput, }, - }) + }); } } // --- Shared Types --- export interface ModeState { - workflow: string - sessionId: string - activatedAt: string - reinforcementCount: number + workflow: string; + sessionId: string; + activatedAt: string; + reinforcementCount: number; } diff --git a/.gemini/agents/architecture-reviewer.md b/.gemini/agents/architecture-reviewer.md index f33d7c0..c1a1e85 100644 --- a/.gemini/agents/architecture-reviewer.md +++ b/.gemini/agents/architecture-reviewer.md @@ -10,7 +10,7 @@ tools: - replace - write_file - ask_user -model: gemini-3-flash-preview +model: gemini-3-flash --- <!-- Generated by oh-my-agent CLI. Source: .agents/agents/architecture-reviewer.md --> @@ -29,6 +29,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/gemini.md`: - Write results to project root `.agents/results/result-architecture.md` (orchestrated: `result-architecture-{sessionId}.md`) - Include: status, recommendation summary, tradeoffs, risks, validation steps, artifacts created +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY recommendations or structural edits, output this block: @@ -45,6 +47,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT change architecture or code +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.gemini/agents/backend-engineer.md b/.gemini/agents/backend-engineer.md index 351caa6..1e83f03 100644 --- a/.gemini/agents/backend-engineer.md +++ b/.gemini/agents/backend-engineer.md @@ -9,7 +9,7 @@ tools: - replace - write_file - ask_user -model: gemini-3-flash-preview +model: gemini-3-flash --- <!-- Generated by oh-my-agent CLI. 
Source: .agents/agents/backend-engineer.md --> @@ -28,6 +28,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/gemini.md`: - Write results to project root `.agents/results/result-backend.md` (orchestrated: `result-backend-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -44,6 +46,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT write code +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.gemini/agents/db-engineer.md b/.gemini/agents/db-engineer.md index 38d66e9..455edb1 100644 --- a/.gemini/agents/db-engineer.md +++ b/.gemini/agents/db-engineer.md @@ -10,7 +10,7 @@ tools: - replace - write_file - ask_user -model: gemini-3-flash-preview +model: gemini-3-flash --- <!-- Generated by oh-my-agent CLI. Source: .agents/agents/db-engineer.md --> @@ -29,6 +29,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/gemini.md`: - Write results to project root `.agents/results/result-db.md` (orchestrated: `result-db-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -41,6 +43,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.gemini/agents/debug-investigator.md b/.gemini/agents/debug-investigator.md index 814bcce..18b2926 100644 --- a/.gemini/agents/debug-investigator.md +++ b/.gemini/agents/debug-investigator.md @@ -10,7 +10,7 @@ tools: - replace - write_file - ask_user -model: gemini-3-flash-preview +model: gemini-3-flash --- <!-- Generated by oh-my-agent CLI. Source: .agents/agents/debug-investigator.md --> @@ -29,6 +29,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/gemini.md`: - Write results to project root `.agents/results/result-debug.md` (orchestrated: `result-debug-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -45,6 +47,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT write code +<!-- CHARTER_CHECK_END --> ## Diagnosis Process diff --git a/.gemini/agents/frontend-engineer.md b/.gemini/agents/frontend-engineer.md index 42d66f9..7692dac 100644 --- a/.gemini/agents/frontend-engineer.md +++ b/.gemini/agents/frontend-engineer.md @@ -10,7 +10,7 @@ tools: - replace - write_file - ask_user -model: gemini-3-flash-preview +model: gemini-3-flash --- <!-- Generated by oh-my-agent CLI. 
Source: .agents/agents/frontend-engineer.md --> @@ -29,6 +29,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/gemini.md`: - Write results to project root `.agents/results/result-frontend.md` (orchestrated: `result-frontend-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -41,6 +43,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.gemini/agents/mobile-engineer.md b/.gemini/agents/mobile-engineer.md index d82dc21..24f5b2c 100644 --- a/.gemini/agents/mobile-engineer.md +++ b/.gemini/agents/mobile-engineer.md @@ -10,7 +10,7 @@ tools: - replace - write_file - ask_user -model: gemini-3-flash-preview +model: gemini-3-flash --- <!-- Generated by oh-my-agent CLI. Source: .agents/agents/mobile-engineer.md --> @@ -29,6 +29,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/gemini.md`: - Write results to project root `.agents/results/result-mobile.md` (orchestrated: `result-mobile-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY code changes, output this block: @@ -41,6 +43,7 @@ CHARTER_CHECK: - Success criteria: {measurable criteria} - Assumptions: {defaults applied} ``` +<!-- CHARTER_CHECK_END --> ## Architecture diff --git a/.gemini/agents/pm-planner.md b/.gemini/agents/pm-planner.md index 5e8c579..e565d9a 100644 --- a/.gemini/agents/pm-planner.md +++ b/.gemini/agents/pm-planner.md @@ -7,7 +7,7 @@ tools: - grep_search - read_file - ask_user -model: gemini-3-flash-preview +model: gemini-3-flash --- <!-- Generated by oh-my-agent CLI. Source: .agents/agents/pm-planner.md --> @@ -26,6 +26,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/gemini.md`: - Write results to project root `.agents/results/result-pm.md` (orchestrated: `result-pm-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY planning work, output this block: @@ -42,6 +44,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT proceed +<!-- CHARTER_CHECK_END --> ## Planning Process diff --git a/.gemini/agents/qa-reviewer.md b/.gemini/agents/qa-reviewer.md index 7545034..ec17b60 100644 --- a/.gemini/agents/qa-reviewer.md +++ b/.gemini/agents/qa-reviewer.md @@ -7,7 +7,7 @@ tools: - grep_search - read_file - ask_user -model: gemini-3-flash-preview +model: gemini-3-flash --- <!-- Generated by oh-my-agent CLI. 
Source: .agents/agents/qa-reviewer.md --> @@ -26,6 +26,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/gemini.md`: - Write results to project root `.agents/results/result-qa.md` (orchestrated: `result-qa-{sessionId}.md`) - Include: status, summary, files changed, acceptance criteria checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before starting review, output this block: @@ -38,6 +40,7 @@ CHARTER_CHECK: - Must NOT do: modify source code, skip severity levels, report unverified findings - Success criteria: {all files reviewed, findings with file:line references} ``` +<!-- CHARTER_CHECK_END --> ## Review Priority Order diff --git a/.gemini/agents/tf-infra-engineer.md b/.gemini/agents/tf-infra-engineer.md index 6961161..302342d 100644 --- a/.gemini/agents/tf-infra-engineer.md +++ b/.gemini/agents/tf-infra-engineer.md @@ -10,7 +10,7 @@ tools: - replace - write_file - ask_user -model: gemini-3-flash-preview +model: gemini-3-flash --- <!-- Generated by oh-my-agent CLI. Source: .agents/agents/tf-infra-engineer.md --> @@ -29,6 +29,8 @@ Follow `.agents/skills/_shared/runtime/execution-protocols/gemini.md`: - Write results to project root `.agents/results/result-tf-infra.md` (orchestrated: `result-tf-infra-{sessionId}.md`) - Include: status, summary, files changed, validation results, plan/apply notes, acceptance checklist +<!-- CHARTER_CHECK_BEGIN --> + ## Charter Preflight (MANDATORY) Before ANY infrastructure changes, output this block: @@ -45,6 +47,7 @@ CHARTER_CHECK: - LOW: proceed with assumptions - MEDIUM: list options, proceed with most likely - HIGH: set status blocked, list questions, DO NOT apply destructive changes +<!-- CHARTER_CHECK_END --> ## Rules diff --git a/.gemini/hooks/hud.ts b/.gemini/hooks/hud.ts index 597b95c..73f0ac4 100644 --- a/.gemini/hooks/hud.ts +++ b/.gemini/hooks/hud.ts @@ -9,163 +9,166 @@ * stdout: ANSI-colored status text */ -import { existsSync, readdirSync, readFileSync } from "node:fs" -import { join } from "node:path" -import type { ModeState } from "./types.ts" +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import type { ModeState } from "./types.ts"; // ── ANSI Colors ─────────────────────────────────────────────── -const dim = (s: string) => `\x1b[2m${s}\x1b[22m` -const bold = (s: string) => `\x1b[1m${s}\x1b[22m` -const green = (s: string) => `\x1b[32m${s}\x1b[39m` -const yellow = (s: string) => `\x1b[33m${s}\x1b[39m` -const red = (s: string) => `\x1b[31m${s}\x1b[39m` -const cyan = (s: string) => `\x1b[36m${s}\x1b[39m` +const dim = (s: string) => `\x1b[2m${s}\x1b[22m`; +const bold = (s: string) => `\x1b[1m${s}\x1b[22m`; +const green = (s: string) => `\x1b[32m${s}\x1b[39m`; +const yellow = (s: string) => `\x1b[33m${s}\x1b[39m`; +const red = (s: string) => `\x1b[31m${s}\x1b[39m`; +const cyan = (s: string) => `\x1b[36m${s}\x1b[39m`; function colorByThreshold(value: number, text: string): string { - if (value >= 85) return red(text) - if (value >= 70) return yellow(text) - return green(text) + if (value >= 85) return red(text); + if (value >= 70) return yellow(text); + return green(text); } // ── Stdin Parsing ───────────────────────────────────────────── interface RateLimit { - used_percentage?: number - resets_at?: string + used_percentage?: number; + resets_at?: string; } interface StatuslineStdin { - cwd?: string - model?: { id?: string; display_name?: string } + cwd?: string; + model?: { id?: string; display_name?: string }; context_window?: { - 
context_window_size?: number - used_percentage?: number - } + context_window_size?: number; + used_percentage?: number; + }; cost?: { - total_cost_usd?: number - total_lines_added?: number - total_lines_removed?: number - total_duration_ms?: number - } + total_cost_usd?: number; + total_lines_added?: number; + total_lines_removed?: number; + total_duration_ms?: number; + }; rate_limits?: { - five_hour?: RateLimit - seven_day?: RateLimit - } + five_hour?: RateLimit; + seven_day?: RateLimit; + }; } function readStdin(): StatuslineStdin { try { - return JSON.parse(readFileSync("/dev/stdin", "utf-8")) + return JSON.parse(readFileSync("/dev/stdin", "utf-8")); } catch { - return {} + return {}; } } // ── Active Workflow Detection ───────────────────────────────── function getActiveWorkflow(projectDir: string): ModeState | null { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return null + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return null; try { for (const file of readdirSync(stateDir)) { - if (!file.endsWith(".json") || !file.includes("-state-")) continue - const content = readFileSync(join(stateDir, file), "utf-8") - const state: ModeState = JSON.parse(content) + if (!file.endsWith(".json") || !file.includes("-state-")) continue; + const content = readFileSync(join(stateDir, file), "utf-8"); + const state: ModeState = JSON.parse(content); // Skip stale (>2h) - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - if (elapsed > 2 * 60 * 60 * 1000) continue + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + if (elapsed > 2 * 60 * 60 * 1000) continue; - return state + return state; } } catch { // ignore } - return null + return null; } // ── Model Name Shortener ────────────────────────────────────── function shortModel(model?: { id?: string; display_name?: string }): string { - const name = model?.display_name || model?.id || "" - if (!name) return "" + const name = model?.display_name || model?.id || ""; + if (!name) return ""; // "Claude Opus 4.6 (1M context)" → "Opus 4.6" - const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i) - if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}` - return name.split("/").pop()?.slice(0, 15) || "" + const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i); + if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}`; + return name.split("/").pop()?.slice(0, 15) || ""; } // ── Rate Limit Helpers ─────────────────────────────────────── function formatCountdown(resetsAt: string): string { - const remaining = new Date(resetsAt).getTime() - Date.now() - if (remaining <= 0) return "" - const h = Math.floor(remaining / 3_600_000) - const m = Math.floor((remaining % 3_600_000) / 60_000) - return h > 0 ? `${h}h${m}m` : `${m}m` + const remaining = new Date(resetsAt).getTime() - Date.now(); + if (remaining <= 0) return ""; + const h = Math.floor(remaining / 3_600_000); + const m = Math.floor((remaining % 3_600_000) / 60_000); + return h > 0 ? `${h}h${m}m` : `${m}m`; } function formatRateLimit(label: string, rl?: RateLimit): string | null { - if (!rl || rl.used_percentage == null) return null - const pct = Math.round(rl.used_percentage) - const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : "" - const text = countdown ? 
`${label}:${pct}%(${countdown})` : `${label}:${pct}%` - return colorByThreshold(pct, text) + if (!rl || rl.used_percentage == null) return null; + const pct = Math.round(rl.used_percentage); + const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : ""; + const text = countdown + ? `${label}:${pct}%(${countdown})` + : `${label}:${pct}%`; + return colorByThreshold(pct, text); } // ── Main ────────────────────────────────────────────────────── function main() { - const input = readStdin() - const projectDir = process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd() - const parts: string[] = [] + const input = readStdin(); + const projectDir = + process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd(); + const parts: string[] = []; // 1. OMA label - parts.push(bold(cyan("[OMA]"))) + parts.push(bold(cyan("[OMA]"))); // 2. Model - const model = shortModel(input.model) - if (model) parts.push(dim(model)) + const model = shortModel(input.model); + if (model) parts.push(dim(model)); // 3. Context % - const ctxPct = input.context_window?.used_percentage + const ctxPct = input.context_window?.used_percentage; if (ctxPct != null) { - parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)) + parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)); } // 4. Session cost - const cost = input.cost?.total_cost_usd + const cost = input.cost?.total_cost_usd; if (cost != null && cost > 0) { - parts.push(dim(`$${cost.toFixed(2)}`)) + parts.push(dim(`$${cost.toFixed(2)}`)); } // 5. Rate limits (5h / 7d) - const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour) - const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day) + const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour); + const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day); if (rl5 || rl7) { - parts.push([rl5, rl7].filter(Boolean).join(dim(" "))) + parts.push([rl5, rl7].filter(Boolean).join(dim(" "))); } // 6. Lines changed - const added = input.cost?.total_lines_added - const removed = input.cost?.total_lines_removed + const added = input.cost?.total_lines_added; + const removed = input.cost?.total_lines_removed; if (added || removed) { - const diffParts: string[] = [] - if (added) diffParts.push(green(`+${added}`)) - if (removed) diffParts.push(red(`-${removed}`)) - parts.push(diffParts.join(dim("/"))) + const diffParts: string[] = []; + if (added) diffParts.push(green(`+${added}`)); + if (removed) diffParts.push(red(`-${removed}`)); + parts.push(diffParts.join(dim("/"))); } // 7. 
Active workflow - const workflow = getActiveWorkflow(projectDir) + const workflow = getActiveWorkflow(projectDir); if (workflow) { - const label = `${workflow.workflow}:${workflow.reinforcementCount}` - parts.push(yellow(label)) + const label = `${workflow.workflow}:${workflow.reinforcementCount}`; + parts.push(yellow(label)); } - process.stdout.write(parts.join(dim(" │ "))) + process.stdout.write(parts.join(dim(" │ "))); } -main() +main(); diff --git a/.gemini/hooks/keyword-detector.ts b/.gemini/hooks/keyword-detector.ts index 0ce0d0e..e838a0a 100644 --- a/.gemini/hooks/keyword-detector.ts +++ b/.gemini/hooks/keyword-detector.ts @@ -12,59 +12,205 @@ * exit 0 = always (allow) */ -import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { type ModeState, makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { + type ModeState, + makePromptOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +// ── Guard 1: UserPromptSubmit-only trigger ──────────────────── +// Hook event names that represent genuine user input (not agent responses) +const VALID_USER_EVENTS = new Set([ + "UserPromptSubmit", + "beforeSubmitPrompt", // Cursor + "BeforeAgent", // Gemini (fires before agent processes user prompt) +]); + +/** + * Returns true if the hook input indicates this is a genuine user prompt, + * not an agent-generated response. Prevents re-trigger loops. + */ +export function isGenuineUserPrompt(input: Record<string, unknown>): boolean { + const event = input.hook_event_name as string | undefined; + // If event is explicitly provided, validate it + if (event !== undefined) { + return VALID_USER_EVENTS.has(event); + } + // No event field — assume genuine (backward compat with vendors that omit it) + return true; +} + +// ── Guard 3: Reinforcement suppression ─────────────────────── + +const REINFORCEMENT_WINDOW_MS = 60_000; // 60 seconds +const REINFORCEMENT_MAX_COUNT = 2; // allow up to 2, suppress 3rd+ + +export interface KeywordDetectorState { + triggers: Record< + string, + { + lastTriggeredAt: string; // ISO timestamp + count: number; + } + >; +} + +function getKwStateFilePath(projectDir: string): string { + const dir = join(projectDir, ".agents", "state"); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + return join(dir, "keyword-detector-state.json"); +} + +/** + * Load the keyword-detector reinforcement state from disk. + * Resets gracefully if the file is missing or corrupt. + */ +export function loadKwState(projectDir: string): KeywordDetectorState { + const filePath = getKwStateFilePath(projectDir); + if (!existsSync(filePath)) return { triggers: {} }; + try { + const raw = readFileSync(filePath, "utf-8"); + const parsed = JSON.parse(raw) as unknown; + if ( + typeof parsed === "object" && + parsed !== null && + "triggers" in parsed && + typeof (parsed as Record<string, unknown>).triggers === "object" + ) { + return parsed as KeywordDetectorState; + } + return { triggers: {} }; + } catch { + // Corrupt file — reset + return { triggers: {} }; + } +} + +/** + * Save reinforcement state to disk. 
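
// A minimal usage sketch, assuming the exported state helpers in this file
// (loadKwState, isReinforcementSuppressed, recordKwTrigger, saveKwState).
// The project dir and "ultrawork" workflow name are placeholders.
import {
  loadKwState,
  isReinforcementSuppressed,
  recordKwTrigger,
  saveKwState,
} from "./keyword-detector.ts";

function shouldInject(projectDir: string, workflow: string): boolean {
  const state = loadKwState(projectDir);
  // Suppressed once the same workflow has fired REINFORCEMENT_MAX_COUNT (2)
  // times inside the 60s window, i.e. the 3rd rapid trigger is swallowed.
  if (isReinforcementSuppressed(state, workflow)) return false;
  saveKwState(projectDir, recordKwTrigger(state, workflow));
  return true;
}

shouldInject("/repo", "ultrawork"); // true, true, then false within one minute
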
+ */ +export function saveKwState( + projectDir: string, + state: KeywordDetectorState, +): void { + try { + const filePath = getKwStateFilePath(projectDir); + writeFileSync(filePath, JSON.stringify(state, null, 2)); + } catch { + // Non-fatal — reinforcement suppression is best-effort + } +} + +/** + * Returns true if the keyword should be suppressed due to reinforcement loop. + * A keyword is suppressed if it was triggered >= REINFORCEMENT_MAX_COUNT times + * within the last REINFORCEMENT_WINDOW_MS milliseconds. + */ +export function isReinforcementSuppressed( + state: KeywordDetectorState, + keyword: string, + nowMs?: number, +): boolean { + const now = nowMs ?? Date.now(); + const entry = state.triggers[keyword]; + if (!entry) return false; + const lastMs = new Date(entry.lastTriggeredAt).getTime(); + if (Number.isNaN(lastMs)) return false; + const withinWindow = now - lastMs < REINFORCEMENT_WINDOW_MS; + return withinWindow && entry.count >= REINFORCEMENT_MAX_COUNT; +} + +/** + * Record a keyword trigger in the reinforcement state. + * Resets count if the previous trigger was outside the window. + */ +export function recordKwTrigger( + state: KeywordDetectorState, + keyword: string, + nowMs?: number, +): KeywordDetectorState { + const now = nowMs ?? Date.now(); + const entry = state.triggers[keyword]; + let count = 1; + if (entry) { + const lastMs = new Date(entry.lastTriggeredAt).getTime(); + const withinWindow = + !Number.isNaN(lastMs) && now - lastMs < REINFORCEMENT_WINDOW_MS; + count = withinWindow ? entry.count + 1 : 1; + } + return { + ...state, + triggers: { + ...state.triggers, + [keyword]: { + lastTriggeredAt: new Date(now).toISOString(), + count, + }, + }, + }; +} // ── Vendor Detection ────────────────────────────────────────── function inferVendorFromScriptPath(): Vendor | null { - const path = import.meta.path - if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor" - if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen" - if (path.includes(`${join(".claude", "hooks")}`)) return "claude" - if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini" - if (path.includes(`${join(".codex", "hooks")}`)) return "codex" - return null + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; } function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - const byScriptPath = inferVendorFromScriptPath() - if (byScriptPath) return byScriptPath - if (event === "BeforeAgent") return "gemini" - if (event === "beforeSubmitPrompt") return "cursor" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeAgent") return "gemini"; + if (event === "beforeSubmitPrompt") return "cursor"; if (event === "UserPromptSubmit") { // Codex uses snake_case session_id, Claude uses camelCase sessionId - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } // Qwen Code sets QWEN_PROJECT_DIR; Claude sets CLAUDE_PROJECT_DIR - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" 
+ if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── Config Loading ──────────────────────────────────────────── @@ -73,74 +219,83 @@ interface TriggerConfig { workflows: Record< string, { - persistent: boolean - keywords: Record<string, string[]> + persistent: boolean; + keywords: Record<string, string[]>; } - > - informationalPatterns: Record<string, string[]> - excludedWorkflows: string[] - cjkScripts: string[] - extensionRouting?: Record<string, string[]> + >; + informationalPatterns: Record<string, string[]>; + excludedWorkflows: string[]; + cjkScripts: string[]; + extensionRouting?: Record<string, string[]>; } function loadConfig(): TriggerConfig { - const configPath = join(dirname(import.meta.path), "triggers.json") - return JSON.parse(readFileSync(configPath, "utf-8")) + const configPath = join(dirname(import.meta.path), "triggers.json"); + return JSON.parse(readFileSync(configPath, "utf-8")); } function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? "en"; } catch { - return "en" + return "en"; } } // ── Pattern Builder ─────────────────────────────────────────── export function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } -export function buildPatterns(keywords: Record<string, string[]>, lang: string, cjkScripts: string[]): RegExp[] { +export function buildPatterns( + keywords: Record<string, string[]>, + lang: string, + cjkScripts: string[], +): RegExp[] { const allKeywords = [ ...(keywords["*"] ?? []), ...(keywords.en ?? []), ...(lang !== "en" ? (keywords[lang] ?? 
[]) : []), - ] + ]; return allKeywords.map((kw) => { - const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+") - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) { - return new RegExp(escaped, "i") + const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+"); + if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) { + return new RegExp(escaped, "i"); } - return new RegExp(`\\b${escaped}\\b`, "i") - }) + return new RegExp(`\\b${escaped}\\b`, "i"); + }); } -function buildInformationalPatterns(config: TriggerConfig, lang: string): RegExp[] { - const patterns = [...(config.informationalPatterns.en ?? [])] +function buildInformationalPatterns( + config: TriggerConfig, + lang: string, +): RegExp[] { + const patterns = [...(config.informationalPatterns.en ?? [])]; if (lang !== "en") { - patterns.push(...(config.informationalPatterns[lang] ?? [])) + patterns.push(...(config.informationalPatterns[lang] ?? [])); } return patterns.map((p) => { - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (/[^\x00-\x7F]/.test(p)) return new RegExp(escapeRegex(p), "i") - return new RegExp(`\\b${escapeRegex(p)}\\b`, "i") - }) + if (/[^\p{ASCII}]/u.test(p)) return new RegExp(escapeRegex(p), "i"); + return new RegExp(`\\b${escapeRegex(p)}\\b`, "i"); + }); } // ── Filters ─────────────────────────────────────────────────── -export function isInformationalContext(prompt: string, matchIndex: number, infoPatterns: RegExp[]): boolean { - const windowStart = Math.max(0, matchIndex - 60) - const window = prompt.slice(windowStart, matchIndex + 60) - return infoPatterns.some((p) => p.test(window)) +export function isInformationalContext( + prompt: string, + matchIndex: number, + infoPatterns: RegExp[], +): boolean { + const windowStart = Math.max(0, matchIndex - 60); + const window = prompt.slice(windowStart, matchIndex + 60); + return infoPatterns.some((p) => p.test(window)); } /** @@ -148,12 +303,16 @@ export function isInformationalContext(prompt: string, matchIndex: number, infoP * only match keywords in the first N chars of the user's prompt. * Keywords deep in the prompt are likely from pasted content, not user intent. 
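
// A behaviour sketch for the heuristic defined below; the match indexes and
// prompt lengths are invented, PERSISTENT_MATCH_LIMIT is the real 200.
import { isPastedContent } from "./keyword-detector.ts";

isPastedContent(50, true, 5_000);     // false: keyword near the top reads as user intent
isPastedContent(1_200, true, 5_000);  // true: deep inside a long prompt, likely pasted
isPastedContent(1_200, false, 5_000); // false: non-persistent workflows are exempt
isPastedContent(150, true, 180);      // false: short prompts are always trusted
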
*/ -const PERSISTENT_MATCH_LIMIT = 200 - -export function isPastedContent(matchIndex: number, isPersistent: boolean, promptLength: number): boolean { - if (!isPersistent) return false - if (promptLength <= PERSISTENT_MATCH_LIMIT) return false - return matchIndex > PERSISTENT_MATCH_LIMIT +const PERSISTENT_MATCH_LIMIT = 200; + +export function isPastedContent( + matchIndex: number, + isPersistent: boolean, + promptLength: number, +): boolean { + if (!isPersistent) return false; + if (promptLength <= PERSISTENT_MATCH_LIMIT) return false; + return matchIndex > PERSISTENT_MATCH_LIMIT; } /** @@ -180,11 +339,11 @@ const QUESTION_PATTERNS: RegExp[] = [ /^.*\banything worth\b/i, /^.*\bwhat.*(feature|difference|reference)/i, /^.*\bcompare\b/i, -] +]; export function isAnalyticalQuestion(prompt: string): boolean { - const firstLine = prompt.split("\n")[0].trim() - return QUESTION_PATTERNS.some((p) => p.test(firstLine)) + const firstLine = prompt.split("\n")[0].trim(); + return QUESTION_PATTERNS.some((p) => p.test(firstLine)); } export function stripCodeBlocks(text: string): string { @@ -193,11 +352,11 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") // unclosed fenced blocks (strip to end) .replace(/`{3,}[^`]*`{3,}/g, "") // single-line fenced blocks (```...```) .replace(/`[^`\n]+`/g, "") // inline code (no newlines allowed) - .replace(/"[^"\n]*"/g, "") // quoted strings + .replace(/"[^"\n]*"/g, ""); // quoted strings } export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } // ── Extension Detection ────────────────────────────────────── @@ -228,62 +387,70 @@ const EXCLUDE_EXTS = new Set([ "eot", "map", "d", -]) +]); export function detectExtensions(prompt: string): string[] { - const extPattern = /\.([a-zA-Z]{1,12})\b/g - const extensions = new Set<string>() - let match: RegExpExecArray | null - // biome-ignore lint/suspicious/noAssignInExpressions: standard regex.exec loop pattern - while ((match = extPattern.exec(prompt)) !== null) { - const ext = match[1].toLowerCase() + const extPattern = /\.([a-zA-Z]{1,12})\b/g; + const extensions = new Set<string>(); + for (const match of prompt.matchAll(extPattern)) { + const ext = match[1].toLowerCase(); if (!EXCLUDE_EXTS.has(ext)) { - extensions.add(ext) + extensions.add(ext); } } - return [...extensions] + return [...extensions]; } -export function resolveAgentFromExtensions(extensions: string[], routing: Record<string, string[]>): string | null { - if (extensions.length === 0) return null +export function resolveAgentFromExtensions( + extensions: string[], + routing: Record<string, string[]>, +): string | null { + if (extensions.length === 0) return null; - const scores = new Map<string, number>() + const scores = new Map<string, number>(); for (const ext of extensions) { for (const [agent, agentExts] of Object.entries(routing)) { if (agentExts.includes(ext)) { - scores.set(agent, (scores.get(agent) ?? 0) + 1) + scores.set(agent, (scores.get(agent) ?? 
0) + 1); } } } - if (scores.size === 0) return null + if (scores.size === 0) return null; - let best: string | null = null - let bestScore = 0 + let best: string | null = null; + let bestScore = 0; for (const [agent, score] of scores) { if (score > bestScore) { - bestScore = score - best = agent + bestScore = score; + best = agent; } } - return best + return best; } // ── State Management ────────────────────────────────────────── function getStateDir(projectDir: string): string { - const dir = join(projectDir, ".agents", "state") - if (!existsSync(dir)) mkdirSync(dir, { recursive: true }) - return dir + const dir = join(projectDir, ".agents", "state"); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + return dir; } -function activateMode(projectDir: string, workflow: string, sessionId: string): void { +function activateMode( + projectDir: string, + workflow: string, + sessionId: string, +): void { const state: ModeState = { workflow, sessionId, activatedAt: new Date().toISOString(), reinforcementCount: 0, - } - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) + }; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Deactivation Detection ─────────────────────────────────── @@ -300,27 +467,33 @@ export const DEACTIVATION_PHRASES: Record<string, string[]> = { ru: ["воркфлоу завершён", "рабочий процесс завершён"], nl: ["workflow voltooid", "workflow klaar"], pl: ["workflow zakończony", "workflow ukończony"], -} +}; export function isDeactivationRequest(prompt: string, lang: string): boolean { - const phrases = [...(DEACTIVATION_PHRASES.en ?? []), ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? []) : [])] - const lower = prompt.toLowerCase() - return phrases.some((phrase) => lower.includes(phrase.toLowerCase())) + const phrases = [ + ...(DEACTIVATION_PHRASES.en ?? []), + ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? 
[]) : []), + ]; + const lower = prompt.toLowerCase(); + return phrases.some((phrase) => lower.includes(phrase.toLowerCase())); } -export function deactivateAllPersistentModes(projectDir: string, sessionId?: string): void { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return +export function deactivateAllPersistentModes( + projectDir: string, + sessionId?: string, +): void { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return; try { - const files = readdirSync(stateDir) + const files = readdirSync(stateDir); for (const file of files) { // Match session-scoped state files: {workflow}-state-{sessionId}.json if (sessionId) { if (file.endsWith(`-state-${sessionId}.json`)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } else if (/-state-/.test(file) && file.endsWith(".json")) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { @@ -331,55 +504,69 @@ export function deactivateAllPersistentModes(projectDir: string, sessionId?: str // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? "" + // Guard 1: Only process genuine user prompts — skip agent-generated content + if (!isGenuineUserPrompt(input)) process.exit(0); + + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? 
""; - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); - const config = loadConfig() - const lang = detectLanguage(projectDir) + const config = loadConfig(); + const lang = detectLanguage(projectDir); // Check for deactivation request before workflow detection if (isDeactivationRequest(prompt, lang)) { - deactivateAllPersistentModes(projectDir, sessionId) - process.exit(0) + deactivateAllPersistentModes(projectDir, sessionId); + process.exit(0); } - const infoPatterns = buildInformationalPatterns(config, lang) - const cleaned = stripCodeBlocks(prompt) - const excluded = new Set(config.excludedWorkflows) + const infoPatterns = buildInformationalPatterns(config, lang); + // Guard 2: Strip code blocks and inline code before scanning for keywords + const cleaned = stripCodeBlocks(prompt); + const excluded = new Set(config.excludedWorkflows); + + // Guard 3: Load reinforcement suppression state + const kwState = loadKwState(projectDir); // Skip persistent workflows entirely if the prompt is an analytical question - const analytical = isAnalyticalQuestion(cleaned) + const analytical = isAnalyticalQuestion(cleaned); for (const [workflow, def] of Object.entries(config.workflows)) { - if (excluded.has(workflow)) continue + if (excluded.has(workflow)) continue; // Analytical questions should never trigger persistent workflows - if (analytical && def.persistent) continue + if (analytical && def.persistent) continue; - const patterns = buildPatterns(def.keywords, lang, config.cjkScripts) + const patterns = buildPatterns(def.keywords, lang, config.cjkScripts); for (const pattern of patterns) { - const match = pattern.exec(cleaned) - if (!match) continue - if (isInformationalContext(cleaned, match.index, infoPatterns)) continue + const match = pattern.exec(cleaned); + if (!match) continue; + if (isInformationalContext(cleaned, match.index, infoPatterns)) continue; // Keywords deep in long prompts are likely pasted content, not user intent - if (isPastedContent(match.index, def.persistent, cleaned.length)) continue + if (isPastedContent(match.index, def.persistent, cleaned.length)) + continue; + + // Guard 3: Suppress if same workflow triggered too many times in 60s + if (isReinforcementSuppressed(kwState, workflow)) continue; if (def.persistent) { - activateMode(projectDir, workflow, sessionId) + activateMode(projectDir, workflow, sessionId); } + // Record this trigger for reinforcement tracking + const updatedState = recordKwTrigger(kwState, workflow); + saveKwState(projectDir, updatedState); const contextLines = [ `[OMA WORKFLOW: ${workflow.toUpperCase()}]`, @@ -387,26 +574,29 @@ async function main() { `Read and follow \`.agents/workflows/${workflow}.md\` step by step.`, `User request: ${prompt}`, `IMPORTANT: Start the workflow IMMEDIATELY. 
Do not ask for confirmation.`, - ] + ]; if (config.extensionRouting) { - const extensions = detectExtensions(prompt) - const agent = resolveAgentFromExtensions(extensions, config.extensionRouting) + const extensions = detectExtensions(prompt); + const agent = resolveAgentFromExtensions( + extensions, + config.extensionRouting, + ); if (agent) { - contextLines.push(`[OMA AGENT HINT: ${agent}]`) + contextLines.push(`[OMA AGENT HINT: ${agent}]`); } } - const context = contextLines.join("\n") + const context = contextLines.join("\n"); - process.stdout.write(makePromptOutput(vendor, context)) - process.exit(0) + process.stdout.write(makePromptOutput(vendor, context)); + process.exit(0); } } - process.exit(0) + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.gemini/hooks/persistent-mode.ts b/.gemini/hooks/persistent-mode.ts index 4936f4e..311035a 100644 --- a/.gemini/hooks/persistent-mode.ts +++ b/.gemini/hooks/persistent-mode.ts @@ -13,125 +13,170 @@ * exit 2 = block stop */ -import { existsSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { isDeactivationRequest } from "./keyword-detector.ts" -import { type ModeState, makeBlockOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_REINFORCEMENTS = 5 -const STALE_HOURS = 2 +import { + existsSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { isDeactivationRequest } from "./keyword-detector.ts"; +import { + type ModeState, + makeBlockOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +const MAX_REINFORCEMENTS = 5; +const STALE_HOURS = 2; function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? 
"en"; } catch { - return "en" + return "en"; } } // ── Config Loading ──────────────────────────────────────────── interface TriggerConfig { - workflows: Record<string, { persistent: boolean }> + workflows: Record<string, { persistent: boolean }>; } function loadPersistentWorkflows(): string[] { - const configPath = join(dirname(import.meta.path), "triggers.json") + const configPath = join(dirname(import.meta.path), "triggers.json"); try { - const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")) + const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")); return Object.entries(config.workflows) .filter(([, def]) => def.persistent) - .map(([name]) => name) + .map(([name]) => name); } catch { - return ["ultrawork", "orchestrate", "work"] + return ["ultrawork", "orchestrate", "work"]; } } // ── Vendor Detection ────────────────────────────────────────── +function inferVendorFromScriptPath(): Vendor | null { + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; +} + function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "AfterAgent") return "gemini" - if (event === "Stop") { - if ("session_id" in input && !("sessionId" in input)) return "codex" - } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "AfterAgent") return "gemini"; + if (event === "Stop" && "session_id" in input) return "codex"; + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── State ───────────────────────────────────────────────────── function getStateDir(projectDir: string): string { - return join(projectDir, ".agents", "state") + return join(projectDir, ".agents", "state"); } -function readModeState(projectDir: string, workflow: string, sessionId: string): ModeState | null { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (!existsSync(path)) return null +function readModeState( + projectDir: string, + workflow: string, + sessionId: string, +): ModeState | 
null { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (!existsSync(path)) return null; try { - return JSON.parse(readFileSync(path, "utf-8")) as ModeState + return JSON.parse(readFileSync(path, "utf-8")) as ModeState; } catch { - return null + return null; } } export function isStale(state: ModeState): boolean { - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - return elapsed > STALE_HOURS * 60 * 60 * 1000 + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + return elapsed > STALE_HOURS * 60 * 60 * 1000; } -export function deactivate(projectDir: string, workflow: string, sessionId: string): void { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (existsSync(path)) unlinkSync(path) +export function deactivate( + projectDir: string, + workflow: string, + sessionId: string, +): void { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (existsSync(path)) unlinkSync(path); } -function incrementReinforcement(projectDir: string, workflow: string, sessionId: string, state: ModeState): void { - state.reinforcementCount += 1 - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) +function incrementReinforcement( + projectDir: string, + workflow: string, + sessionId: string, + state: ModeState, +): void { + state.reinforcementCount += 1; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const lang = detectLanguage(projectDir) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const lang = detectLanguage(projectDir); // Check all text fields in stdin for deactivation phrases. 
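
// A quick sketch of what this deactivation screen accepts. The English
// phrase list lives in DEACTIVATION_PHRASES in keyword-detector.ts;
// "workflow done" / "워크플로우 완료" are assumed entries here, matching the
// reinforcement message later in this file. Matching is a case-insensitive
// substring check.
import { isDeactivationRequest } from "./keyword-detector.ts";

isDeactivationRequest("ok, workflow done, thanks", "en");     // true
isDeactivationRequest("워크플로우 완료", "ko");               // true (ko adds to the en list)
isDeactivationRequest("keep going with the next step", "en"); // false
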
// The assistant may have included "workflow done" in its response, @@ -144,60 +189,60 @@ async function main() { input.transcript, ] .filter((v): v is string => typeof v === "string") - .join(" ") + .join(" "); if (textToCheck && isDeactivationRequest(textToCheck, lang)) { // Deactivate all persistent workflows for this session - const stateDir = join(projectDir, ".agents", "state") + const stateDir = join(projectDir, ".agents", "state"); if (existsSync(stateDir)) { try { - const suffix = `-state-${sessionId}.json` + const suffix = `-state-${sessionId}.json`; for (const file of readdirSync(stateDir)) { if (file.endsWith(suffix)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { /* ignore */ } } - process.exit(0) + process.exit(0); } - const persistentWorkflows = loadPersistentWorkflows() + const persistentWorkflows = loadPersistentWorkflows(); for (const workflow of persistentWorkflows) { - const state = readModeState(projectDir, workflow, sessionId) - if (!state) continue + const state = readModeState(projectDir, workflow, sessionId); + if (!state) continue; if (isStale(state) || state.reinforcementCount >= MAX_REINFORCEMENTS) { - deactivate(projectDir, workflow, sessionId) - continue + deactivate(projectDir, workflow, sessionId); + continue; } - incrementReinforcement(projectDir, workflow, sessionId, state) + incrementReinforcement(projectDir, workflow, sessionId, state); - const stateFile = `.agents/state/${workflow}-state-${sessionId}.json` + const stateFile = `.agents/state/${workflow}-state-${sessionId}.json`; const reason = [ `[OMA PERSISTENT MODE: ${workflow.toUpperCase()}]`, `The /${workflow} workflow is still active (reinforcement ${state.reinforcementCount}/${MAX_REINFORCEMENTS}).`, `Continue executing the workflow. If all tasks are genuinely complete:`, ` 1. Delete the state file: Bash \`rm ${stateFile}\``, ` 2. Or ask the user to say "워크플로우 완료" / "workflow done"`, - ].join("\n") + ].join("\n"); - writeBlockAndExit(vendor, reason) + writeBlockAndExit(vendor, reason); } - process.exit(0) + process.exit(0); } export function writeBlockAndExit(vendor: Vendor, reason: string): never { - process.stderr.write(reason) - process.stdout.write(makeBlockOutput(vendor, reason)) - process.exit(2) + process.stderr.write(reason); + process.stdout.write(makeBlockOutput(vendor, reason)); + process.exit(2); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.gemini/hooks/skill-injector.ts b/.gemini/hooks/skill-injector.ts index beda327..9ccce70 100644 --- a/.gemini/hooks/skill-injector.ts +++ b/.gemini/hooks/skill-injector.ts @@ -12,152 +12,163 @@ * persistent workflow is active (those modes own the session context). 
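
// A rough end-to-end sketch of the matching and dedup flow implemented
// below. The skill entry and trigger keyword are invented for illustration;
// real runs get them from discoverSkills() and triggers.json.
import {
  matchSkills,
  filterFreshMatches,
  type SkillEntry,
} from "./skill-injector.ts";

const skills: SkillEntry[] = [
  {
    name: "git-commit",
    absolutePath: "/repo/.agents/skills/git-commit/SKILL.md",
    relPath: ".agents/skills/git-commit/SKILL.md",
  },
];
const config = {
  skills: { "git-commit": { keywords: { en: ["commit message"] } } },
};

const matches = matchSkills(
  "write a commit message for this change",
  "en",
  skills,
  config,
);
const { fresh, nextState } = filterFreshMatches(matches, "/repo", "sess-1");
// fresh contains the git-commit match the first time. Once main() persists
// nextState, a repeat prompt in the same session (within the 1h TTL) yields
// fresh = [], so the same SKILL.md is never injected twice.
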
*/ -import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs" -import { basename, dirname, join } from "node:path" -import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_SKILLS = 3 -const SESSION_TTL_MS = 60 * 60 * 1000 -const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"] +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + writeFileSync, +} from "node:fs"; +import { basename, dirname, join } from "node:path"; +import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts"; + +const MAX_SKILLS = 3; +const SESSION_TTL_MS = 60 * 60 * 1000; +const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"]; // ── Vendor Detection ────────────────────────────────────────── function inferVendorFromScriptPath(): Vendor | null { - const path = import.meta.path - if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor" - if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen" - if (path.includes(`${join(".claude", "hooks")}`)) return "claude" - if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini" - if (path.includes(`${join(".codex", "hooks")}`)) return "codex" - return null + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; } function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - const byScriptPath = inferVendorFromScriptPath() - if (byScriptPath) return byScriptPath - if (event === "BeforeAgent") return "gemini" - if (event === "beforeSubmitPrompt") return "cursor" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeAgent") return "gemini"; + if (event === "beforeSubmitPrompt") return "cursor"; if (event === "UserPromptSubmit") { - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── Config Loading ──────────────────────────────────────────── interface 
SkillsTriggerConfig { - skills?: Record<string, { keywords: Record<string, string[]> }> - cjkScripts?: string[] + skills?: Record<string, { keywords: Record<string, string[]> }>; + cjkScripts?: string[]; } function loadTriggersConfig(): SkillsTriggerConfig { - const configPath = join(dirname(import.meta.path), "triggers.json") - if (!existsSync(configPath)) return {} + const configPath = join(dirname(import.meta.path), "triggers.json"); + if (!existsSync(configPath)) return {}; try { - return JSON.parse(readFileSync(configPath, "utf-8")) + return JSON.parse(readFileSync(configPath, "utf-8")); } catch { - return {} + return {}; } } function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? "en"; } catch { - return "en" + return "en"; } } // ── Pattern Building ────────────────────────────────────────── export function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } -export function buildTriggerPatterns(triggers: string[], lang: string, cjkScripts: string[]): RegExp[] { +export function buildTriggerPatterns( + triggers: string[], + lang: string, + cjkScripts: string[], +): RegExp[] { return triggers.map((kw) => { - const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+") - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) { - return new RegExp(escaped, "i") + const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+"); + if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) { + return new RegExp(escaped, "i"); } - return new RegExp(`\\b${escaped}\\b`, "i") - }) + return new RegExp(`\\b${escaped}\\b`, "i"); + }); } // ── Skill Discovery ─────────────────────────────────────────── export interface SkillEntry { - name: string - absolutePath: string - relPath: string + name: string; + absolutePath: string; + relPath: string; } export function discoverSkills(projectDir: string): SkillEntry[] { - const skillsDir = join(projectDir, ".agents", "skills") - if (!existsSync(skillsDir)) return [] + const skillsDir = join(projectDir, ".agents", "skills"); + if (!existsSync(skillsDir)) return []; - const out: SkillEntry[] = [] - let entries: ReturnType<typeof readdirSync> + const out: SkillEntry[] = []; + let entries: ReturnType<typeof readdirSync>; try { - entries = readdirSync(skillsDir, { withFileTypes: true }) + entries = readdirSync(skillsDir, { withFileTypes: true }); } catch { - return out + return out; } for (const entry of entries) { - if (!entry.isDirectory()) continue - if (entry.name.startsWith("_")) continue + if (!entry.isDirectory()) continue; + if (entry.name.startsWith("_")) continue; - const skillPath = join(skillsDir, entry.name, "SKILL.md") - if (!existsSync(skillPath)) continue + const skillPath = join(skillsDir, entry.name, "SKILL.md"); + if (!existsSync(skillPath)) continue; out.push({ name: entry.name, absolutePath: skillPath, relPath: join(".agents", "skills", entry.name, "SKILL.md"), - }) + }); } - return out + 
return out; } // ── Matching ────────────────────────────────────────────────── export interface SkillMatch { - name: string - relPath: string - score: number - matchedTriggers: string[] + name: string; + relPath: string; + score: number; + matchedTriggers: string[]; } export function matchSkills( @@ -166,37 +177,37 @@ export function matchSkills( skills: SkillEntry[], config: SkillsTriggerConfig, ): SkillMatch[] { - const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS - const matches: SkillMatch[] = [] + const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS; + const matches: SkillMatch[] = []; for (const skill of skills) { - const jsonEntry = config.skills?.[skill.name] - if (!jsonEntry) continue + const jsonEntry = config.skills?.[skill.name]; + if (!jsonEntry) continue; const jsonTriggers = [ ...(jsonEntry.keywords["*"] ?? []), ...(jsonEntry.keywords.en ?? []), ...(lang !== "en" ? (jsonEntry.keywords[lang] ?? []) : []), - ] + ]; - const seen = new Set<string>() - const allTriggers: string[] = [] + const seen = new Set<string>(); + const allTriggers: string[] = []; for (const t of jsonTriggers) { - const key = t.toLowerCase() - if (seen.has(key)) continue - seen.add(key) - allTriggers.push(t) + const key = t.toLowerCase(); + if (seen.has(key)) continue; + seen.add(key); + allTriggers.push(t); } - if (allTriggers.length === 0) continue + if (allTriggers.length === 0) continue; - const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts) - const matched: string[] = [] - let score = 0 + const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts); + const matched: string[] = []; + let score = 0; for (let i = 0; i < patterns.length; i++) { if (patterns[i].test(prompt)) { - matched.push(allTriggers[i]) - score += 10 + matched.push(allTriggers[i]); + score += 10; } } @@ -206,43 +217,45 @@ export function matchSkills( relPath: skill.relPath, score, matchedTriggers: matched, - }) + }); } } - matches.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.name.localeCompare(b.name))) - return matches.slice(0, MAX_SKILLS) + matches.sort((a, b) => + b.score !== a.score ? 
b.score - a.score : a.name.localeCompare(b.name), + ); + return matches.slice(0, MAX_SKILLS); } // ── Session Dedup State ─────────────────────────────────────── interface SessionState { - sessions: Record<string, { injected: string[]; timestamp: number }> + sessions: Record<string, { injected: string[]; timestamp: number }>; } function getStatePath(projectDir: string): string { - return join(projectDir, ".agents", "state", "skill-sessions.json") + return join(projectDir, ".agents", "state", "skill-sessions.json"); } function readState(projectDir: string): SessionState { - const p = getStatePath(projectDir) - if (!existsSync(p)) return { sessions: {} } + const p = getStatePath(projectDir); + if (!existsSync(p)) return { sessions: {} }; try { - const parsed = JSON.parse(readFileSync(p, "utf-8")) + const parsed = JSON.parse(readFileSync(p, "utf-8")); if (parsed && typeof parsed === "object" && parsed.sessions) { - return parsed as SessionState + return parsed as SessionState; } } catch { // corrupted — reset } - return { sessions: {} } + return { sessions: {} }; } function writeState(projectDir: string, state: SessionState): void { - const p = getStatePath(projectDir) + const p = getStatePath(projectDir); try { - mkdirSync(dirname(p), { recursive: true }) - writeFileSync(p, JSON.stringify(state, null, 2)) + mkdirSync(dirname(p), { recursive: true }); + writeFileSync(p, JSON.stringify(state, null, 2)); } catch { // dedup failing open is acceptable } @@ -254,47 +267,57 @@ export function filterFreshMatches( sessionId: string, now: number = Date.now(), ): { fresh: SkillMatch[]; nextState: SessionState } { - const state = readState(projectDir) + const state = readState(projectDir); for (const [id, sess] of Object.entries(state.sessions)) { if (now - sess.timestamp > SESSION_TTL_MS) { - delete state.sessions[id] + delete state.sessions[id]; } } - const current = state.sessions[sessionId] - const alreadyInjected = new Set(current && now - current.timestamp <= SESSION_TTL_MS ? current.injected : []) + const current = state.sessions[sessionId]; + const alreadyInjected = new Set( + current && now - current.timestamp <= SESSION_TTL_MS + ? current.injected + : [], + ); - const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)) + const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)); if (fresh.length > 0) { - const existing = state.sessions[sessionId]?.injected ?? [] + const existing = state.sessions[sessionId]?.injected ?? 
[]; state.sessions[sessionId] = { injected: [...new Set([...existing, ...fresh.map((m) => m.relPath)])], timestamp: now, - } + }; } - return { fresh, nextState: state } + return { fresh, nextState: state }; } // ── Workflow Guard ──────────────────────────────────────────── -export function isPersistentWorkflowActive(projectDir: string, sessionId: string): boolean { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return false +export function isPersistentWorkflowActive( + projectDir: string, + sessionId: string, +): boolean { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return false; try { - const files = readdirSync(stateDir) - return files.some((f) => f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json") + const files = readdirSync(stateDir); + return files.some( + (f) => + f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json", + ); } catch { - return false + return false; } } // ── Prompt Sanitation ───────────────────────────────────────── export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } export function stripCodeBlocks(text: string): string { @@ -303,7 +326,7 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") .replace(/`{3,}[^`]*`{3,}/g, "") .replace(/`[^`\n]+`/g, "") - .replace(/"[^"\n]*"/g, "") + .replace(/"[^"\n]*"/g, ""); } // ── Context Formatting ──────────────────────────────────────── @@ -313,55 +336,61 @@ export function formatContext(matches: SkillMatch[]): string { `[OMA SKILLS DETECTED: ${matches.map((m) => m.name).join(", ")}]`, "User intent matches the following skills:", "", - ] + ]; for (const m of matches) { - lines.push(`- **${m.name}** — \`${m.relPath}\``) - lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`) + lines.push(`- **${m.name}** — \`${m.relPath}\``); + lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`); } - lines.push("") - lines.push("Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.") - return lines.join("\n") + lines.push(""); + lines.push( + "Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.", + ); + return lines.join("\n"); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? 
"" - - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) - if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0) - - const lang = detectLanguage(projectDir) - const config = loadTriggersConfig() - const cleaned = stripCodeBlocks(prompt) - const skills = discoverSkills(projectDir) - - const matches = matchSkills(cleaned, lang, skills, config) - if (matches.length === 0) process.exit(0) - - const { fresh, nextState } = filterFreshMatches(matches, projectDir, sessionId) - if (fresh.length === 0) process.exit(0) - - writeState(projectDir, nextState) - process.stdout.write(makePromptOutput(vendor, formatContext(fresh))) - process.exit(0) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? ""; + + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); + if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0); + + const lang = detectLanguage(projectDir); + const config = loadTriggersConfig(); + const cleaned = stripCodeBlocks(prompt); + const skills = discoverSkills(projectDir); + + const matches = matchSkills(cleaned, lang, skills, config); + if (matches.length === 0) process.exit(0); + + const { fresh, nextState } = filterFreshMatches( + matches, + projectDir, + sessionId, + ); + if (fresh.length === 0) process.exit(0); + + writeState(projectDir, nextState); + process.stdout.write(makePromptOutput(vendor, formatContext(fresh))); + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } // Avoid unused-import lint for basename when testing subsets of this module. 
-void basename +void basename; diff --git a/.gemini/hooks/test-filter.ts b/.gemini/hooks/test-filter.ts index a0ce2fc..a3ad992 100644 --- a/.gemini/hooks/test-filter.ts +++ b/.gemini/hooks/test-filter.ts @@ -1,51 +1,61 @@ // PreToolUse hook — Filter test output to show only failures // Works with: Claude Code, Codex CLI, Gemini CLI, Qwen Code -import { existsSync } from "node:fs" -import { join } from "node:path" -import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts"; // --- Vendor detection (same logic as keyword-detector.ts) --- +function inferVendorFromScriptPath(): Vendor | null { + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; +} + function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "BeforeTool") return "gemini" - if (event === "PreToolUse") { - if ("session_id" in input && !("sessionId" in input)) return "codex" - } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeTool") return "gemini"; + if (event === "PreToolUse" && "session_id" in input) return "codex"; + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getHookDir(vendor: Vendor): string { switch (vendor) { case "codex": - return ".codex/hooks" + return ".codex/hooks"; case "gemini": - return ".gemini/hooks" + return ".gemini/hooks"; case "qwen": - return ".qwen/hooks" + return ".qwen/hooks"; default: - return ".claude/hooks" + return ".claude/hooks"; } } @@ -78,66 +88,70 @@ const TEST_PATTERNS = [ /\brspec\b/, /\bmix\s+test\b/, /\bphpunit\b/, -] +]; // Commands that mention test runners but aren't running tests const EXCLUDE_PATTERNS = [ /\b(install|add|remove|uninstall|init)\b/, /\b(cat|head|tail|less|more|wc)\b.*\.(test|spec)\./, -] +]; // --- Hook input --- interface PreToolUseInput { - tool_name: string + tool_name: string; tool_input: { - command?: string - [key: string]: unknown - } - hook_event_name?: string - session_id?: string - sessionId?: string - cwd?: string + command?: string; + [key: string]: unknown; + }; + hook_event_name?: string; + session_id?: string; + sessionId?: 
string; + cwd?: string; } // --- Main --- -const raw = await Bun.stdin.text() -if (!raw.trim()) process.exit(0) +const raw = await Bun.stdin.text(); +if (!raw.trim()) process.exit(0); -const input: PreToolUseInput = JSON.parse(raw) +const input: PreToolUseInput = JSON.parse(raw); // Gemini uses run_shell_command; Claude-family uses Bash. if (input.tool_name !== "Bash" && input.tool_name !== "run_shell_command") { - process.exit(0) + process.exit(0); } -const command = input.tool_input?.command -if (!command) process.exit(0) +const command = input.tool_input?.command; +if (!command) process.exit(0); // Check if this is a test command -const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)) -if (!isTestCommand) process.exit(0) +const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)); +if (!isTestCommand) process.exit(0); // Skip if it's a non-test use of test tool names (install, cat, etc.) -const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)) -if (isExcluded) process.exit(0) +const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)); +if (isExcluded) process.exit(0); // Detect vendor and resolve project dir -const vendor = detectVendor(input) -const projectDir = getProjectDir(vendor, input) -const filterScript = join(projectDir, getHookDir(vendor), "filter-test-output.sh") +const vendor = detectVendor(input); +const projectDir = getProjectDir(vendor, input); +const filterScript = join( + projectDir, + getHookDir(vendor), + "filter-test-output.sh", +); // Skip filtering if the script doesn't exist (hooks not fully installed) -if (!existsSync(filterScript)) process.exit(0) +if (!existsSync(filterScript)) process.exit(0); // Rewrite command to pipe through filter -const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"` +const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"`; // Return updated input with all original fields preserved const updatedInput: Record<string, unknown> = { ...input.tool_input, command: filteredCmd, -} +}; -console.log(makePreToolOutput(vendor, updatedInput)) +console.log(makePreToolOutput(vendor, updatedInput)); diff --git a/.gemini/hooks/triggers.json b/.gemini/hooks/triggers.json index f404583..0a1513f 100644 --- a/.gemini/hooks/triggers.json +++ b/.gemini/hooks/triggers.json @@ -43,9 +43,35 @@ "全部お願い", "まとめてやって" ], - "zh": ["编排", "并行执行", "自动执行", "全部执行", "全部做", "自动处理", "一起做", "全做了", "帮我全做"], - "es": ["orquestar", "paralelo", "ejecutar todo", "hazlo todo", "ejecuta todo", "automatiza", "haz todo"], - "fr": ["orchestrer", "parallèle", "tout exécuter", "fais tout", "exécute tout", "automatise", "gère tout"], + "zh": [ + "编排", + "并行执行", + "自动执行", + "全部执行", + "全部做", + "自动处理", + "一起做", + "全做了", + "帮我全做" + ], + "es": [ + "orquestar", + "paralelo", + "ejecutar todo", + "hazlo todo", + "ejecuta todo", + "automatiza", + "haz todo" + ], + "fr": [ + "orchestrer", + "parallèle", + "tout exécuter", + "fais tout", + "exécute tout", + "automatise", + "gère tout" + ], "de": [ "orchestrieren", "parallel", @@ -55,7 +81,15 @@ "automatisieren", "alles auf einmal" ], - "pt": ["orquestrar", "paralelo", "executar tudo", "faça tudo", "execute tudo", "automatize", "resolva tudo"], + "pt": [ + "orquestrar", + "paralelo", + "executar tudo", + "faça tudo", + "execute tudo", + "automatize", + "resolva tudo" + ], "ru": [ "оркестровать", "параллельно", @@ -128,7 +162,16 @@ "トレードオフ", "品質特性" ], - "zh": ["架构", "系统设计", "软件设计", "架构评审", "模块边界", "服务边界", "权衡分析", "质量属性"], + "zh": [ + "架构", + "系统设计", + "软件设计", + "架构评审", 
+ "模块边界", + "服务边界", + "权衡分析", + "质量属性" + ], "es": [ "arquitectura", "diseño de sistemas", @@ -205,7 +248,15 @@ "persistent": false, "keywords": { "*": ["task breakdown"], - "en": ["plan", "make a plan", "create a plan", "break down", "analyze requirements", "plan this", "decompose"], + "en": [ + "plan", + "make a plan", + "create a plan", + "break down", + "analyze requirements", + "plan this", + "decompose" + ], "ko": [ "계획", "요구사항 분석", @@ -235,7 +286,16 @@ "設計して", "プランを作って" ], - "zh": ["计划", "需求分析", "任务分解", "制定计划", "做个计划", "分析一下", "拆分任务", "规划一下"], + "zh": [ + "计划", + "需求分析", + "任务分解", + "制定计划", + "做个计划", + "分析一下", + "拆分任务", + "规划一下" + ], "es": [ "plan", "planificar", @@ -286,7 +346,15 @@ "разбей на задачи", "спланируй" ], - "nl": ["plan", "plannen", "vereistenanalyse", "maak een plan", "analyseer", "splits op", "plan dit"], + "nl": [ + "plan", + "plannen", + "vereistenanalyse", + "maak een plan", + "analyseer", + "splits op", + "plan dit" + ], "pl": [ "plan", "planować", @@ -303,7 +371,15 @@ "persistent": false, "keywords": { "*": ["code review", "security audit", "security review"], - "en": ["review", "review this", "review my code", "check my code", "audit", "inspect", "code check"], + "en": [ + "review", + "review this", + "review my code", + "check my code", + "audit", + "inspect", + "code check" + ], "ko": [ "리뷰", "코드 검토", @@ -330,7 +406,17 @@ "点検して", "コード確認" ], - "zh": ["审查", "代码审查", "安全审计", "审查一下", "检查一下", "看看代码", "检查代码", "代码检查", "安全检查"], + "zh": [ + "审查", + "代码审查", + "安全审计", + "审查一下", + "检查一下", + "看看代码", + "检查代码", + "代码检查", + "安全检查" + ], "es": [ "revisión", "revisar código", @@ -613,7 +699,17 @@ "アイデアちょうだい", "一緒に考えよう" ], - "zh": ["头脑风暴", "创意", "设计探索", "想想", "出主意", "有什么想法", "想个办法", "出点子", "集思广益"], + "zh": [ + "头脑风暴", + "创意", + "设计探索", + "想想", + "出主意", + "有什么想法", + "想个办法", + "出点子", + "集思广益" + ], "es": [ "lluvia de ideas", "idear", @@ -644,7 +740,16 @@ "vorschläge", "lass uns überlegen" ], - "pt": ["brainstorming", "idear", "explorar design", "pense em", "e se", "ideias para", "sugira", "imagine"], + "pt": [ + "brainstorming", + "idear", + "explorar design", + "pense em", + "e se", + "ideias para", + "sugira", + "imagine" + ], "ru": [ "мозговой штурм", "идеи", @@ -681,7 +786,13 @@ "persistent": true, "keywords": { "*": ["work", "step by step"], - "en": ["one by one", "guide me", "walk me through", "manual mode", "one step at a time"], + "en": [ + "one by one", + "guide me", + "walk me through", + "manual mode", + "one step at a time" + ], "ko": [ "단계별", "단계별로", @@ -693,9 +804,32 @@ "차근차근 해줘", "수동으로 해줘" ], - "ja": ["ステップバイステップ", "一歩ずつ", "ガイドして", "手動で", "一つずつ", "順番にやって", "手順を教えて"], - "zh": ["逐步", "一步一步", "指导我", "手动", "一个一个", "按顺序", "带我做"], - "es": ["paso a paso", "guíame", "uno por uno", "modo manual", "de a uno", "llévame paso a paso"], + "ja": [ + "ステップバイステップ", + "一歩ずつ", + "ガイドして", + "手動で", + "一つずつ", + "順番にやって", + "手順を教えて" + ], + "zh": [ + "逐步", + "一步一步", + "指导我", + "手动", + "一个一个", + "按顺序", + "带我做" + ], + "es": [ + "paso a paso", + "guíame", + "uno por uno", + "modo manual", + "de a uno", + "llévame paso a paso" + ], "fr": [ "étape par étape", "guide-moi", @@ -712,8 +846,22 @@ "zeig mir wie", "der reihe nach" ], - "pt": ["passo a passo", "me guie", "um por um", "modo manual", "me acompanhe", "me mostre passo a passo"], - "ru": ["шаг за шагом", "направь меня", "по одному", "ручной режим", "покажи по шагам", "веди меня"], + "pt": [ + "passo a passo", + "me guie", + "um por um", + "modo manual", + "me acompanhe", + "me mostre passo a passo" + ], + "ru": [ + "шаг за шагом", + "направь 
меня", + "по одному", + "ручной режим", + "покажи по шагам", + "веди меня" + ], "nl": [ "stap voor stap", "begeleid me", @@ -736,7 +884,14 @@ "persistent": false, "keywords": { "*": ["deepinit"], - "en": ["init project", "initialize", "setup project", "new project", "scaffold", "bootstrap"], + "en": [ + "init project", + "initialize", + "setup project", + "new project", + "scaffold", + "bootstrap" + ], "ko": [ "프로젝트 초기화", "코드베이스 초기화", @@ -757,7 +912,15 @@ "プロジェクトを作って", "プロジェクト設定" ], - "zh": ["项目初始化", "新项目", "设置项目", "搭建项目", "初始化", "创建项目", "项目配置"], + "zh": [ + "项目初始化", + "新项目", + "设置项目", + "搭建项目", + "初始化", + "创建项目", + "项目配置" + ], "es": [ "inicializar proyecto", "nuevo proyecto", @@ -1381,8 +1544,20 @@ "define boundaries", "architecture tradeoffs" ], - "ko": ["아키텍처 짜줘", "시스템 구조 설계", "경계 정의해줘", "구조 검토해줘", "아키텍처 문서"], - "ja": ["アーキテクチャを設計", "システム構成を考えて", "境界を定義", "構成レビュー", "アーキ文書"], + "ko": [ + "아키텍처 짜줘", + "시스템 구조 설계", + "경계 정의해줘", + "구조 검토해줘", + "아키텍처 문서" + ], + "ja": [ + "アーキテクチャを設計", + "システム構成を考えて", + "境界を定義", + "構成レビュー", + "アーキ文書" + ], "zh": ["设计架构", "系统架构方案", "定义边界", "架构文档", "架构权衡"] } }, @@ -1403,17 +1578,45 @@ "server implementation", "clean architecture" ], - "ko": ["api 만들어줘", "엔드포인트 추가", "백엔드 구현", "마이그레이션 작성", "인증 붙여줘"], - "ja": ["apiを作って", "エンドポイント追加", "バックエンド実装", "マイグレーション書いて", "認証を実装"], + "ko": [ + "api 만들어줘", + "엔드포인트 추가", + "백엔드 구현", + "마이그레이션 작성", + "인증 붙여줘" + ], + "ja": [ + "apiを作って", + "エンドポイント追加", + "バックエンド実装", + "マイグレーション書いて", + "認証を実装" + ], "zh": ["写个接口", "加接口", "后端实现", "写迁移", "加认证"] } }, "oma-brainstorm": { "keywords": { "*": [], - "en": ["toss around ideas", "kick around options", "spitball", "some ideas please", "ideation session"], - "ko": ["아이디어 좀 뽑아줘", "같이 고민해줘", "아이디어 내보자", "방향성 고민"], - "ja": ["アイデア出して", "一緒に考えて", "方向性を探りたい", "案を出して"], + "en": [ + "toss around ideas", + "kick around options", + "spitball", + "some ideas please", + "ideation session" + ], + "ko": [ + "아이디어 좀 뽑아줘", + "같이 고민해줘", + "아이디어 내보자", + "방향성 고민" + ], + "ja": [ + "アイデア出して", + "一緒に考えて", + "方向性を探りたい", + "案を出して" + ], "zh": ["帮我想想", "一起想想办法", "给点灵感"] } }, @@ -1430,8 +1633,18 @@ "cli handoff", "manual orchestration" ], - "ko": ["에이전트 조율", "에이전트끼리 협업", "수동으로 에이전트 돌려", "에이전트 순서 잡아줘"], - "ja": ["エージェントを調整", "エージェント連携", "手動でエージェント", "エージェントの順序"], + "ko": [ + "에이전트 조율", + "에이전트끼리 협업", + "수동으로 에이전트 돌려", + "에이전트 순서 잡아줘" + ], + "ja": [ + "エージェントを調整", + "エージェント連携", + "手動でエージェント", + "エージェントの順序" + ], "zh": ["协调代理", "代理之间协作", "手动跑代理", "代理之间衔接"] } }, @@ -1454,8 +1667,20 @@ "data migration", "capacity planning" ], - "ko": ["스키마 설계", "테이블 설계", "인덱스 튜닝", "쿼리 느려", "용량 산정"], - "ja": ["スキーマ設計", "テーブル設計", "インデックス調整", "クエリが遅い", "容量見積"], + "ko": [ + "스키마 설계", + "테이블 설계", + "인덱스 튜닝", + "쿼리 느려", + "용량 산정" + ], + "ja": [ + "スキーマ設計", + "テーブル設計", + "インデックス調整", + "クエリが遅い", + "容量見積" + ], "zh": ["设计表结构", "表设计", "索引优化", "查询很慢", "容量评估"] } }, @@ -1474,8 +1699,20 @@ "crash fix", "error investigation" ], - "ko": ["버그 찾아줘", "에러 원인", "크래시 분석", "스택트레이스 봐줘", "원인 파악해줘"], - "ja": ["バグを探して", "エラー原因", "クラッシュを分析", "スタックトレースを見て", "原因を特定"], + "ko": [ + "버그 찾아줘", + "에러 원인", + "크래시 분석", + "스택트레이스 봐줘", + "원인 파악해줘" + ], + "ja": [ + "バグを探して", + "エラー原因", + "クラッシュを分析", + "スタックトレースを見て", + "原因を特定" + ], "zh": ["找出 bug", "错误原因", "分析崩溃", "看堆栈", "定位原因"] } }, @@ -1493,8 +1730,19 @@ "responsive layout", "motion design" ], - "ko": ["디자인 토큰", "랜딩 만들어줘", "컬러 팔레트 잡아줘", "타이포 스케일", "모션 가이드"], - "ja": ["デザイントークン", "ランディング作成", "カラーパレット決めて", "モーション設計"], + "ko": [ + "디자인 토큰", + "랜딩 만들어줘", + "컬러 팔레트 잡아줘", + "타이포 스케일", + "모션 가이드" + ], + "ja": [ + "デザイントークン", + "ランディング作成", 
+ "カラーパレット決めて", + "モーション設計" + ], "zh": ["设计令牌", "做个落地页", "定配色", "字体层级", "动效规范"] } }, @@ -1514,9 +1762,27 @@ "release automation", "build automation" ], - "ko": ["mise 태스크", "ci 파이프라인", "릴리즈 자동화", "깃 훅 설정", "모노레포 워크플로우"], - "ja": ["miseタスク", "ciパイプライン", "リリース自動化", "gitフック", "モノレポ作業"], - "zh": ["mise 任务", "ci 流水线", "发布自动化", "git 钩子", "monorepo 工作流"] + "ko": [ + "mise 태스크", + "ci 파이프라인", + "릴리즈 자동화", + "깃 훅 설정", + "모노레포 워크플로우" + ], + "ja": [ + "miseタスク", + "ciパイプライン", + "リリース自動化", + "gitフック", + "モノレポ作業" + ], + "zh": [ + "mise 任务", + "ci 流水线", + "发布自动化", + "git 钩子", + "monorepo 工作流" + ] } }, "oma-frontend": { @@ -1534,9 +1800,27 @@ "frontend ui", "FSD architecture" ], - "ko": ["리액트 컴포넌트", "넥스트 페이지", "tailwind로 스타일", "shadcn 붙여줘", "프론트 구현"], - "ja": ["reactコンポーネント", "nextページ", "tailwindで装飾", "shadcn導入", "フロント実装"], - "zh": ["写个 react 组件", "next 页面", "用 tailwind", "接入 shadcn", "前端实现"] + "ko": [ + "리액트 컴포넌트", + "넥스트 페이지", + "tailwind로 스타일", + "shadcn 붙여줘", + "프론트 구현" + ], + "ja": [ + "reactコンポーネント", + "nextページ", + "tailwindで装飾", + "shadcn導入", + "フロント実装" + ], + "zh": [ + "写个 react 组件", + "next 页面", + "用 tailwind", + "接入 shadcn", + "前端实现" + ] } }, "oma-hwp": { @@ -1551,7 +1835,16 @@ "hangul word processor", "hwp ingestion" ], - "ko": ["한글 파일", "한글 변환", "한글 파싱", "hwp 변환", "hwp 파싱", "hwp 마크다운", "hwpx 변환", "hwpx 파싱"], + "ko": [ + "한글 파일", + "한글 변환", + "한글 파싱", + "hwp 변환", + "hwp 파싱", + "hwp 마크다운", + "hwpx 변환", + "hwpx 파싱" + ], "ja": ["hwp変換", "hwpをマークダウン", "hwpを解析", "韓国語ワープロ"], "zh": ["hwp 转换", "hwp 解析", "hwp 转 markdown", "韩文文档"] } @@ -1571,9 +1864,233 @@ "mobile app", "android ios" ], - "ko": ["플러터 화면", "리액트 네이티브 화면", "다트 위젯", "안드로이드 아이폰 앱", "모바일 앱"], - "ja": ["flutter画面", "react native画面", "dartウィジェット", "iosアンドロイド", "モバイルアプリ"], - "zh": ["flutter 页面", "react native 页面", "dart 组件", "安卓 ios", "移动端应用"] + "ko": [ + "플러터 화면", + "리액트 네이티브 화면", + "다트 위젯", + "안드로이드 아이폰 앱", + "모바일 앱" + ], + "ja": [ + "flutter画面", + "react native画面", + "dartウィジェット", + "iosアンドロイド", + "モバイルアプリ" + ], + "zh": [ + "flutter 页面", + "react native 页面", + "dart 组件", + "安卓 ios", + "移动端应用" + ] + } + }, + "oma-observability": { + "keywords": { + "*": [ + "OpenTelemetry", + "OTel", + "OTLP", + "W3C Trace Context", + "traceparent", + "MELT", + "APM", + "RUM", + "SLO", + "SLI", + "burn-rate", + "PromQL", + "Prometheus", + "Grafana", + "Jaeger", + "Tempo", + "Loki", + "Mimir", + "Fluent Bit", + "OpenCost", + "OpenFeature", + "Flagger", + "Falco", + "Parca", + "Pyroscope", + "Honeycomb", + "Datadog", + "Sentry", + "Crashlytics", + "Core Web Vitals" + ], + "en": [ + "observability", + "traceability", + "telemetry", + "distributed tracing", + "instrument my service", + "set up OTel", + "OTel pipeline", + "collector topology", + "tail sampling", + "cardinality budget", + "clock skew", + "error budget", + "burn rate alert", + "canary analysis", + "progressive delivery", + "feature flag observability", + "incident forensics", + "6-dimension localization", + "root cause across services", + "multi-tenant telemetry", + "per-tenant sampling", + "data residency telemetry", + "redact PII in logs", + "observability as code", + "dashboard as code", + "PrometheusRule CRD", + "Grafana Jsonnet", + "Perses dashboard", + "UDP MTU telemetry", + "StatsD fragmentation", + "OTLP gRPC vs HTTP", + "propagator matrix", + "BGP observability", + "QUIC observability", + "eBPF observability", + "service mesh tracing", + "zero code instrumentation", + "mobile crash analytics", + "crash-free rate", + "symbolication pipeline", + "offline telemetry queue" + ], + "ko": [ + 
"관측성", + "관측 가능성", + "추적성", + "추적 가능성", + "텔레메트리", + "텔레메트리 수집", + "분산 트레이싱", + "OTel 도입", + "OTel 셋업", + "OTel 계측", + "OTel 파이프라인", + "컬렉터 토폴로지", + "테일 샘플링", + "카디널리티", + "카디널리티 관리", + "클록 스큐", + "시계 드리프트", + "에러 버짓", + "에러 예산", + "번레이트 알람", + "번레이트", + "카나리 분석", + "프로그레시브 딜리버리", + "점진 배포", + "피처 플래그 관측", + "사건 부검", + "장애 부검", + "장애 원인 분석", + "6차원 좁히기", + "멀티테넌트 관측", + "테넌트별 샘플링", + "데이터 거주 관측", + "로그 PII 제거", + "로그 익명화", + "로그 가명화", + "관측성 as code", + "대시보드 as code", + "대시보드 코드화", + "PrometheusRule", + "Grafana Jsonnet", + "Perses 대시보드", + "UDP MTU 튜닝", + "StatsD 단편화", + "OTLP gRPC 선택", + "전파자 매핑", + "BGP 관측", + "QUIC 관측", + "eBPF 관측", + "서비스 메시 트레이싱", + "zero-code 계측", + "모바일 크래시 분석", + "크래시 프리 레이트", + "심볼리케이션", + "오프라인 텔레메트리 큐" + ], + "ja": [ + "オブザーバビリティ", + "トレーサビリティ", + "テレメトリ", + "分散トレーシング", + "OTel導入", + "OTelパイプライン", + "コレクタ構成", + "テイルサンプリング", + "カーディナリティ予算", + "クロックスキュー", + "エラーバジェット", + "バーンレートアラート", + "カナリア分析", + "プログレッシブデリバリ", + "機能フラグ観測", + "インシデントフォレンジック", + "マルチテナント観測", + "データ居住性観測", + "ログPII除去", + "Observability as Code", + "Dashboard as Code", + "UDP MTUチューニング", + "StatsDフラグメンテーション", + "OTLP選択", + "プロパゲータマッピング", + "BGP観測", + "QUIC観測", + "eBPF観測", + "サービスメッシュトレース", + "モバイルクラッシュ分析", + "クラッシュフリーレート", + "シンボリケーション", + "オフラインテレメトリ" + ], + "zh": [ + "可观测性", + "可追溯性", + "遥测", + "分布式追踪", + "OTel 接入", + "OTel 流水线", + "采集器拓扑", + "尾采样", + "基数预算", + "时钟漂移", + "错误预算", + "燃烧率告警", + "金丝雀分析", + "渐进式发布", + "特性开关观测", + "事件取证", + "多租户观测", + "数据驻留观测", + "日志脱敏", + "可观测性即代码", + "仪表盘即代码", + "UDP MTU 调优", + "StatsD 分片", + "OTLP 选择", + "传播器映射", + "BGP 观测", + "QUIC 观测", + "eBPF 观测", + "服务网格追踪", + "零代码探针", + "移动崩溃分析", + "崩溃无事率", + "符号化", + "离线遥测队列" + ] } }, "oma-orchestrator": { @@ -1590,8 +2107,18 @@ "review loop", "mcp memory coordination" ], - "ko": ["에이전트 병렬 실행", "동시에 에이전트 돌려", "fan-out", "리뷰 루프 돌려"], - "ja": ["エージェント並列実行", "同時にエージェント", "fan-out", "レビューループ"], + "ko": [ + "에이전트 병렬 실행", + "동시에 에이전트 돌려", + "fan-out", + "리뷰 루프 돌려" + ], + "ja": [ + "エージェント並列実行", + "同時にエージェント", + "fan-out", + "レビューループ" + ], "zh": ["并行跑代理", "同时派发代理", "fan-out 任务", "评审循环"] } }, @@ -1628,8 +2155,20 @@ "scope definition", "prioritization matrix" ], - "ko": ["요구사항 정리", "스펙 문서", "우선순위 매겨줘", "스코프 정의", "제품 로드맵"], - "ja": ["要件を整理", "スペック作成", "優先度付け", "スコープ定義", "プロダクトロードマップ"], + "ko": [ + "요구사항 정리", + "스펙 문서", + "우선순위 매겨줘", + "스코프 정의", + "제품 로드맵" + ], + "ja": [ + "要件を整理", + "スペック作成", + "優先度付け", + "スコープ定義", + "プロダクトロードマップ" + ], "zh": ["梳理需求", "写规格书", "排优先级", "界定范围", "产品路线图"] } }, @@ -1647,7 +2186,12 @@ "test coverage" ], "ko": ["접근성 점검", "성능 점검", "커버리지 확인", "품질 게이트"], - "ja": ["アクセシビリティ確認", "パフォーマンス点検", "カバレッジ確認", "品質ゲート"], + "ja": [ + "アクセシビリティ確認", + "パフォーマンス点検", + "カバレッジ確認", + "品質ゲート" + ], "zh": ["无障碍检查", "性能检查", "覆盖率报告", "质量门禁"] } }, @@ -1666,8 +2210,20 @@ "transcript analysis", "multi tool recap" ], - "ko": ["오늘 한 일 정리", "하루 요약", "주간 요약", "작업 내용 정리", "대화 요약"], - "ja": ["今日の作業まとめ", "日次サマリ", "週次サマリ", "作業振り返り", "会話まとめ"], + "ko": [ + "오늘 한 일 정리", + "하루 요약", + "주간 요약", + "작업 내용 정리", + "대화 요약" + ], + "ja": [ + "今日の作業まとめ", + "日次サマリ", + "週次サマリ", + "作業振り返り", + "会話まとめ" + ], "zh": ["今天做了什么", "日报总结", "周报总结", "工作回顾", "对话总结"] } }, @@ -1685,7 +2241,12 @@ "git worktree" ], "ko": ["머지 충돌 해결", "리베이스해줘", "워크트리 써줘"], - "ja": ["マージ衝突解決", "リベースして", "リリースタグ", "worktree使って"], + "ja": [ + "マージ衝突解決", + "リベースして", + "リリースタグ", + "worktree使って" + ], "zh": ["解决合并冲突", "帮我 rebase", "打发布标签", "用 worktree"] } }, @@ -1705,8 +2266,20 @@ "library reference", "context7 docs" ], - "ko": ["검색해줘", "찾아줘", "레퍼런스 찾아", "문서 찾아줘", "라이브러리 찾아줘"], - "ja": ["検索して", "調べて", 
"ドキュメント探して", "ライブラリ調べて", "リファレンス探して"], + "ko": [ + "검색해줘", + "찾아줘", + "레퍼런스 찾아", + "문서 찾아줘", + "라이브러리 찾아줘" + ], + "ja": [ + "検索して", + "調べて", + "ドキュメント探して", + "ライブラリ調べて", + "リファレンス探して" + ], "zh": ["帮我查", "搜一下", "找找文档", "找个库", "查参考资料"] } }, @@ -1725,9 +2298,27 @@ "oidc setup", "cost optimization" ], - "ko": ["테라폼 플랜", "인프라 프로비저닝", "iac 모듈", "클라우드 리소스", "비용 최적화"], - "ja": ["terraformプラン", "インフラ構築", "iacモジュール", "クラウドリソース", "コスト最適化"], - "zh": ["terraform plan", "搭建基础设施", "iac 模块", "云资源", "成本优化"] + "ko": [ + "테라폼 플랜", + "인프라 프로비저닝", + "iac 모듈", + "클라우드 리소스", + "비용 최적화" + ], + "ja": [ + "terraformプラン", + "インフラ構築", + "iacモジュール", + "クラウドリソース", + "コスト最適化" + ], + "zh": [ + "terraform plan", + "搭建基础设施", + "iac 模块", + "云资源", + "成本优化" + ] } }, "oma-translator": { @@ -1744,10 +2335,104 @@ "multilingual content", "arb translation" ], - "ko": ["번역해줘", "번역 부탁", "다국어로", "영어로 바꿔줘", "현지화해줘"], + "ko": [ + "번역해줘", + "번역 부탁", + "다국어로", + "영어로 바꿔줘", + "현지화해줘" + ], "ja": ["翻訳して", "英訳", "多言語化", "ローカライズして", "訳して"], "zh": ["翻译一下", "帮我翻译", "多语言", "本地化", "翻成英文"] } + }, + "oma-image": { + "keywords": { + "*": [ + "nano-banana", + "nanobanana", + "gpt-image", + "pollinations", + "oma-image" + ], + "en": [ + "generate image", + "generate an image", + "create image", + "create an image", + "make a picture", + "make an image", + "render image", + "render a picture", + "draw me", + "draw a", + "ai image", + "image generation", + "generate a photo", + "create picture", + "picture of", + "image of" + ], + "ko": [ + "이미지 만들어", + "이미지 만들어줘", + "이미지 생성", + "이미지 생성해", + "이미지 생성해줘", + "사진 만들어", + "사진 만들어줘", + "그림 그려", + "그림 그려줘", + "이미지 뽑아", + "이미지 뽑아줘", + "이미지 그려줘", + "이미지 출력", + "나노바나나", + "나노 바나나", + "바나나로 뽑", + "이미지 생성기", + "ai 이미지" + ], + "ja": [ + "画像を生成", + "画像生成", + "画像を作", + "画像を作成", + "絵を描いて", + "画像出力", + "イラストを生成", + "写真を生成" + ], + "zh": [ + "生成图像", + "生成图片", + "生成一张", + "画一张", + "画一幅", + "帮我画", + "出图", + "图像生成", + "图片生成" + ], + "es": [ + "generar imagen", + "crear imagen", + "hazme una imagen", + "genera una foto" + ], + "fr": [ + "générer une image", + "créer une image", + "fais-moi une image", + "dessine-moi" + ], + "de": [ + "bild generieren", + "bild erstellen", + "erstelle ein bild", + "zeichne mir" + ] + } } }, "informationalPatterns": { @@ -1768,22 +2453,108 @@ "是什么", "とは" ], - "ko": ["뭐야", "뭐임", "무엇", "어떻게", "설명해", "알려줘", "키워드", "감지", "오탐"], - "ja": ["とは", "って何", "どうやって", "説明して", "キーワード", "検出", "誤検出"], + "ko": [ + "뭐야", + "뭐임", + "무엇", + "어떻게", + "설명해", + "알려줘", + "키워드", + "감지", + "오탐" + ], + "ja": [ + "とは", + "って何", + "どうやって", + "説明して", + "キーワード", + "検出", + "誤検出" + ], "zh": ["是什么", "什么是", "怎么", "解释", "关键词", "检测", "误报"], - "es": ["qué es", "cómo", "explica", "palabra clave", "falso positivo", "detectado"], - "fr": ["c'est quoi", "comment", "explique", "mot-clé", "faux positif", "détecté"], - "de": ["was ist", "wie", "erkläre", "schlüsselwort", "falsch positiv", "erkannt"], - "pt": ["o que é", "como", "explique", "palavra-chave", "falso positivo", "detectado"], - "ru": ["что такое", "как", "объясни", "ключевое слово", "ложное срабатывание", "обнаружено"], - "nl": ["wat is", "hoe", "leg uit", "sleutelwoord", "vals positief", "gedetecteerd"], - "pl": ["co to", "jak", "wyjaśnij", "słowo kluczowe", "fałszywy alarm", "wykryto"] + "es": [ + "qué es", + "cómo", + "explica", + "palabra clave", + "falso positivo", + "detectado" + ], + "fr": [ + "c'est quoi", + "comment", + "explique", + "mot-clé", + "faux positif", + "détecté" + ], + "de": [ + "was ist", + "wie", + "erkläre", + "schlüsselwort", + "falsch positiv", + 
"erkannt" + ], + "pt": [ + "o que é", + "como", + "explique", + "palavra-chave", + "falso positivo", + "detectado" + ], + "ru": [ + "что такое", + "как", + "объясни", + "ключевое слово", + "ложное срабатывание", + "обнаружено" + ], + "nl": [ + "wat is", + "hoe", + "leg uit", + "sleutelwoord", + "vals positief", + "gedetecteerd" + ], + "pl": [ + "co to", + "jak", + "wyjaśnij", + "słowo kluczowe", + "fałszywy alarm", + "wykryto" + ] }, "excludedWorkflows": ["tools", "stack-set", "exec-plan"], "cjkScripts": ["ko", "ja", "zh"], "extensionRouting": { - "frontend-engineer": ["tsx", "jsx", "css", "scss", "less", "vue", "svelte", "html"], - "backend-engineer": ["go", "py", "java", "rs", "rb", "php", "controller", "service", "resolver"], + "frontend-engineer": [ + "tsx", + "jsx", + "css", + "scss", + "less", + "vue", + "svelte", + "html" + ], + "backend-engineer": [ + "go", + "py", + "java", + "rs", + "rb", + "php", + "controller", + "service", + "resolver" + ], "db-engineer": ["sql", "prisma", "graphql", "migration"], "mobile-engineer": ["dart", "swift", "kt", "xib", "storyboard"], "designer": ["figma", "sketch", "svg"] diff --git a/.gemini/hooks/types.ts b/.gemini/hooks/types.ts index f9bf420..2b79035 100644 --- a/.gemini/hooks/types.ts +++ b/.gemini/hooks/types.ts @@ -1,8 +1,8 @@ // Claude Code Hook Types for oh-my-agent // Shared across Claude Code, Codex CLI, Cursor, Gemini CLI, and Qwen Code -import { existsSync } from "node:fs" -import { dirname, join } from "node:path" +import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; // --- Project Root Resolution --- @@ -12,52 +12,55 @@ import { dirname, join } from "node:path" * (e.g. packages/i18n during a build) from creating state files * in the wrong location. */ -const MAX_DEPTH = 20 +const MAX_DEPTH = 20; export function resolveGitRoot(startDir: string): string { - let dir = startDir + let dir = startDir; for (let i = 0; i < MAX_DEPTH; i++) { - if (existsSync(join(dir, ".git"))) return dir - const parent = dirname(dir) - if (parent === dir) return startDir - dir = parent + if (existsSync(join(dir, ".git"))) return dir; + const parent = dirname(dir); + if (parent === dir) return startDir; + dir = parent; } - return startDir + return startDir; } // --- Vendor Detection --- -export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen" +export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen"; // --- Hook Input (unified) --- export interface HookInput { - prompt?: string - sessionId?: string - session_id?: string - hook_event_name?: string - cwd?: string - workspace_roots?: string[] + prompt?: string; + sessionId?: string; + session_id?: string; + hook_event_name?: string; + cwd?: string; + workspace_roots?: string[]; // Gemini: AfterAgent fields - prompt_response?: string - stop_hook_active?: boolean + prompt_response?: string; + stop_hook_active?: boolean; // Claude/Qwen: Stop fields - stopReason?: string + stopReason?: string; } // --- Hook Output Builders --- -export function makePromptOutput(vendor: Vendor, additionalContext: string): string { +export function makePromptOutput( + vendor: Vendor, + additionalContext: string, +): string { switch (vendor) { case "claude": - return JSON.stringify({ additionalContext }) + return JSON.stringify({ additionalContext }); case "codex": return JSON.stringify({ hookSpecificOutput: { hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "cursor": return JSON.stringify({ additionalContext, @@ -66,14 +69,14 @@ export function 
makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "gemini": return JSON.stringify({ hookSpecificOutput: { hookEventName: "BeforeAgent", additionalContext, }, - }) + }); case "qwen": // Qwen Code fork uses hookSpecificOutput (same as Codex) return JSON.stringify({ @@ -81,7 +84,7 @@ export function makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); } } @@ -91,22 +94,25 @@ export function makeBlockOutput(vendor: Vendor, reason: string): string { case "codex": case "cursor": case "qwen": - return JSON.stringify({ decision: "block", reason }) + return JSON.stringify({ decision: "block", reason }); case "gemini": // Gemini AfterAgent uses "deny" to reject response and force retry - return JSON.stringify({ decision: "deny", reason }) + return JSON.stringify({ decision: "deny", reason }); } } // --- PreToolUse Output Builder --- -export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, unknown>): string { +export function makePreToolOutput( + vendor: Vendor, + updatedInput: Record<string, unknown>, +): string { switch (vendor) { case "gemini": return JSON.stringify({ decision: "rewrite", tool_input: updatedInput, - }) + }); case "cursor": return JSON.stringify({ updated_input: updatedInput, @@ -114,7 +120,7 @@ export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, u hookEventName: "PreToolUse", updatedInput, }, - }) + }); case "claude": case "codex": case "qwen": @@ -123,15 +129,15 @@ export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, u hookEventName: "PreToolUse", updatedInput, }, - }) + }); } } // --- Shared Types --- export interface ModeState { - workflow: string - sessionId: string - activatedAt: string - reinforcementCount: number + workflow: string; + sessionId: string; + activatedAt: string; + reinforcementCount: number; } diff --git a/.gemini/settings.json b/.gemini/settings.json index d4ce113..59ee3bf 100644 --- a/.gemini/settings.json +++ b/.gemini/settings.json @@ -45,5 +45,16 @@ "matcher": "*" } ] + }, + "general": { + "enableNotifications": true + }, + "experimental": { + "enableAgents": true + }, + "mcpServers": { + "serena": { + "url": "http://localhost:12341/mcp" + } } } diff --git a/.qwen/hooks/hud.ts b/.qwen/hooks/hud.ts index 597b95c..73f0ac4 100644 --- a/.qwen/hooks/hud.ts +++ b/.qwen/hooks/hud.ts @@ -9,163 +9,166 @@ * stdout: ANSI-colored status text */ -import { existsSync, readdirSync, readFileSync } from "node:fs" -import { join } from "node:path" -import type { ModeState } from "./types.ts" +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import type { ModeState } from "./types.ts"; // ── ANSI Colors ─────────────────────────────────────────────── -const dim = (s: string) => `\x1b[2m${s}\x1b[22m` -const bold = (s: string) => `\x1b[1m${s}\x1b[22m` -const green = (s: string) => `\x1b[32m${s}\x1b[39m` -const yellow = (s: string) => `\x1b[33m${s}\x1b[39m` -const red = (s: string) => `\x1b[31m${s}\x1b[39m` -const cyan = (s: string) => `\x1b[36m${s}\x1b[39m` +const dim = (s: string) => `\x1b[2m${s}\x1b[22m`; +const bold = (s: string) => `\x1b[1m${s}\x1b[22m`; +const green = (s: string) => `\x1b[32m${s}\x1b[39m`; +const yellow = (s: string) => `\x1b[33m${s}\x1b[39m`; +const red = (s: string) => `\x1b[31m${s}\x1b[39m`; +const cyan = (s: string) => `\x1b[36m${s}\x1b[39m`; function 
colorByThreshold(value: number, text: string): string { - if (value >= 85) return red(text) - if (value >= 70) return yellow(text) - return green(text) + if (value >= 85) return red(text); + if (value >= 70) return yellow(text); + return green(text); } // ── Stdin Parsing ───────────────────────────────────────────── interface RateLimit { - used_percentage?: number - resets_at?: string + used_percentage?: number; + resets_at?: string; } interface StatuslineStdin { - cwd?: string - model?: { id?: string; display_name?: string } + cwd?: string; + model?: { id?: string; display_name?: string }; context_window?: { - context_window_size?: number - used_percentage?: number - } + context_window_size?: number; + used_percentage?: number; + }; cost?: { - total_cost_usd?: number - total_lines_added?: number - total_lines_removed?: number - total_duration_ms?: number - } + total_cost_usd?: number; + total_lines_added?: number; + total_lines_removed?: number; + total_duration_ms?: number; + }; rate_limits?: { - five_hour?: RateLimit - seven_day?: RateLimit - } + five_hour?: RateLimit; + seven_day?: RateLimit; + }; } function readStdin(): StatuslineStdin { try { - return JSON.parse(readFileSync("/dev/stdin", "utf-8")) + return JSON.parse(readFileSync("/dev/stdin", "utf-8")); } catch { - return {} + return {}; } } // ── Active Workflow Detection ───────────────────────────────── function getActiveWorkflow(projectDir: string): ModeState | null { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return null + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return null; try { for (const file of readdirSync(stateDir)) { - if (!file.endsWith(".json") || !file.includes("-state-")) continue - const content = readFileSync(join(stateDir, file), "utf-8") - const state: ModeState = JSON.parse(content) + if (!file.endsWith(".json") || !file.includes("-state-")) continue; + const content = readFileSync(join(stateDir, file), "utf-8"); + const state: ModeState = JSON.parse(content); // Skip stale (>2h) - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - if (elapsed > 2 * 60 * 60 * 1000) continue + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + if (elapsed > 2 * 60 * 60 * 1000) continue; - return state + return state; } } catch { // ignore } - return null + return null; } // ── Model Name Shortener ────────────────────────────────────── function shortModel(model?: { id?: string; display_name?: string }): string { - const name = model?.display_name || model?.id || "" - if (!name) return "" + const name = model?.display_name || model?.id || ""; + if (!name) return ""; // "Claude Opus 4.6 (1M context)" → "Opus 4.6" - const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i) - if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}` - return name.split("/").pop()?.slice(0, 15) || "" + const match = name.match(/(Opus|Sonnet|Haiku)[\s.]*([\d.]*)/i); + if (match) return `${match[1]}${match[2] ? ` ${match[2]}` : ""}`; + return name.split("/").pop()?.slice(0, 15) || ""; } // ── Rate Limit Helpers ─────────────────────────────────────── function formatCountdown(resetsAt: string): string { - const remaining = new Date(resetsAt).getTime() - Date.now() - if (remaining <= 0) return "" - const h = Math.floor(remaining / 3_600_000) - const m = Math.floor((remaining % 3_600_000) / 60_000) - return h > 0 ? 
`${h}h${m}m` : `${m}m` + const remaining = new Date(resetsAt).getTime() - Date.now(); + if (remaining <= 0) return ""; + const h = Math.floor(remaining / 3_600_000); + const m = Math.floor((remaining % 3_600_000) / 60_000); + return h > 0 ? `${h}h${m}m` : `${m}m`; } function formatRateLimit(label: string, rl?: RateLimit): string | null { - if (!rl || rl.used_percentage == null) return null - const pct = Math.round(rl.used_percentage) - const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : "" - const text = countdown ? `${label}:${pct}%(${countdown})` : `${label}:${pct}%` - return colorByThreshold(pct, text) + if (!rl || rl.used_percentage == null) return null; + const pct = Math.round(rl.used_percentage); + const countdown = rl.resets_at ? formatCountdown(rl.resets_at) : ""; + const text = countdown + ? `${label}:${pct}%(${countdown})` + : `${label}:${pct}%`; + return colorByThreshold(pct, text); } // ── Main ────────────────────────────────────────────────────── function main() { - const input = readStdin() - const projectDir = process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd() - const parts: string[] = [] + const input = readStdin(); + const projectDir = + process.env.CLAUDE_PROJECT_DIR || input.cwd || process.cwd(); + const parts: string[] = []; // 1. OMA label - parts.push(bold(cyan("[OMA]"))) + parts.push(bold(cyan("[OMA]"))); // 2. Model - const model = shortModel(input.model) - if (model) parts.push(dim(model)) + const model = shortModel(input.model); + if (model) parts.push(dim(model)); // 3. Context % - const ctxPct = input.context_window?.used_percentage + const ctxPct = input.context_window?.used_percentage; if (ctxPct != null) { - parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)) + parts.push(colorByThreshold(ctxPct, `ctx:${Math.round(ctxPct)}%`)); } // 4. Session cost - const cost = input.cost?.total_cost_usd + const cost = input.cost?.total_cost_usd; if (cost != null && cost > 0) { - parts.push(dim(`$${cost.toFixed(2)}`)) + parts.push(dim(`$${cost.toFixed(2)}`)); } // 5. Rate limits (5h / 7d) - const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour) - const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day) + const rl5 = formatRateLimit("5h", input.rate_limits?.five_hour); + const rl7 = formatRateLimit("7d", input.rate_limits?.seven_day); if (rl5 || rl7) { - parts.push([rl5, rl7].filter(Boolean).join(dim(" "))) + parts.push([rl5, rl7].filter(Boolean).join(dim(" "))); } // 6. Lines changed - const added = input.cost?.total_lines_added - const removed = input.cost?.total_lines_removed + const added = input.cost?.total_lines_added; + const removed = input.cost?.total_lines_removed; if (added || removed) { - const diffParts: string[] = [] - if (added) diffParts.push(green(`+${added}`)) - if (removed) diffParts.push(red(`-${removed}`)) - parts.push(diffParts.join(dim("/"))) + const diffParts: string[] = []; + if (added) diffParts.push(green(`+${added}`)); + if (removed) diffParts.push(red(`-${removed}`)); + parts.push(diffParts.join(dim("/"))); } // 7. 
Active workflow - const workflow = getActiveWorkflow(projectDir) + const workflow = getActiveWorkflow(projectDir); if (workflow) { - const label = `${workflow.workflow}:${workflow.reinforcementCount}` - parts.push(yellow(label)) + const label = `${workflow.workflow}:${workflow.reinforcementCount}`; + parts.push(yellow(label)); } - process.stdout.write(parts.join(dim(" │ "))) + process.stdout.write(parts.join(dim(" │ "))); } -main() +main(); diff --git a/.qwen/hooks/keyword-detector.ts b/.qwen/hooks/keyword-detector.ts index 0ce0d0e..e838a0a 100644 --- a/.qwen/hooks/keyword-detector.ts +++ b/.qwen/hooks/keyword-detector.ts @@ -12,59 +12,205 @@ * exit 0 = always (allow) */ -import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { type ModeState, makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { + type ModeState, + makePromptOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +// ── Guard 1: UserPromptSubmit-only trigger ──────────────────── +// Hook event names that represent genuine user input (not agent responses) +const VALID_USER_EVENTS = new Set([ + "UserPromptSubmit", + "beforeSubmitPrompt", // Cursor + "BeforeAgent", // Gemini (fires before agent processes user prompt) +]); + +/** + * Returns true if the hook input indicates this is a genuine user prompt, + * not an agent-generated response. Prevents re-trigger loops. + */ +export function isGenuineUserPrompt(input: Record<string, unknown>): boolean { + const event = input.hook_event_name as string | undefined; + // If event is explicitly provided, validate it + if (event !== undefined) { + return VALID_USER_EVENTS.has(event); + } + // No event field — assume genuine (backward compat with vendors that omit it) + return true; +} + +// ── Guard 3: Reinforcement suppression ─────────────────────── + +const REINFORCEMENT_WINDOW_MS = 60_000; // 60 seconds +const REINFORCEMENT_MAX_COUNT = 2; // allow up to 2, suppress 3rd+ + +export interface KeywordDetectorState { + triggers: Record< + string, + { + lastTriggeredAt: string; // ISO timestamp + count: number; + } + >; +} + +function getKwStateFilePath(projectDir: string): string { + const dir = join(projectDir, ".agents", "state"); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + return join(dir, "keyword-detector-state.json"); +} + +/** + * Load the keyword-detector reinforcement state from disk. + * Resets gracefully if the file is missing or corrupt. + */ +export function loadKwState(projectDir: string): KeywordDetectorState { + const filePath = getKwStateFilePath(projectDir); + if (!existsSync(filePath)) return { triggers: {} }; + try { + const raw = readFileSync(filePath, "utf-8"); + const parsed = JSON.parse(raw) as unknown; + if ( + typeof parsed === "object" && + parsed !== null && + "triggers" in parsed && + typeof (parsed as Record<string, unknown>).triggers === "object" + ) { + return parsed as KeywordDetectorState; + } + return { triggers: {} }; + } catch { + // Corrupt file — reset + return { triggers: {} }; + } +} + +/** + * Save reinforcement state to disk. 
+ */
+export function saveKwState(
+  projectDir: string,
+  state: KeywordDetectorState,
+): void {
+  try {
+    const filePath = getKwStateFilePath(projectDir);
+    writeFileSync(filePath, JSON.stringify(state, null, 2));
+  } catch {
+    // Non-fatal — reinforcement suppression is best-effort
+  }
+}
+
+/**
+ * Returns true if the keyword should be suppressed due to a reinforcement loop.
+ * A keyword is suppressed if it was triggered >= REINFORCEMENT_MAX_COUNT times
+ * within the last REINFORCEMENT_WINDOW_MS milliseconds.
+ */
+export function isReinforcementSuppressed(
+  state: KeywordDetectorState,
+  keyword: string,
+  nowMs?: number,
+): boolean {
+  const now = nowMs ?? Date.now();
+  const entry = state.triggers[keyword];
+  if (!entry) return false;
+  const lastMs = new Date(entry.lastTriggeredAt).getTime();
+  if (Number.isNaN(lastMs)) return false;
+  const withinWindow = now - lastMs < REINFORCEMENT_WINDOW_MS;
+  return withinWindow && entry.count >= REINFORCEMENT_MAX_COUNT;
+}
+
+/**
+ * Record a keyword trigger in the reinforcement state.
+ * Resets the count if the previous trigger was outside the window.
+ */
+export function recordKwTrigger(
+  state: KeywordDetectorState,
+  keyword: string,
+  nowMs?: number,
+): KeywordDetectorState {
+  const now = nowMs ?? Date.now();
+  const entry = state.triggers[keyword];
+  let count = 1;
+  if (entry) {
+    const lastMs = new Date(entry.lastTriggeredAt).getTime();
+    const withinWindow =
+      !Number.isNaN(lastMs) && now - lastMs < REINFORCEMENT_WINDOW_MS;
+    count = withinWindow ? entry.count + 1 : 1;
+  }
+  return {
+    ...state,
+    triggers: {
+      ...state.triggers,
+      [keyword]: {
+        lastTriggeredAt: new Date(now).toISOString(),
+        count,
+      },
+    },
+  };
+}

 // ── Vendor Detection ──────────────────────────────────────────

 function inferVendorFromScriptPath(): Vendor | null {
-  const path = import.meta.path
-  if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"
-  if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"
-  if (path.includes(`${join(".claude", "hooks")}`)) return "claude"
-  if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"
-  if (path.includes(`${join(".codex", "hooks")}`)) return "codex"
-  return null
+  const path = import.meta.path;
+  if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor";
+  if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen";
+  if (path.includes(`${join(".claude", "hooks")}`)) return "claude";
+  if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini";
+  if (path.includes(`${join(".codex", "hooks")}`)) return "codex";
+  return null;
 }

 function detectVendor(input: Record<string, unknown>): Vendor {
-  const event = input.hook_event_name as string | undefined
-  const byScriptPath = inferVendorFromScriptPath()
-  if (byScriptPath) return byScriptPath
-  if (event === "BeforeAgent") return "gemini"
-  if (event === "beforeSubmitPrompt") return "cursor"
+  const event = input.hook_event_name as string | undefined;
+  const byScriptPath = inferVendorFromScriptPath();
+  if (byScriptPath) return byScriptPath;
+  if (event === "BeforeAgent") return "gemini";
+  if (event === "beforeSubmitPrompt") return "cursor";
   if (event === "UserPromptSubmit") {
     // Codex uses snake_case session_id, Claude uses camelCase sessionId
-    if ("session_id" in input && !("sessionId" in input)) return "codex"
+    if ("session_id" in input && !("sessionId" in input)) return "codex";
   }
   // Qwen Code sets QWEN_PROJECT_DIR; Claude sets CLAUDE_PROJECT_DIR
-  if (process.env.QWEN_PROJECT_DIR) return "qwen"
-  return "claude"
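// ── Illustrative sketch (not part of the commit) ─────────────────────────
// How the three reinforcement helpers above compose under the constants shown
// earlier (60s window, max count 2). Pure state in, state out; the fixed
// clock `t0` and the "ultrawork" key are hypothetical sample values, and the
// helpers are assumed to be in scope.
let kw: KeywordDetectorState = { triggers: {} };
const t0 = Date.parse("2025-01-01T00:00:00Z");
kw = recordKwTrigger(kw, "ultrawork", t0); // count = 1
console.log(isReinforcementSuppressed(kw, "ultrawork", t0 + 1_000)); // false
kw = recordKwTrigger(kw, "ultrawork", t0 + 1_000); // count = 2
console.log(isReinforcementSuppressed(kw, "ultrawork", t0 + 2_000)); // true (3rd trigger suppressed)
kw = recordKwTrigger(kw, "ultrawork", t0 + 61_000); // outside window, count resets to 1
console.log(isReinforcementSuppressed(kw, "ultrawork", t0 + 62_000)); // false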
+ if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── Config Loading ──────────────────────────────────────────── @@ -73,74 +219,83 @@ interface TriggerConfig { workflows: Record< string, { - persistent: boolean - keywords: Record<string, string[]> + persistent: boolean; + keywords: Record<string, string[]>; } - > - informationalPatterns: Record<string, string[]> - excludedWorkflows: string[] - cjkScripts: string[] - extensionRouting?: Record<string, string[]> + >; + informationalPatterns: Record<string, string[]>; + excludedWorkflows: string[]; + cjkScripts: string[]; + extensionRouting?: Record<string, string[]>; } function loadConfig(): TriggerConfig { - const configPath = join(dirname(import.meta.path), "triggers.json") - return JSON.parse(readFileSync(configPath, "utf-8")) + const configPath = join(dirname(import.meta.path), "triggers.json"); + return JSON.parse(readFileSync(configPath, "utf-8")); } function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? "en"; } catch { - return "en" + return "en"; } } // ── Pattern Builder ─────────────────────────────────────────── export function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } -export function buildPatterns(keywords: Record<string, string[]>, lang: string, cjkScripts: string[]): RegExp[] { +export function buildPatterns( + keywords: Record<string, string[]>, + lang: string, + cjkScripts: string[], +): RegExp[] { const allKeywords = [ ...(keywords["*"] ?? []), ...(keywords.en ?? []), ...(lang !== "en" ? (keywords[lang] ?? 
[]) : []), - ] + ]; return allKeywords.map((kw) => { - const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+") - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) { - return new RegExp(escaped, "i") + const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+"); + if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) { + return new RegExp(escaped, "i"); } - return new RegExp(`\\b${escaped}\\b`, "i") - }) + return new RegExp(`\\b${escaped}\\b`, "i"); + }); } -function buildInformationalPatterns(config: TriggerConfig, lang: string): RegExp[] { - const patterns = [...(config.informationalPatterns.en ?? [])] +function buildInformationalPatterns( + config: TriggerConfig, + lang: string, +): RegExp[] { + const patterns = [...(config.informationalPatterns.en ?? [])]; if (lang !== "en") { - patterns.push(...(config.informationalPatterns[lang] ?? [])) + patterns.push(...(config.informationalPatterns[lang] ?? [])); } return patterns.map((p) => { - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (/[^\x00-\x7F]/.test(p)) return new RegExp(escapeRegex(p), "i") - return new RegExp(`\\b${escapeRegex(p)}\\b`, "i") - }) + if (/[^\p{ASCII}]/u.test(p)) return new RegExp(escapeRegex(p), "i"); + return new RegExp(`\\b${escapeRegex(p)}\\b`, "i"); + }); } // ── Filters ─────────────────────────────────────────────────── -export function isInformationalContext(prompt: string, matchIndex: number, infoPatterns: RegExp[]): boolean { - const windowStart = Math.max(0, matchIndex - 60) - const window = prompt.slice(windowStart, matchIndex + 60) - return infoPatterns.some((p) => p.test(window)) +export function isInformationalContext( + prompt: string, + matchIndex: number, + infoPatterns: RegExp[], +): boolean { + const windowStart = Math.max(0, matchIndex - 60); + const window = prompt.slice(windowStart, matchIndex + 60); + return infoPatterns.some((p) => p.test(window)); } /** @@ -148,12 +303,16 @@ export function isInformationalContext(prompt: string, matchIndex: number, infoP * only match keywords in the first N chars of the user's prompt. * Keywords deep in the prompt are likely from pasted content, not user intent. 
*/ -const PERSISTENT_MATCH_LIMIT = 200 - -export function isPastedContent(matchIndex: number, isPersistent: boolean, promptLength: number): boolean { - if (!isPersistent) return false - if (promptLength <= PERSISTENT_MATCH_LIMIT) return false - return matchIndex > PERSISTENT_MATCH_LIMIT +const PERSISTENT_MATCH_LIMIT = 200; + +export function isPastedContent( + matchIndex: number, + isPersistent: boolean, + promptLength: number, +): boolean { + if (!isPersistent) return false; + if (promptLength <= PERSISTENT_MATCH_LIMIT) return false; + return matchIndex > PERSISTENT_MATCH_LIMIT; } /** @@ -180,11 +339,11 @@ const QUESTION_PATTERNS: RegExp[] = [ /^.*\banything worth\b/i, /^.*\bwhat.*(feature|difference|reference)/i, /^.*\bcompare\b/i, -] +]; export function isAnalyticalQuestion(prompt: string): boolean { - const firstLine = prompt.split("\n")[0].trim() - return QUESTION_PATTERNS.some((p) => p.test(firstLine)) + const firstLine = prompt.split("\n")[0].trim(); + return QUESTION_PATTERNS.some((p) => p.test(firstLine)); } export function stripCodeBlocks(text: string): string { @@ -193,11 +352,11 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") // unclosed fenced blocks (strip to end) .replace(/`{3,}[^`]*`{3,}/g, "") // single-line fenced blocks (```...```) .replace(/`[^`\n]+`/g, "") // inline code (no newlines allowed) - .replace(/"[^"\n]*"/g, "") // quoted strings + .replace(/"[^"\n]*"/g, ""); // quoted strings } export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } // ── Extension Detection ────────────────────────────────────── @@ -228,62 +387,70 @@ const EXCLUDE_EXTS = new Set([ "eot", "map", "d", -]) +]); export function detectExtensions(prompt: string): string[] { - const extPattern = /\.([a-zA-Z]{1,12})\b/g - const extensions = new Set<string>() - let match: RegExpExecArray | null - // biome-ignore lint/suspicious/noAssignInExpressions: standard regex.exec loop pattern - while ((match = extPattern.exec(prompt)) !== null) { - const ext = match[1].toLowerCase() + const extPattern = /\.([a-zA-Z]{1,12})\b/g; + const extensions = new Set<string>(); + for (const match of prompt.matchAll(extPattern)) { + const ext = match[1].toLowerCase(); if (!EXCLUDE_EXTS.has(ext)) { - extensions.add(ext) + extensions.add(ext); } } - return [...extensions] + return [...extensions]; } -export function resolveAgentFromExtensions(extensions: string[], routing: Record<string, string[]>): string | null { - if (extensions.length === 0) return null +export function resolveAgentFromExtensions( + extensions: string[], + routing: Record<string, string[]>, +): string | null { + if (extensions.length === 0) return null; - const scores = new Map<string, number>() + const scores = new Map<string, number>(); for (const ext of extensions) { for (const [agent, agentExts] of Object.entries(routing)) { if (agentExts.includes(ext)) { - scores.set(agent, (scores.get(agent) ?? 0) + 1) + scores.set(agent, (scores.get(agent) ?? 
0) + 1); } } } - if (scores.size === 0) return null + if (scores.size === 0) return null; - let best: string | null = null - let bestScore = 0 + let best: string | null = null; + let bestScore = 0; for (const [agent, score] of scores) { if (score > bestScore) { - bestScore = score - best = agent + bestScore = score; + best = agent; } } - return best + return best; } // ── State Management ────────────────────────────────────────── function getStateDir(projectDir: string): string { - const dir = join(projectDir, ".agents", "state") - if (!existsSync(dir)) mkdirSync(dir, { recursive: true }) - return dir + const dir = join(projectDir, ".agents", "state"); + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); + return dir; } -function activateMode(projectDir: string, workflow: string, sessionId: string): void { +function activateMode( + projectDir: string, + workflow: string, + sessionId: string, +): void { const state: ModeState = { workflow, sessionId, activatedAt: new Date().toISOString(), reinforcementCount: 0, - } - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) + }; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Deactivation Detection ─────────────────────────────────── @@ -300,27 +467,33 @@ export const DEACTIVATION_PHRASES: Record<string, string[]> = { ru: ["воркфлоу завершён", "рабочий процесс завершён"], nl: ["workflow voltooid", "workflow klaar"], pl: ["workflow zakończony", "workflow ukończony"], -} +}; export function isDeactivationRequest(prompt: string, lang: string): boolean { - const phrases = [...(DEACTIVATION_PHRASES.en ?? []), ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? []) : [])] - const lower = prompt.toLowerCase() - return phrases.some((phrase) => lower.includes(phrase.toLowerCase())) + const phrases = [ + ...(DEACTIVATION_PHRASES.en ?? []), + ...(lang !== "en" ? (DEACTIVATION_PHRASES[lang] ?? 
[]) : []), + ]; + const lower = prompt.toLowerCase(); + return phrases.some((phrase) => lower.includes(phrase.toLowerCase())); } -export function deactivateAllPersistentModes(projectDir: string, sessionId?: string): void { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return +export function deactivateAllPersistentModes( + projectDir: string, + sessionId?: string, +): void { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return; try { - const files = readdirSync(stateDir) + const files = readdirSync(stateDir); for (const file of files) { // Match session-scoped state files: {workflow}-state-{sessionId}.json if (sessionId) { if (file.endsWith(`-state-${sessionId}.json`)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } else if (/-state-/.test(file) && file.endsWith(".json")) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { @@ -331,55 +504,69 @@ export function deactivateAllPersistentModes(projectDir: string, sessionId?: str // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? "" + // Guard 1: Only process genuine user prompts — skip agent-generated content + if (!isGenuineUserPrompt(input)) process.exit(0); + + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? 
""; - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); - const config = loadConfig() - const lang = detectLanguage(projectDir) + const config = loadConfig(); + const lang = detectLanguage(projectDir); // Check for deactivation request before workflow detection if (isDeactivationRequest(prompt, lang)) { - deactivateAllPersistentModes(projectDir, sessionId) - process.exit(0) + deactivateAllPersistentModes(projectDir, sessionId); + process.exit(0); } - const infoPatterns = buildInformationalPatterns(config, lang) - const cleaned = stripCodeBlocks(prompt) - const excluded = new Set(config.excludedWorkflows) + const infoPatterns = buildInformationalPatterns(config, lang); + // Guard 2: Strip code blocks and inline code before scanning for keywords + const cleaned = stripCodeBlocks(prompt); + const excluded = new Set(config.excludedWorkflows); + + // Guard 3: Load reinforcement suppression state + const kwState = loadKwState(projectDir); // Skip persistent workflows entirely if the prompt is an analytical question - const analytical = isAnalyticalQuestion(cleaned) + const analytical = isAnalyticalQuestion(cleaned); for (const [workflow, def] of Object.entries(config.workflows)) { - if (excluded.has(workflow)) continue + if (excluded.has(workflow)) continue; // Analytical questions should never trigger persistent workflows - if (analytical && def.persistent) continue + if (analytical && def.persistent) continue; - const patterns = buildPatterns(def.keywords, lang, config.cjkScripts) + const patterns = buildPatterns(def.keywords, lang, config.cjkScripts); for (const pattern of patterns) { - const match = pattern.exec(cleaned) - if (!match) continue - if (isInformationalContext(cleaned, match.index, infoPatterns)) continue + const match = pattern.exec(cleaned); + if (!match) continue; + if (isInformationalContext(cleaned, match.index, infoPatterns)) continue; // Keywords deep in long prompts are likely pasted content, not user intent - if (isPastedContent(match.index, def.persistent, cleaned.length)) continue + if (isPastedContent(match.index, def.persistent, cleaned.length)) + continue; + + // Guard 3: Suppress if same workflow triggered too many times in 60s + if (isReinforcementSuppressed(kwState, workflow)) continue; if (def.persistent) { - activateMode(projectDir, workflow, sessionId) + activateMode(projectDir, workflow, sessionId); } + // Record this trigger for reinforcement tracking + const updatedState = recordKwTrigger(kwState, workflow); + saveKwState(projectDir, updatedState); const contextLines = [ `[OMA WORKFLOW: ${workflow.toUpperCase()}]`, @@ -387,26 +574,29 @@ async function main() { `Read and follow \`.agents/workflows/${workflow}.md\` step by step.`, `User request: ${prompt}`, `IMPORTANT: Start the workflow IMMEDIATELY. 
Do not ask for confirmation.`, - ] + ]; if (config.extensionRouting) { - const extensions = detectExtensions(prompt) - const agent = resolveAgentFromExtensions(extensions, config.extensionRouting) + const extensions = detectExtensions(prompt); + const agent = resolveAgentFromExtensions( + extensions, + config.extensionRouting, + ); if (agent) { - contextLines.push(`[OMA AGENT HINT: ${agent}]`) + contextLines.push(`[OMA AGENT HINT: ${agent}]`); } } - const context = contextLines.join("\n") + const context = contextLines.join("\n"); - process.stdout.write(makePromptOutput(vendor, context)) - process.exit(0) + process.stdout.write(makePromptOutput(vendor, context)); + process.exit(0); } } - process.exit(0) + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.qwen/hooks/persistent-mode.ts b/.qwen/hooks/persistent-mode.ts index 4936f4e..311035a 100644 --- a/.qwen/hooks/persistent-mode.ts +++ b/.qwen/hooks/persistent-mode.ts @@ -13,125 +13,170 @@ * exit 2 = block stop */ -import { existsSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs" -import { dirname, join } from "node:path" -import { isDeactivationRequest } from "./keyword-detector.ts" -import { type ModeState, makeBlockOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_REINFORCEMENTS = 5 -const STALE_HOURS = 2 +import { + existsSync, + readdirSync, + readFileSync, + unlinkSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { isDeactivationRequest } from "./keyword-detector.ts"; +import { + type ModeState, + makeBlockOutput, + resolveGitRoot, + type Vendor, +} from "./types.ts"; + +const MAX_REINFORCEMENTS = 5; +const STALE_HOURS = 2; function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? 
"en"; } catch { - return "en" + return "en"; } } // ── Config Loading ──────────────────────────────────────────── interface TriggerConfig { - workflows: Record<string, { persistent: boolean }> + workflows: Record<string, { persistent: boolean }>; } function loadPersistentWorkflows(): string[] { - const configPath = join(dirname(import.meta.path), "triggers.json") + const configPath = join(dirname(import.meta.path), "triggers.json"); try { - const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")) + const config: TriggerConfig = JSON.parse(readFileSync(configPath, "utf-8")); return Object.entries(config.workflows) .filter(([, def]) => def.persistent) - .map(([name]) => name) + .map(([name]) => name); } catch { - return ["ultrawork", "orchestrate", "work"] + return ["ultrawork", "orchestrate", "work"]; } } // ── Vendor Detection ────────────────────────────────────────── +function inferVendorFromScriptPath(): Vendor | null { + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; +} + function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "AfterAgent") return "gemini" - if (event === "Stop") { - if ("session_id" in input && !("sessionId" in input)) return "codex" - } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "AfterAgent") return "gemini"; + if (event === "Stop" && "session_id" in input) return "codex"; + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── State ───────────────────────────────────────────────────── function getStateDir(projectDir: string): string { - return join(projectDir, ".agents", "state") + return join(projectDir, ".agents", "state"); } -function readModeState(projectDir: string, workflow: string, sessionId: string): ModeState | null { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (!existsSync(path)) return null +function readModeState( + projectDir: string, + workflow: string, + sessionId: string, +): ModeState | 
null { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (!existsSync(path)) return null; try { - return JSON.parse(readFileSync(path, "utf-8")) as ModeState + return JSON.parse(readFileSync(path, "utf-8")) as ModeState; } catch { - return null + return null; } } export function isStale(state: ModeState): boolean { - const elapsed = Date.now() - new Date(state.activatedAt).getTime() - return elapsed > STALE_HOURS * 60 * 60 * 1000 + const elapsed = Date.now() - new Date(state.activatedAt).getTime(); + return elapsed > STALE_HOURS * 60 * 60 * 1000; } -export function deactivate(projectDir: string, workflow: string, sessionId: string): void { - const path = join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`) - if (existsSync(path)) unlinkSync(path) +export function deactivate( + projectDir: string, + workflow: string, + sessionId: string, +): void { + const path = join( + getStateDir(projectDir), + `${workflow}-state-${sessionId}.json`, + ); + if (existsSync(path)) unlinkSync(path); } -function incrementReinforcement(projectDir: string, workflow: string, sessionId: string, state: ModeState): void { - state.reinforcementCount += 1 - writeFileSync(join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), JSON.stringify(state, null, 2)) +function incrementReinforcement( + projectDir: string, + workflow: string, + sessionId: string, + state: ModeState, +): void { + state.reinforcementCount += 1; + writeFileSync( + join(getStateDir(projectDir), `${workflow}-state-${sessionId}.json`), + JSON.stringify(state, null, 2), + ); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const lang = detectLanguage(projectDir) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const lang = detectLanguage(projectDir); // Check all text fields in stdin for deactivation phrases. 
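// A minimal sketch of the check this relies on; isDeactivationRequest is
// imported from keyword-detector.ts, which is where the real phrase lists
// and matching rules live. The shape assumed here is a case-insensitive
// substring match over per-language phrase lists, with English as fallback:
//
//   const DEACTIVATION_PHRASES: Record<string, string[]> = {
//     en: ["workflow done"],
//     ko: ["워크플로우 완료"],
//   };
//   export function isDeactivationRequest(text: string, lang: string): boolean {
//     const t = text.toLowerCase();
//     const phrases = [...(DEACTIVATION_PHRASES[lang] ?? []), ...DEACTIVATION_PHRASES.en];
//     return phrases.some((p) => t.includes(p.toLowerCase()));
//   }
//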
// The assistant may have included "workflow done" in its response, @@ -144,60 +189,60 @@ async function main() { input.transcript, ] .filter((v): v is string => typeof v === "string") - .join(" ") + .join(" "); if (textToCheck && isDeactivationRequest(textToCheck, lang)) { // Deactivate all persistent workflows for this session - const stateDir = join(projectDir, ".agents", "state") + const stateDir = join(projectDir, ".agents", "state"); if (existsSync(stateDir)) { try { - const suffix = `-state-${sessionId}.json` + const suffix = `-state-${sessionId}.json`; for (const file of readdirSync(stateDir)) { if (file.endsWith(suffix)) { - unlinkSync(join(stateDir, file)) + unlinkSync(join(stateDir, file)); } } } catch { /* ignore */ } } - process.exit(0) + process.exit(0); } - const persistentWorkflows = loadPersistentWorkflows() + const persistentWorkflows = loadPersistentWorkflows(); for (const workflow of persistentWorkflows) { - const state = readModeState(projectDir, workflow, sessionId) - if (!state) continue + const state = readModeState(projectDir, workflow, sessionId); + if (!state) continue; if (isStale(state) || state.reinforcementCount >= MAX_REINFORCEMENTS) { - deactivate(projectDir, workflow, sessionId) - continue + deactivate(projectDir, workflow, sessionId); + continue; } - incrementReinforcement(projectDir, workflow, sessionId, state) + incrementReinforcement(projectDir, workflow, sessionId, state); - const stateFile = `.agents/state/${workflow}-state-${sessionId}.json` + const stateFile = `.agents/state/${workflow}-state-${sessionId}.json`; const reason = [ `[OMA PERSISTENT MODE: ${workflow.toUpperCase()}]`, `The /${workflow} workflow is still active (reinforcement ${state.reinforcementCount}/${MAX_REINFORCEMENTS}).`, `Continue executing the workflow. If all tasks are genuinely complete:`, ` 1. Delete the state file: Bash \`rm ${stateFile}\``, ` 2. Or ask the user to say "워크플로우 완료" / "workflow done"`, - ].join("\n") + ].join("\n"); - writeBlockAndExit(vendor, reason) + writeBlockAndExit(vendor, reason); } - process.exit(0) + process.exit(0); } export function writeBlockAndExit(vendor: Vendor, reason: string): never { - process.stderr.write(reason) - process.stdout.write(makeBlockOutput(vendor, reason)) - process.exit(2) + process.stderr.write(reason); + process.stdout.write(makeBlockOutput(vendor, reason)); + process.exit(2); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } diff --git a/.qwen/hooks/skill-injector.ts b/.qwen/hooks/skill-injector.ts index beda327..9ccce70 100644 --- a/.qwen/hooks/skill-injector.ts +++ b/.qwen/hooks/skill-injector.ts @@ -12,152 +12,163 @@ * persistent workflow is active (those modes own the session context). 
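 *
 * Illustrative I/O, assuming the field names parsed below (the exact output
 * envelope per vendor comes from makePromptOutput in types.ts):
 *   stdin:  {"prompt": "generate image of a cat", "sessionId": "abc", "cwd": "/repo"}
 *   stdout: {"hookSpecificOutput": {"hookEventName": "UserPromptSubmit",
 *            "additionalContext": "[OMA SKILLS DETECTED: oma-image]..."}}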
*/ -import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs" -import { basename, dirname, join } from "node:path" -import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts" - -const MAX_SKILLS = 3 -const SESSION_TTL_MS = 60 * 60 * 1000 -const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"] +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + writeFileSync, +} from "node:fs"; +import { basename, dirname, join } from "node:path"; +import { makePromptOutput, resolveGitRoot, type Vendor } from "./types.ts"; + +const MAX_SKILLS = 3; +const SESSION_TTL_MS = 60 * 60 * 1000; +const DEFAULT_CJK_SCRIPTS = ["ko", "ja", "zh"]; // ── Vendor Detection ────────────────────────────────────────── function inferVendorFromScriptPath(): Vendor | null { - const path = import.meta.path - if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor" - if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen" - if (path.includes(`${join(".claude", "hooks")}`)) return "claude" - if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini" - if (path.includes(`${join(".codex", "hooks")}`)) return "codex" - return null + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; } function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - const byScriptPath = inferVendorFromScriptPath() - if (byScriptPath) return byScriptPath - if (event === "BeforeAgent") return "gemini" - if (event === "beforeSubmitPrompt") return "cursor" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeAgent") return "gemini"; + if (event === "beforeSubmitPrompt") return "cursor"; if (event === "UserPromptSubmit") { - if ("session_id" in input && !("sessionId" in input)) return "codex" + if ("session_id" in input && !("sessionId" in input)) return "codex"; } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": case "cursor": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getSessionId(input: Record<string, unknown>): string { - return (input.sessionId as string) || (input.session_id as string) || "unknown" + return ( + (input.sessionId as string) || (input.session_id as string) || "unknown" + ); } // ── Config Loading ──────────────────────────────────────────── interface 
SkillsTriggerConfig { - skills?: Record<string, { keywords: Record<string, string[]> }> - cjkScripts?: string[] + skills?: Record<string, { keywords: Record<string, string[]> }>; + cjkScripts?: string[]; } function loadTriggersConfig(): SkillsTriggerConfig { - const configPath = join(dirname(import.meta.path), "triggers.json") - if (!existsSync(configPath)) return {} + const configPath = join(dirname(import.meta.path), "triggers.json"); + if (!existsSync(configPath)) return {}; try { - return JSON.parse(readFileSync(configPath, "utf-8")) + return JSON.parse(readFileSync(configPath, "utf-8")); } catch { - return {} + return {}; } } function detectLanguage(projectDir: string): string { - const prefsPath = join(projectDir, ".agents", "oma-config.yaml") - if (!existsSync(prefsPath)) return "en" + const prefsPath = join(projectDir, ".agents", "oma-config.yaml"); + if (!existsSync(prefsPath)) return "en"; try { - const content = readFileSync(prefsPath, "utf-8") - const match = content.match(/^language:\s*(\S+)/m) - return match?.[1] ?? "en" + const content = readFileSync(prefsPath, "utf-8"); + const match = content.match(/^language:\s*(\S+)/m); + return match?.[1] ?? "en"; } catch { - return "en" + return "en"; } } // ── Pattern Building ────────────────────────────────────────── export function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } -export function buildTriggerPatterns(triggers: string[], lang: string, cjkScripts: string[]): RegExp[] { +export function buildTriggerPatterns( + triggers: string[], + lang: string, + cjkScripts: string[], +): RegExp[] { return triggers.map((kw) => { - const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+") - // biome-ignore lint/suspicious/noControlCharactersInRegex: ASCII range check for non-ASCII keywords - if (cjkScripts.includes(lang) || /[^\x00-\x7F]/.test(kw)) { - return new RegExp(escaped, "i") + const escaped = escapeRegex(kw).replace(/\s+/g, "\\s+"); + if (cjkScripts.includes(lang) || /[^\p{ASCII}]/u.test(kw)) { + return new RegExp(escaped, "i"); } - return new RegExp(`\\b${escaped}\\b`, "i") - }) + return new RegExp(`\\b${escaped}\\b`, "i"); + }); } // ── Skill Discovery ─────────────────────────────────────────── export interface SkillEntry { - name: string - absolutePath: string - relPath: string + name: string; + absolutePath: string; + relPath: string; } export function discoverSkills(projectDir: string): SkillEntry[] { - const skillsDir = join(projectDir, ".agents", "skills") - if (!existsSync(skillsDir)) return [] + const skillsDir = join(projectDir, ".agents", "skills"); + if (!existsSync(skillsDir)) return []; - const out: SkillEntry[] = [] - let entries: ReturnType<typeof readdirSync> + const out: SkillEntry[] = []; + let entries: ReturnType<typeof readdirSync>; try { - entries = readdirSync(skillsDir, { withFileTypes: true }) + entries = readdirSync(skillsDir, { withFileTypes: true }); } catch { - return out + return out; } for (const entry of entries) { - if (!entry.isDirectory()) continue - if (entry.name.startsWith("_")) continue + if (!entry.isDirectory()) continue; + if (entry.name.startsWith("_")) continue; - const skillPath = join(skillsDir, entry.name, "SKILL.md") - if (!existsSync(skillPath)) continue + const skillPath = join(skillsDir, entry.name, "SKILL.md"); + if (!existsSync(skillPath)) continue; out.push({ name: entry.name, absolutePath: skillPath, relPath: join(".agents", "skills", entry.name, "SKILL.md"), - }) + }); } - return out + 
return out; } // ── Matching ────────────────────────────────────────────────── export interface SkillMatch { - name: string - relPath: string - score: number - matchedTriggers: string[] + name: string; + relPath: string; + score: number; + matchedTriggers: string[]; } export function matchSkills( @@ -166,37 +177,37 @@ export function matchSkills( skills: SkillEntry[], config: SkillsTriggerConfig, ): SkillMatch[] { - const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS - const matches: SkillMatch[] = [] + const cjkScripts = config.cjkScripts ?? DEFAULT_CJK_SCRIPTS; + const matches: SkillMatch[] = []; for (const skill of skills) { - const jsonEntry = config.skills?.[skill.name] - if (!jsonEntry) continue + const jsonEntry = config.skills?.[skill.name]; + if (!jsonEntry) continue; const jsonTriggers = [ ...(jsonEntry.keywords["*"] ?? []), ...(jsonEntry.keywords.en ?? []), ...(lang !== "en" ? (jsonEntry.keywords[lang] ?? []) : []), - ] + ]; - const seen = new Set<string>() - const allTriggers: string[] = [] + const seen = new Set<string>(); + const allTriggers: string[] = []; for (const t of jsonTriggers) { - const key = t.toLowerCase() - if (seen.has(key)) continue - seen.add(key) - allTriggers.push(t) + const key = t.toLowerCase(); + if (seen.has(key)) continue; + seen.add(key); + allTriggers.push(t); } - if (allTriggers.length === 0) continue + if (allTriggers.length === 0) continue; - const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts) - const matched: string[] = [] - let score = 0 + const patterns = buildTriggerPatterns(allTriggers, lang, cjkScripts); + const matched: string[] = []; + let score = 0; for (let i = 0; i < patterns.length; i++) { if (patterns[i].test(prompt)) { - matched.push(allTriggers[i]) - score += 10 + matched.push(allTriggers[i]); + score += 10; } } @@ -206,43 +217,45 @@ export function matchSkills( relPath: skill.relPath, score, matchedTriggers: matched, - }) + }); } } - matches.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.name.localeCompare(b.name))) - return matches.slice(0, MAX_SKILLS) + matches.sort((a, b) => + b.score !== a.score ? 
b.score - a.score : a.name.localeCompare(b.name), + ); + return matches.slice(0, MAX_SKILLS); } // ── Session Dedup State ─────────────────────────────────────── interface SessionState { - sessions: Record<string, { injected: string[]; timestamp: number }> + sessions: Record<string, { injected: string[]; timestamp: number }>; } function getStatePath(projectDir: string): string { - return join(projectDir, ".agents", "state", "skill-sessions.json") + return join(projectDir, ".agents", "state", "skill-sessions.json"); } function readState(projectDir: string): SessionState { - const p = getStatePath(projectDir) - if (!existsSync(p)) return { sessions: {} } + const p = getStatePath(projectDir); + if (!existsSync(p)) return { sessions: {} }; try { - const parsed = JSON.parse(readFileSync(p, "utf-8")) + const parsed = JSON.parse(readFileSync(p, "utf-8")); if (parsed && typeof parsed === "object" && parsed.sessions) { - return parsed as SessionState + return parsed as SessionState; } } catch { // corrupted — reset } - return { sessions: {} } + return { sessions: {} }; } function writeState(projectDir: string, state: SessionState): void { - const p = getStatePath(projectDir) + const p = getStatePath(projectDir); try { - mkdirSync(dirname(p), { recursive: true }) - writeFileSync(p, JSON.stringify(state, null, 2)) + mkdirSync(dirname(p), { recursive: true }); + writeFileSync(p, JSON.stringify(state, null, 2)); } catch { // dedup failing open is acceptable } @@ -254,47 +267,57 @@ export function filterFreshMatches( sessionId: string, now: number = Date.now(), ): { fresh: SkillMatch[]; nextState: SessionState } { - const state = readState(projectDir) + const state = readState(projectDir); for (const [id, sess] of Object.entries(state.sessions)) { if (now - sess.timestamp > SESSION_TTL_MS) { - delete state.sessions[id] + delete state.sessions[id]; } } - const current = state.sessions[sessionId] - const alreadyInjected = new Set(current && now - current.timestamp <= SESSION_TTL_MS ? current.injected : []) + const current = state.sessions[sessionId]; + const alreadyInjected = new Set( + current && now - current.timestamp <= SESSION_TTL_MS + ? current.injected + : [], + ); - const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)) + const fresh = matches.filter((m) => !alreadyInjected.has(m.relPath)); if (fresh.length > 0) { - const existing = state.sessions[sessionId]?.injected ?? [] + const existing = state.sessions[sessionId]?.injected ?? 
[]; state.sessions[sessionId] = { injected: [...new Set([...existing, ...fresh.map((m) => m.relPath)])], timestamp: now, - } + }; } - return { fresh, nextState: state } + return { fresh, nextState: state }; } // ── Workflow Guard ──────────────────────────────────────────── -export function isPersistentWorkflowActive(projectDir: string, sessionId: string): boolean { - const stateDir = join(projectDir, ".agents", "state") - if (!existsSync(stateDir)) return false +export function isPersistentWorkflowActive( + projectDir: string, + sessionId: string, +): boolean { + const stateDir = join(projectDir, ".agents", "state"); + if (!existsSync(stateDir)) return false; try { - const files = readdirSync(stateDir) - return files.some((f) => f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json") + const files = readdirSync(stateDir); + return files.some( + (f) => + f.endsWith(`-state-${sessionId}.json`) && f !== "skill-sessions.json", + ); } catch { - return false + return false; } } // ── Prompt Sanitation ───────────────────────────────────────── export function startsWithSlashCommand(prompt: string): boolean { - return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()) + return /^\/[a-zA-Z][\w-]*/.test(prompt.trim()); } export function stripCodeBlocks(text: string): string { @@ -303,7 +326,7 @@ export function stripCodeBlocks(text: string): string { .replace(/(`{3,})[^\n]*\n[\s\S]*/g, "") .replace(/`{3,}[^`]*`{3,}/g, "") .replace(/`[^`\n]+`/g, "") - .replace(/"[^"\n]*"/g, "") + .replace(/"[^"\n]*"/g, ""); } // ── Context Formatting ──────────────────────────────────────── @@ -313,55 +336,61 @@ export function formatContext(matches: SkillMatch[]): string { `[OMA SKILLS DETECTED: ${matches.map((m) => m.name).join(", ")}]`, "User intent matches the following skills:", "", - ] + ]; for (const m of matches) { - lines.push(`- **${m.name}** — \`${m.relPath}\``) - lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`) + lines.push(`- **${m.name}** — \`${m.relPath}\``); + lines.push(` Matched triggers: ${m.matchedTriggers.join(", ")}`); } - lines.push("") - lines.push("Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.") - return lines.join("\n") + lines.push(""); + lines.push( + "Read the relevant SKILL.md before invoking. These suggestions are advisory — apply judgement.", + ); + return lines.join("\n"); } // ── Main ────────────────────────────────────────────────────── async function main() { - const raw = readFileSync("/dev/stdin", "utf-8") - let input: Record<string, unknown> + const raw = readFileSync("/dev/stdin", "utf-8"); + let input: Record<string, unknown>; try { - input = JSON.parse(raw) + input = JSON.parse(raw); } catch { - process.exit(0) + process.exit(0); } - const vendor = detectVendor(input) - const projectDir = getProjectDir(vendor, input) - const sessionId = getSessionId(input) - const prompt = (input.prompt as string) ?? 
"" - - if (!prompt.trim()) process.exit(0) - if (startsWithSlashCommand(prompt)) process.exit(0) - if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0) - - const lang = detectLanguage(projectDir) - const config = loadTriggersConfig() - const cleaned = stripCodeBlocks(prompt) - const skills = discoverSkills(projectDir) - - const matches = matchSkills(cleaned, lang, skills, config) - if (matches.length === 0) process.exit(0) - - const { fresh, nextState } = filterFreshMatches(matches, projectDir, sessionId) - if (fresh.length === 0) process.exit(0) - - writeState(projectDir, nextState) - process.stdout.write(makePromptOutput(vendor, formatContext(fresh))) - process.exit(0) + const vendor = detectVendor(input); + const projectDir = getProjectDir(vendor, input); + const sessionId = getSessionId(input); + const prompt = (input.prompt as string) ?? ""; + + if (!prompt.trim()) process.exit(0); + if (startsWithSlashCommand(prompt)) process.exit(0); + if (isPersistentWorkflowActive(projectDir, sessionId)) process.exit(0); + + const lang = detectLanguage(projectDir); + const config = loadTriggersConfig(); + const cleaned = stripCodeBlocks(prompt); + const skills = discoverSkills(projectDir); + + const matches = matchSkills(cleaned, lang, skills, config); + if (matches.length === 0) process.exit(0); + + const { fresh, nextState } = filterFreshMatches( + matches, + projectDir, + sessionId, + ); + if (fresh.length === 0) process.exit(0); + + writeState(projectDir, nextState); + process.stdout.write(makePromptOutput(vendor, formatContext(fresh))); + process.exit(0); } if (import.meta.main) { - main().catch(() => process.exit(0)) + main().catch(() => process.exit(0)); } // Avoid unused-import lint for basename when testing subsets of this module. 
-void basename +void basename; diff --git a/.qwen/hooks/test-filter.ts b/.qwen/hooks/test-filter.ts index a0ce2fc..a3ad992 100644 --- a/.qwen/hooks/test-filter.ts +++ b/.qwen/hooks/test-filter.ts @@ -1,51 +1,61 @@ // PreToolUse hook — Filter test output to show only failures // Works with: Claude Code, Codex CLI, Gemini CLI, Qwen Code -import { existsSync } from "node:fs" -import { join } from "node:path" -import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts" +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { makePreToolOutput, resolveGitRoot, type Vendor } from "./types.ts"; // --- Vendor detection (same logic as keyword-detector.ts) --- +function inferVendorFromScriptPath(): Vendor | null { + const path = import.meta.path; + if (path.includes(`${join(".cursor", "hooks")}`)) return "cursor"; + if (path.includes(`${join(".qwen", "hooks")}`)) return "qwen"; + if (path.includes(`${join(".claude", "hooks")}`)) return "claude"; + if (path.includes(`${join(".gemini", "hooks")}`)) return "gemini"; + if (path.includes(`${join(".codex", "hooks")}`)) return "codex"; + return null; +} + function detectVendor(input: Record<string, unknown>): Vendor { - const event = input.hook_event_name as string | undefined - if (event === "BeforeTool") return "gemini" - if (event === "PreToolUse") { - if ("session_id" in input && !("sessionId" in input)) return "codex" - } - if (process.env.QWEN_PROJECT_DIR) return "qwen" - return "claude" + const event = input.hook_event_name as string | undefined; + const byScriptPath = inferVendorFromScriptPath(); + if (byScriptPath) return byScriptPath; + if (event === "BeforeTool") return "gemini"; + if (event === "PreToolUse" && "session_id" in input) return "codex"; + if (process.env.QWEN_PROJECT_DIR) return "qwen"; + return "claude"; } function getProjectDir(vendor: Vendor, input: Record<string, unknown>): string { - let dir: string + let dir: string; switch (vendor) { case "codex": - dir = (input.cwd as string) || process.cwd() - break + dir = (input.cwd as string) || process.cwd(); + break; case "gemini": - dir = process.env.GEMINI_PROJECT_DIR || process.cwd() - break + dir = process.env.GEMINI_PROJECT_DIR || process.cwd(); + break; case "qwen": - dir = process.env.QWEN_PROJECT_DIR || process.cwd() - break + dir = process.env.QWEN_PROJECT_DIR || process.cwd(); + break; default: - dir = process.env.CLAUDE_PROJECT_DIR || process.cwd() - break + dir = process.env.CLAUDE_PROJECT_DIR || process.cwd(); + break; } - return resolveGitRoot(dir) + return resolveGitRoot(dir); } function getHookDir(vendor: Vendor): string { switch (vendor) { case "codex": - return ".codex/hooks" + return ".codex/hooks"; case "gemini": - return ".gemini/hooks" + return ".gemini/hooks"; case "qwen": - return ".qwen/hooks" + return ".qwen/hooks"; default: - return ".claude/hooks" + return ".claude/hooks"; } } @@ -78,66 +88,70 @@ const TEST_PATTERNS = [ /\brspec\b/, /\bmix\s+test\b/, /\bphpunit\b/, -] +]; // Commands that mention test runners but aren't running tests const EXCLUDE_PATTERNS = [ /\b(install|add|remove|uninstall|init)\b/, /\b(cat|head|tail|less|more|wc)\b.*\.(test|spec)\./, -] +]; // --- Hook input --- interface PreToolUseInput { - tool_name: string + tool_name: string; tool_input: { - command?: string - [key: string]: unknown - } - hook_event_name?: string - session_id?: string - sessionId?: string - cwd?: string + command?: string; + [key: string]: unknown; + }; + hook_event_name?: string; + session_id?: string; + sessionId?: string; 
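  // Note on the two id fields above: Codex sends snake_case session_id while
  // the Claude family sends camelCase sessionId; detectVendor treats the
  // presence of session_id on a PreToolUse event as the Codex signal. cwd is
  // only consumed for Codex in getProjectDir; the other vendors resolve the
  // project dir from their *_PROJECT_DIR env vars instead.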
+ cwd?: string; } // --- Main --- -const raw = await Bun.stdin.text() -if (!raw.trim()) process.exit(0) +const raw = await Bun.stdin.text(); +if (!raw.trim()) process.exit(0); -const input: PreToolUseInput = JSON.parse(raw) +const input: PreToolUseInput = JSON.parse(raw); // Gemini uses run_shell_command; Claude-family uses Bash. if (input.tool_name !== "Bash" && input.tool_name !== "run_shell_command") { - process.exit(0) + process.exit(0); } -const command = input.tool_input?.command -if (!command) process.exit(0) +const command = input.tool_input?.command; +if (!command) process.exit(0); // Check if this is a test command -const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)) -if (!isTestCommand) process.exit(0) +const isTestCommand = TEST_PATTERNS.some((p) => p.test(command)); +if (!isTestCommand) process.exit(0); // Skip if it's a non-test use of test tool names (install, cat, etc.) -const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)) -if (isExcluded) process.exit(0) +const isExcluded = EXCLUDE_PATTERNS.some((p) => p.test(command)); +if (isExcluded) process.exit(0); // Detect vendor and resolve project dir -const vendor = detectVendor(input) -const projectDir = getProjectDir(vendor, input) -const filterScript = join(projectDir, getHookDir(vendor), "filter-test-output.sh") +const vendor = detectVendor(input); +const projectDir = getProjectDir(vendor, input); +const filterScript = join( + projectDir, + getHookDir(vendor), + "filter-test-output.sh", +); // Skip filtering if the script doesn't exist (hooks not fully installed) -if (!existsSync(filterScript)) process.exit(0) +if (!existsSync(filterScript)) process.exit(0); // Rewrite command to pipe through filter -const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"` +const filteredCmd = `set -o pipefail; (${command}) 2>&1 | bash "${filterScript}"`; // Return updated input with all original fields preserved const updatedInput: Record<string, unknown> = { ...input.tool_input, command: filteredCmd, -} +}; -console.log(makePreToolOutput(vendor, updatedInput)) +console.log(makePreToolOutput(vendor, updatedInput)); diff --git a/.qwen/hooks/triggers.json b/.qwen/hooks/triggers.json index f404583..0a1513f 100644 --- a/.qwen/hooks/triggers.json +++ b/.qwen/hooks/triggers.json @@ -43,9 +43,35 @@ "全部お願い", "まとめてやって" ], - "zh": ["编排", "并行执行", "自动执行", "全部执行", "全部做", "自动处理", "一起做", "全做了", "帮我全做"], - "es": ["orquestar", "paralelo", "ejecutar todo", "hazlo todo", "ejecuta todo", "automatiza", "haz todo"], - "fr": ["orchestrer", "parallèle", "tout exécuter", "fais tout", "exécute tout", "automatise", "gère tout"], + "zh": [ + "编排", + "并行执行", + "自动执行", + "全部执行", + "全部做", + "自动处理", + "一起做", + "全做了", + "帮我全做" + ], + "es": [ + "orquestar", + "paralelo", + "ejecutar todo", + "hazlo todo", + "ejecuta todo", + "automatiza", + "haz todo" + ], + "fr": [ + "orchestrer", + "parallèle", + "tout exécuter", + "fais tout", + "exécute tout", + "automatise", + "gère tout" + ], "de": [ "orchestrieren", "parallel", @@ -55,7 +81,15 @@ "automatisieren", "alles auf einmal" ], - "pt": ["orquestrar", "paralelo", "executar tudo", "faça tudo", "execute tudo", "automatize", "resolva tudo"], + "pt": [ + "orquestrar", + "paralelo", + "executar tudo", + "faça tudo", + "execute tudo", + "automatize", + "resolva tudo" + ], "ru": [ "оркестровать", "параллельно", @@ -128,7 +162,16 @@ "トレードオフ", "品質特性" ], - "zh": ["架构", "系统设计", "软件设计", "架构评审", "模块边界", "服务边界", "权衡分析", "质量属性"], + "zh": [ + "架构", + "系统设计", + "软件设计", + "架构评审", + "模块边界", + 
"服务边界", + "权衡分析", + "质量属性" + ], "es": [ "arquitectura", "diseño de sistemas", @@ -205,7 +248,15 @@ "persistent": false, "keywords": { "*": ["task breakdown"], - "en": ["plan", "make a plan", "create a plan", "break down", "analyze requirements", "plan this", "decompose"], + "en": [ + "plan", + "make a plan", + "create a plan", + "break down", + "analyze requirements", + "plan this", + "decompose" + ], "ko": [ "계획", "요구사항 분석", @@ -235,7 +286,16 @@ "設計して", "プランを作って" ], - "zh": ["计划", "需求分析", "任务分解", "制定计划", "做个计划", "分析一下", "拆分任务", "规划一下"], + "zh": [ + "计划", + "需求分析", + "任务分解", + "制定计划", + "做个计划", + "分析一下", + "拆分任务", + "规划一下" + ], "es": [ "plan", "planificar", @@ -286,7 +346,15 @@ "разбей на задачи", "спланируй" ], - "nl": ["plan", "plannen", "vereistenanalyse", "maak een plan", "analyseer", "splits op", "plan dit"], + "nl": [ + "plan", + "plannen", + "vereistenanalyse", + "maak een plan", + "analyseer", + "splits op", + "plan dit" + ], "pl": [ "plan", "planować", @@ -303,7 +371,15 @@ "persistent": false, "keywords": { "*": ["code review", "security audit", "security review"], - "en": ["review", "review this", "review my code", "check my code", "audit", "inspect", "code check"], + "en": [ + "review", + "review this", + "review my code", + "check my code", + "audit", + "inspect", + "code check" + ], "ko": [ "리뷰", "코드 검토", @@ -330,7 +406,17 @@ "点検して", "コード確認" ], - "zh": ["审查", "代码审查", "安全审计", "审查一下", "检查一下", "看看代码", "检查代码", "代码检查", "安全检查"], + "zh": [ + "审查", + "代码审查", + "安全审计", + "审查一下", + "检查一下", + "看看代码", + "检查代码", + "代码检查", + "安全检查" + ], "es": [ "revisión", "revisar código", @@ -613,7 +699,17 @@ "アイデアちょうだい", "一緒に考えよう" ], - "zh": ["头脑风暴", "创意", "设计探索", "想想", "出主意", "有什么想法", "想个办法", "出点子", "集思广益"], + "zh": [ + "头脑风暴", + "创意", + "设计探索", + "想想", + "出主意", + "有什么想法", + "想个办法", + "出点子", + "集思广益" + ], "es": [ "lluvia de ideas", "idear", @@ -644,7 +740,16 @@ "vorschläge", "lass uns überlegen" ], - "pt": ["brainstorming", "idear", "explorar design", "pense em", "e se", "ideias para", "sugira", "imagine"], + "pt": [ + "brainstorming", + "idear", + "explorar design", + "pense em", + "e se", + "ideias para", + "sugira", + "imagine" + ], "ru": [ "мозговой штурм", "идеи", @@ -681,7 +786,13 @@ "persistent": true, "keywords": { "*": ["work", "step by step"], - "en": ["one by one", "guide me", "walk me through", "manual mode", "one step at a time"], + "en": [ + "one by one", + "guide me", + "walk me through", + "manual mode", + "one step at a time" + ], "ko": [ "단계별", "단계별로", @@ -693,9 +804,32 @@ "차근차근 해줘", "수동으로 해줘" ], - "ja": ["ステップバイステップ", "一歩ずつ", "ガイドして", "手動で", "一つずつ", "順番にやって", "手順を教えて"], - "zh": ["逐步", "一步一步", "指导我", "手动", "一个一个", "按顺序", "带我做"], - "es": ["paso a paso", "guíame", "uno por uno", "modo manual", "de a uno", "llévame paso a paso"], + "ja": [ + "ステップバイステップ", + "一歩ずつ", + "ガイドして", + "手動で", + "一つずつ", + "順番にやって", + "手順を教えて" + ], + "zh": [ + "逐步", + "一步一步", + "指导我", + "手动", + "一个一个", + "按顺序", + "带我做" + ], + "es": [ + "paso a paso", + "guíame", + "uno por uno", + "modo manual", + "de a uno", + "llévame paso a paso" + ], "fr": [ "étape par étape", "guide-moi", @@ -712,8 +846,22 @@ "zeig mir wie", "der reihe nach" ], - "pt": ["passo a passo", "me guie", "um por um", "modo manual", "me acompanhe", "me mostre passo a passo"], - "ru": ["шаг за шагом", "направь меня", "по одному", "ручной режим", "покажи по шагам", "веди меня"], + "pt": [ + "passo a passo", + "me guie", + "um por um", + "modo manual", + "me acompanhe", + "me mostre passo a passo" + ], + "ru": [ + "шаг за шагом", + "направь меня", + "по 
одному", + "ручной режим", + "покажи по шагам", + "веди меня" + ], "nl": [ "stap voor stap", "begeleid me", @@ -736,7 +884,14 @@ "persistent": false, "keywords": { "*": ["deepinit"], - "en": ["init project", "initialize", "setup project", "new project", "scaffold", "bootstrap"], + "en": [ + "init project", + "initialize", + "setup project", + "new project", + "scaffold", + "bootstrap" + ], "ko": [ "프로젝트 초기화", "코드베이스 초기화", @@ -757,7 +912,15 @@ "プロジェクトを作って", "プロジェクト設定" ], - "zh": ["项目初始化", "新项目", "设置项目", "搭建项目", "初始化", "创建项目", "项目配置"], + "zh": [ + "项目初始化", + "新项目", + "设置项目", + "搭建项目", + "初始化", + "创建项目", + "项目配置" + ], "es": [ "inicializar proyecto", "nuevo proyecto", @@ -1381,8 +1544,20 @@ "define boundaries", "architecture tradeoffs" ], - "ko": ["아키텍처 짜줘", "시스템 구조 설계", "경계 정의해줘", "구조 검토해줘", "아키텍처 문서"], - "ja": ["アーキテクチャを設計", "システム構成を考えて", "境界を定義", "構成レビュー", "アーキ文書"], + "ko": [ + "아키텍처 짜줘", + "시스템 구조 설계", + "경계 정의해줘", + "구조 검토해줘", + "아키텍처 문서" + ], + "ja": [ + "アーキテクチャを設計", + "システム構成を考えて", + "境界を定義", + "構成レビュー", + "アーキ文書" + ], "zh": ["设计架构", "系统架构方案", "定义边界", "架构文档", "架构权衡"] } }, @@ -1403,17 +1578,45 @@ "server implementation", "clean architecture" ], - "ko": ["api 만들어줘", "엔드포인트 추가", "백엔드 구현", "마이그레이션 작성", "인증 붙여줘"], - "ja": ["apiを作って", "エンドポイント追加", "バックエンド実装", "マイグレーション書いて", "認証を実装"], + "ko": [ + "api 만들어줘", + "엔드포인트 추가", + "백엔드 구현", + "마이그레이션 작성", + "인증 붙여줘" + ], + "ja": [ + "apiを作って", + "エンドポイント追加", + "バックエンド実装", + "マイグレーション書いて", + "認証を実装" + ], "zh": ["写个接口", "加接口", "后端实现", "写迁移", "加认证"] } }, "oma-brainstorm": { "keywords": { "*": [], - "en": ["toss around ideas", "kick around options", "spitball", "some ideas please", "ideation session"], - "ko": ["아이디어 좀 뽑아줘", "같이 고민해줘", "아이디어 내보자", "방향성 고민"], - "ja": ["アイデア出して", "一緒に考えて", "方向性を探りたい", "案を出して"], + "en": [ + "toss around ideas", + "kick around options", + "spitball", + "some ideas please", + "ideation session" + ], + "ko": [ + "아이디어 좀 뽑아줘", + "같이 고민해줘", + "아이디어 내보자", + "방향성 고민" + ], + "ja": [ + "アイデア出して", + "一緒に考えて", + "方向性を探りたい", + "案を出して" + ], "zh": ["帮我想想", "一起想想办法", "给点灵感"] } }, @@ -1430,8 +1633,18 @@ "cli handoff", "manual orchestration" ], - "ko": ["에이전트 조율", "에이전트끼리 협업", "수동으로 에이전트 돌려", "에이전트 순서 잡아줘"], - "ja": ["エージェントを調整", "エージェント連携", "手動でエージェント", "エージェントの順序"], + "ko": [ + "에이전트 조율", + "에이전트끼리 협업", + "수동으로 에이전트 돌려", + "에이전트 순서 잡아줘" + ], + "ja": [ + "エージェントを調整", + "エージェント連携", + "手動でエージェント", + "エージェントの順序" + ], "zh": ["协调代理", "代理之间协作", "手动跑代理", "代理之间衔接"] } }, @@ -1454,8 +1667,20 @@ "data migration", "capacity planning" ], - "ko": ["스키마 설계", "테이블 설계", "인덱스 튜닝", "쿼리 느려", "용량 산정"], - "ja": ["スキーマ設計", "テーブル設計", "インデックス調整", "クエリが遅い", "容量見積"], + "ko": [ + "스키마 설계", + "테이블 설계", + "인덱스 튜닝", + "쿼리 느려", + "용량 산정" + ], + "ja": [ + "スキーマ設計", + "テーブル設計", + "インデックス調整", + "クエリが遅い", + "容量見積" + ], "zh": ["设计表结构", "表设计", "索引优化", "查询很慢", "容量评估"] } }, @@ -1474,8 +1699,20 @@ "crash fix", "error investigation" ], - "ko": ["버그 찾아줘", "에러 원인", "크래시 분석", "스택트레이스 봐줘", "원인 파악해줘"], - "ja": ["バグを探して", "エラー原因", "クラッシュを分析", "スタックトレースを見て", "原因を特定"], + "ko": [ + "버그 찾아줘", + "에러 원인", + "크래시 분석", + "스택트레이스 봐줘", + "원인 파악해줘" + ], + "ja": [ + "バグを探して", + "エラー原因", + "クラッシュを分析", + "スタックトレースを見て", + "原因を特定" + ], "zh": ["找出 bug", "错误原因", "分析崩溃", "看堆栈", "定位原因"] } }, @@ -1493,8 +1730,19 @@ "responsive layout", "motion design" ], - "ko": ["디자인 토큰", "랜딩 만들어줘", "컬러 팔레트 잡아줘", "타이포 스케일", "모션 가이드"], - "ja": ["デザイントークン", "ランディング作成", "カラーパレット決めて", "モーション設計"], + "ko": [ + "디자인 토큰", + "랜딩 만들어줘", + "컬러 팔레트 잡아줘", + "타이포 스케일", + "모션 가이드" + ], + "ja": [ + "デザイントークン", + "ランディング作成", + 
"カラーパレット決めて", + "モーション設計" + ], "zh": ["设计令牌", "做个落地页", "定配色", "字体层级", "动效规范"] } }, @@ -1514,9 +1762,27 @@ "release automation", "build automation" ], - "ko": ["mise 태스크", "ci 파이프라인", "릴리즈 자동화", "깃 훅 설정", "모노레포 워크플로우"], - "ja": ["miseタスク", "ciパイプライン", "リリース自動化", "gitフック", "モノレポ作業"], - "zh": ["mise 任务", "ci 流水线", "发布自动化", "git 钩子", "monorepo 工作流"] + "ko": [ + "mise 태스크", + "ci 파이프라인", + "릴리즈 자동화", + "깃 훅 설정", + "모노레포 워크플로우" + ], + "ja": [ + "miseタスク", + "ciパイプライン", + "リリース自動化", + "gitフック", + "モノレポ作業" + ], + "zh": [ + "mise 任务", + "ci 流水线", + "发布自动化", + "git 钩子", + "monorepo 工作流" + ] } }, "oma-frontend": { @@ -1534,9 +1800,27 @@ "frontend ui", "FSD architecture" ], - "ko": ["리액트 컴포넌트", "넥스트 페이지", "tailwind로 스타일", "shadcn 붙여줘", "프론트 구현"], - "ja": ["reactコンポーネント", "nextページ", "tailwindで装飾", "shadcn導入", "フロント実装"], - "zh": ["写个 react 组件", "next 页面", "用 tailwind", "接入 shadcn", "前端实现"] + "ko": [ + "리액트 컴포넌트", + "넥스트 페이지", + "tailwind로 스타일", + "shadcn 붙여줘", + "프론트 구현" + ], + "ja": [ + "reactコンポーネント", + "nextページ", + "tailwindで装飾", + "shadcn導入", + "フロント実装" + ], + "zh": [ + "写个 react 组件", + "next 页面", + "用 tailwind", + "接入 shadcn", + "前端实现" + ] } }, "oma-hwp": { @@ -1551,7 +1835,16 @@ "hangul word processor", "hwp ingestion" ], - "ko": ["한글 파일", "한글 변환", "한글 파싱", "hwp 변환", "hwp 파싱", "hwp 마크다운", "hwpx 변환", "hwpx 파싱"], + "ko": [ + "한글 파일", + "한글 변환", + "한글 파싱", + "hwp 변환", + "hwp 파싱", + "hwp 마크다운", + "hwpx 변환", + "hwpx 파싱" + ], "ja": ["hwp変換", "hwpをマークダウン", "hwpを解析", "韓国語ワープロ"], "zh": ["hwp 转换", "hwp 解析", "hwp 转 markdown", "韩文文档"] } @@ -1571,9 +1864,233 @@ "mobile app", "android ios" ], - "ko": ["플러터 화면", "리액트 네이티브 화면", "다트 위젯", "안드로이드 아이폰 앱", "모바일 앱"], - "ja": ["flutter画面", "react native画面", "dartウィジェット", "iosアンドロイド", "モバイルアプリ"], - "zh": ["flutter 页面", "react native 页面", "dart 组件", "安卓 ios", "移动端应用"] + "ko": [ + "플러터 화면", + "리액트 네이티브 화면", + "다트 위젯", + "안드로이드 아이폰 앱", + "모바일 앱" + ], + "ja": [ + "flutter画面", + "react native画面", + "dartウィジェット", + "iosアンドロイド", + "モバイルアプリ" + ], + "zh": [ + "flutter 页面", + "react native 页面", + "dart 组件", + "安卓 ios", + "移动端应用" + ] + } + }, + "oma-observability": { + "keywords": { + "*": [ + "OpenTelemetry", + "OTel", + "OTLP", + "W3C Trace Context", + "traceparent", + "MELT", + "APM", + "RUM", + "SLO", + "SLI", + "burn-rate", + "PromQL", + "Prometheus", + "Grafana", + "Jaeger", + "Tempo", + "Loki", + "Mimir", + "Fluent Bit", + "OpenCost", + "OpenFeature", + "Flagger", + "Falco", + "Parca", + "Pyroscope", + "Honeycomb", + "Datadog", + "Sentry", + "Crashlytics", + "Core Web Vitals" + ], + "en": [ + "observability", + "traceability", + "telemetry", + "distributed tracing", + "instrument my service", + "set up OTel", + "OTel pipeline", + "collector topology", + "tail sampling", + "cardinality budget", + "clock skew", + "error budget", + "burn rate alert", + "canary analysis", + "progressive delivery", + "feature flag observability", + "incident forensics", + "6-dimension localization", + "root cause across services", + "multi-tenant telemetry", + "per-tenant sampling", + "data residency telemetry", + "redact PII in logs", + "observability as code", + "dashboard as code", + "PrometheusRule CRD", + "Grafana Jsonnet", + "Perses dashboard", + "UDP MTU telemetry", + "StatsD fragmentation", + "OTLP gRPC vs HTTP", + "propagator matrix", + "BGP observability", + "QUIC observability", + "eBPF observability", + "service mesh tracing", + "zero code instrumentation", + "mobile crash analytics", + "crash-free rate", + "symbolication pipeline", + "offline telemetry queue" + ], + "ko": [ + "관측성", 
+ "관측 가능성", + "추적성", + "추적 가능성", + "텔레메트리", + "텔레메트리 수집", + "분산 트레이싱", + "OTel 도입", + "OTel 셋업", + "OTel 계측", + "OTel 파이프라인", + "컬렉터 토폴로지", + "테일 샘플링", + "카디널리티", + "카디널리티 관리", + "클록 스큐", + "시계 드리프트", + "에러 버짓", + "에러 예산", + "번레이트 알람", + "번레이트", + "카나리 분석", + "프로그레시브 딜리버리", + "점진 배포", + "피처 플래그 관측", + "사건 부검", + "장애 부검", + "장애 원인 분석", + "6차원 좁히기", + "멀티테넌트 관측", + "테넌트별 샘플링", + "데이터 거주 관측", + "로그 PII 제거", + "로그 익명화", + "로그 가명화", + "관측성 as code", + "대시보드 as code", + "대시보드 코드화", + "PrometheusRule", + "Grafana Jsonnet", + "Perses 대시보드", + "UDP MTU 튜닝", + "StatsD 단편화", + "OTLP gRPC 선택", + "전파자 매핑", + "BGP 관측", + "QUIC 관측", + "eBPF 관측", + "서비스 메시 트레이싱", + "zero-code 계측", + "모바일 크래시 분석", + "크래시 프리 레이트", + "심볼리케이션", + "오프라인 텔레메트리 큐" + ], + "ja": [ + "オブザーバビリティ", + "トレーサビリティ", + "テレメトリ", + "分散トレーシング", + "OTel導入", + "OTelパイプライン", + "コレクタ構成", + "テイルサンプリング", + "カーディナリティ予算", + "クロックスキュー", + "エラーバジェット", + "バーンレートアラート", + "カナリア分析", + "プログレッシブデリバリ", + "機能フラグ観測", + "インシデントフォレンジック", + "マルチテナント観測", + "データ居住性観測", + "ログPII除去", + "Observability as Code", + "Dashboard as Code", + "UDP MTUチューニング", + "StatsDフラグメンテーション", + "OTLP選択", + "プロパゲータマッピング", + "BGP観測", + "QUIC観測", + "eBPF観測", + "サービスメッシュトレース", + "モバイルクラッシュ分析", + "クラッシュフリーレート", + "シンボリケーション", + "オフラインテレメトリ" + ], + "zh": [ + "可观测性", + "可追溯性", + "遥测", + "分布式追踪", + "OTel 接入", + "OTel 流水线", + "采集器拓扑", + "尾采样", + "基数预算", + "时钟漂移", + "错误预算", + "燃烧率告警", + "金丝雀分析", + "渐进式发布", + "特性开关观测", + "事件取证", + "多租户观测", + "数据驻留观测", + "日志脱敏", + "可观测性即代码", + "仪表盘即代码", + "UDP MTU 调优", + "StatsD 分片", + "OTLP 选择", + "传播器映射", + "BGP 观测", + "QUIC 观测", + "eBPF 观测", + "服务网格追踪", + "零代码探针", + "移动崩溃分析", + "崩溃无事率", + "符号化", + "离线遥测队列" + ] } }, "oma-orchestrator": { @@ -1590,8 +2107,18 @@ "review loop", "mcp memory coordination" ], - "ko": ["에이전트 병렬 실행", "동시에 에이전트 돌려", "fan-out", "리뷰 루프 돌려"], - "ja": ["エージェント並列実行", "同時にエージェント", "fan-out", "レビューループ"], + "ko": [ + "에이전트 병렬 실행", + "동시에 에이전트 돌려", + "fan-out", + "리뷰 루프 돌려" + ], + "ja": [ + "エージェント並列実行", + "同時にエージェント", + "fan-out", + "レビューループ" + ], "zh": ["并行跑代理", "同时派发代理", "fan-out 任务", "评审循环"] } }, @@ -1628,8 +2155,20 @@ "scope definition", "prioritization matrix" ], - "ko": ["요구사항 정리", "스펙 문서", "우선순위 매겨줘", "스코프 정의", "제품 로드맵"], - "ja": ["要件を整理", "スペック作成", "優先度付け", "スコープ定義", "プロダクトロードマップ"], + "ko": [ + "요구사항 정리", + "스펙 문서", + "우선순위 매겨줘", + "스코프 정의", + "제품 로드맵" + ], + "ja": [ + "要件を整理", + "スペック作成", + "優先度付け", + "スコープ定義", + "プロダクトロードマップ" + ], "zh": ["梳理需求", "写规格书", "排优先级", "界定范围", "产品路线图"] } }, @@ -1647,7 +2186,12 @@ "test coverage" ], "ko": ["접근성 점검", "성능 점검", "커버리지 확인", "품질 게이트"], - "ja": ["アクセシビリティ確認", "パフォーマンス点検", "カバレッジ確認", "品質ゲート"], + "ja": [ + "アクセシビリティ確認", + "パフォーマンス点検", + "カバレッジ確認", + "品質ゲート" + ], "zh": ["无障碍检查", "性能检查", "覆盖率报告", "质量门禁"] } }, @@ -1666,8 +2210,20 @@ "transcript analysis", "multi tool recap" ], - "ko": ["오늘 한 일 정리", "하루 요약", "주간 요약", "작업 내용 정리", "대화 요약"], - "ja": ["今日の作業まとめ", "日次サマリ", "週次サマリ", "作業振り返り", "会話まとめ"], + "ko": [ + "오늘 한 일 정리", + "하루 요약", + "주간 요약", + "작업 내용 정리", + "대화 요약" + ], + "ja": [ + "今日の作業まとめ", + "日次サマリ", + "週次サマリ", + "作業振り返り", + "会話まとめ" + ], "zh": ["今天做了什么", "日报总结", "周报总结", "工作回顾", "对话总结"] } }, @@ -1685,7 +2241,12 @@ "git worktree" ], "ko": ["머지 충돌 해결", "리베이스해줘", "워크트리 써줘"], - "ja": ["マージ衝突解決", "リベースして", "リリースタグ", "worktree使って"], + "ja": [ + "マージ衝突解決", + "リベースして", + "リリースタグ", + "worktree使って" + ], "zh": ["解决合并冲突", "帮我 rebase", "打发布标签", "用 worktree"] } }, @@ -1705,8 +2266,20 @@ "library reference", "context7 docs" ], - "ko": ["검색해줘", "찾아줘", "레퍼런스 찾아", "문서 찾아줘", "라이브러리 찾아줘"], - "ja": ["検索して", "調べて", 
"ドキュメント探して", "ライブラリ調べて", "リファレンス探して"], + "ko": [ + "검색해줘", + "찾아줘", + "레퍼런스 찾아", + "문서 찾아줘", + "라이브러리 찾아줘" + ], + "ja": [ + "検索して", + "調べて", + "ドキュメント探して", + "ライブラリ調べて", + "リファレンス探して" + ], "zh": ["帮我查", "搜一下", "找找文档", "找个库", "查参考资料"] } }, @@ -1725,9 +2298,27 @@ "oidc setup", "cost optimization" ], - "ko": ["테라폼 플랜", "인프라 프로비저닝", "iac 모듈", "클라우드 리소스", "비용 최적화"], - "ja": ["terraformプラン", "インフラ構築", "iacモジュール", "クラウドリソース", "コスト最適化"], - "zh": ["terraform plan", "搭建基础设施", "iac 模块", "云资源", "成本优化"] + "ko": [ + "테라폼 플랜", + "인프라 프로비저닝", + "iac 모듈", + "클라우드 리소스", + "비용 최적화" + ], + "ja": [ + "terraformプラン", + "インフラ構築", + "iacモジュール", + "クラウドリソース", + "コスト最適化" + ], + "zh": [ + "terraform plan", + "搭建基础设施", + "iac 模块", + "云资源", + "成本优化" + ] } }, "oma-translator": { @@ -1744,10 +2335,104 @@ "multilingual content", "arb translation" ], - "ko": ["번역해줘", "번역 부탁", "다국어로", "영어로 바꿔줘", "현지화해줘"], + "ko": [ + "번역해줘", + "번역 부탁", + "다국어로", + "영어로 바꿔줘", + "현지화해줘" + ], "ja": ["翻訳して", "英訳", "多言語化", "ローカライズして", "訳して"], "zh": ["翻译一下", "帮我翻译", "多语言", "本地化", "翻成英文"] } + }, + "oma-image": { + "keywords": { + "*": [ + "nano-banana", + "nanobanana", + "gpt-image", + "pollinations", + "oma-image" + ], + "en": [ + "generate image", + "generate an image", + "create image", + "create an image", + "make a picture", + "make an image", + "render image", + "render a picture", + "draw me", + "draw a", + "ai image", + "image generation", + "generate a photo", + "create picture", + "picture of", + "image of" + ], + "ko": [ + "이미지 만들어", + "이미지 만들어줘", + "이미지 생성", + "이미지 생성해", + "이미지 생성해줘", + "사진 만들어", + "사진 만들어줘", + "그림 그려", + "그림 그려줘", + "이미지 뽑아", + "이미지 뽑아줘", + "이미지 그려줘", + "이미지 출력", + "나노바나나", + "나노 바나나", + "바나나로 뽑", + "이미지 생성기", + "ai 이미지" + ], + "ja": [ + "画像を生成", + "画像生成", + "画像を作", + "画像を作成", + "絵を描いて", + "画像出力", + "イラストを生成", + "写真を生成" + ], + "zh": [ + "生成图像", + "生成图片", + "生成一张", + "画一张", + "画一幅", + "帮我画", + "出图", + "图像生成", + "图片生成" + ], + "es": [ + "generar imagen", + "crear imagen", + "hazme una imagen", + "genera una foto" + ], + "fr": [ + "générer une image", + "créer une image", + "fais-moi une image", + "dessine-moi" + ], + "de": [ + "bild generieren", + "bild erstellen", + "erstelle ein bild", + "zeichne mir" + ] + } } }, "informationalPatterns": { @@ -1768,22 +2453,108 @@ "是什么", "とは" ], - "ko": ["뭐야", "뭐임", "무엇", "어떻게", "설명해", "알려줘", "키워드", "감지", "오탐"], - "ja": ["とは", "って何", "どうやって", "説明して", "キーワード", "検出", "誤検出"], + "ko": [ + "뭐야", + "뭐임", + "무엇", + "어떻게", + "설명해", + "알려줘", + "키워드", + "감지", + "오탐" + ], + "ja": [ + "とは", + "って何", + "どうやって", + "説明して", + "キーワード", + "検出", + "誤検出" + ], "zh": ["是什么", "什么是", "怎么", "解释", "关键词", "检测", "误报"], - "es": ["qué es", "cómo", "explica", "palabra clave", "falso positivo", "detectado"], - "fr": ["c'est quoi", "comment", "explique", "mot-clé", "faux positif", "détecté"], - "de": ["was ist", "wie", "erkläre", "schlüsselwort", "falsch positiv", "erkannt"], - "pt": ["o que é", "como", "explique", "palavra-chave", "falso positivo", "detectado"], - "ru": ["что такое", "как", "объясни", "ключевое слово", "ложное срабатывание", "обнаружено"], - "nl": ["wat is", "hoe", "leg uit", "sleutelwoord", "vals positief", "gedetecteerd"], - "pl": ["co to", "jak", "wyjaśnij", "słowo kluczowe", "fałszywy alarm", "wykryto"] + "es": [ + "qué es", + "cómo", + "explica", + "palabra clave", + "falso positivo", + "detectado" + ], + "fr": [ + "c'est quoi", + "comment", + "explique", + "mot-clé", + "faux positif", + "détecté" + ], + "de": [ + "was ist", + "wie", + "erkläre", + "schlüsselwort", + "falsch positiv", + 
"erkannt" + ], + "pt": [ + "o que é", + "como", + "explique", + "palavra-chave", + "falso positivo", + "detectado" + ], + "ru": [ + "что такое", + "как", + "объясни", + "ключевое слово", + "ложное срабатывание", + "обнаружено" + ], + "nl": [ + "wat is", + "hoe", + "leg uit", + "sleutelwoord", + "vals positief", + "gedetecteerd" + ], + "pl": [ + "co to", + "jak", + "wyjaśnij", + "słowo kluczowe", + "fałszywy alarm", + "wykryto" + ] }, "excludedWorkflows": ["tools", "stack-set", "exec-plan"], "cjkScripts": ["ko", "ja", "zh"], "extensionRouting": { - "frontend-engineer": ["tsx", "jsx", "css", "scss", "less", "vue", "svelte", "html"], - "backend-engineer": ["go", "py", "java", "rs", "rb", "php", "controller", "service", "resolver"], + "frontend-engineer": [ + "tsx", + "jsx", + "css", + "scss", + "less", + "vue", + "svelte", + "html" + ], + "backend-engineer": [ + "go", + "py", + "java", + "rs", + "rb", + "php", + "controller", + "service", + "resolver" + ], "db-engineer": ["sql", "prisma", "graphql", "migration"], "mobile-engineer": ["dart", "swift", "kt", "xib", "storyboard"], "designer": ["figma", "sketch", "svg"] diff --git a/.qwen/hooks/types.ts b/.qwen/hooks/types.ts index f9bf420..fd54f3e 100644 --- a/.qwen/hooks/types.ts +++ b/.qwen/hooks/types.ts @@ -1,8 +1,8 @@ // Claude Code Hook Types for oh-my-agent // Shared across Claude Code, Codex CLI, Cursor, Gemini CLI, and Qwen Code -import { existsSync } from "node:fs" -import { dirname, join } from "node:path" +import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; // --- Project Root Resolution --- @@ -12,52 +12,55 @@ import { dirname, join } from "node:path" * (e.g. packages/i18n during a build) from creating state files * in the wrong location. */ -const MAX_DEPTH = 20 +const MAX_DEPTH = 20; export function resolveGitRoot(startDir: string): string { - let dir = startDir + let dir = startDir; for (let i = 0; i < MAX_DEPTH; i++) { - if (existsSync(join(dir, ".git"))) return dir - const parent = dirname(dir) - if (parent === dir) return startDir - dir = parent + if (existsSync(join(dir, ".git"))) return dir; + const parent = dirname(dir); + if (parent === dir) return startDir; + dir = parent; } - return startDir + return startDir; } // --- Vendor Detection --- -export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen" +export type Vendor = "claude" | "codex" | "cursor" | "gemini" | "qwen"; // --- Hook Input (unified) --- export interface HookInput { - prompt?: string - sessionId?: string - session_id?: string - hook_event_name?: string - cwd?: string - workspace_roots?: string[] + prompt?: string; + sessionId?: string; + session_id?: string; + hook_event_name?: string; + cwd?: string; + workspace_roots?: string[]; // Gemini: AfterAgent fields - prompt_response?: string - stop_hook_active?: boolean + prompt_response?: string; + stop_hook_active?: boolean; // Claude/Qwen: Stop fields - stopReason?: string + stopReason?: string; } // --- Hook Output Builders --- -export function makePromptOutput(vendor: Vendor, additionalContext: string): string { +export function makePromptOutput( + vendor: Vendor, + additionalContext: string, +): string { switch (vendor) { case "claude": - return JSON.stringify({ additionalContext }) + return JSON.stringify({ additionalContext }); case "codex": return JSON.stringify({ hookSpecificOutput: { hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "cursor": return JSON.stringify({ additionalContext, @@ -66,14 +69,14 @@ export function 
makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); case "gemini": return JSON.stringify({ hookSpecificOutput: { hookEventName: "BeforeAgent", additionalContext, }, - }) + }); case "qwen": // Qwen Code fork uses hookSpecificOutput (same as Codex) return JSON.stringify({ @@ -81,7 +84,7 @@ export function makePromptOutput(vendor: Vendor, additionalContext: string): str hookEventName: "UserPromptSubmit", additionalContext, }, - }) + }); } } @@ -91,22 +94,25 @@ export function makeBlockOutput(vendor: Vendor, reason: string): string { case "codex": case "cursor": case "qwen": - return JSON.stringify({ decision: "block", reason }) + return JSON.stringify({ decision: "block", reason }); case "gemini": // Gemini AfterAgent uses "deny" to reject response and force retry - return JSON.stringify({ decision: "deny", reason }) + return JSON.stringify({ decision: "deny", reason }); } } // --- PreToolUse Output Builder --- -export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, unknown>): string { +export function makePreToolOutput( + vendor: Vendor, + updatedInput: Record<string, unknown>, +): string { switch (vendor) { case "gemini": return JSON.stringify({ decision: "rewrite", tool_input: updatedInput, - }) + }); case "cursor": return JSON.stringify({ updated_input: updatedInput, @@ -114,24 +120,27 @@ export function makePreToolOutput(vendor: Vendor, updatedInput: Record<string, u hookEventName: "PreToolUse", updatedInput, }, - }) + }); case "claude": - case "codex": - case "qwen": return JSON.stringify({ hookSpecificOutput: { hookEventName: "PreToolUse", updatedInput, }, - }) + }); + case "codex": + case "qwen": + return JSON.stringify({ + updated_input: updatedInput, + }); } } // --- Shared Types --- export interface ModeState { - workflow: string - sessionId: string - activatedAt: string - reinforcementCount: number + workflow: string; + sessionId: string; + activatedAt: string; + reinforcementCount: number; } diff --git a/.qwen/settings.json b/.qwen/settings.json index f09641e..c2c04ac 100644 --- a/.qwen/settings.json +++ b/.qwen/settings.json @@ -43,5 +43,23 @@ ] } ] + }, + "mcpServers": { + "serena": { + "command": "uvx", + "args": [ + "--from", + "git+https://github.com/oraios/serena", + "serena", + "start-mcp-server", + "--context", + "agent", + "--project", + "." + ], + "env": { + "SERENA_LOG_LEVEL": "info" + } + } } }
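A quick way to exercise these hooks outside an agent session is to pipe a
synthetic payload into them with Bun. A minimal sketch (field names mirror
what skill-injector.ts parses; the path assumes the .qwen copy of the hook,
and a matching .agents/skills/oma-image/SKILL.md must exist for an injection
to fire):

// try-skill-injector.ts: drive the hook with a fake UserPromptSubmit event
import { spawnSync } from "node:child_process";

const payload = JSON.stringify({
  prompt: "generate image of a cat", // matches an oma-image trigger in triggers.json
  sessionId: "local-test",
  cwd: process.cwd(),
});

const result = spawnSync("bun", [".qwen/hooks/skill-injector.ts"], {
  input: payload,
  encoding: "utf-8",
});

// On a match, stdout carries the vendor-specific envelope built by
// makePromptOutput (qwen here, inferred from the script path); empty stdout
// means the hook exited without injecting anything. The run also records the
// injection in .agents/state/skill-sessions.json for session dedup.
console.log(result.stdout || "(no injection)");

Note that the hooks read /dev/stdin (or Bun.stdin in test-filter.ts), so the
piped input above is picked up on POSIX systems without extra flags.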