From d98e091fc67022367b4ed03a92d50f8188912651 Mon Sep 17 00:00:00 2001 From: pipi-1997 <46177323+pipi-1997@users.noreply.github.com> Date: Thu, 22 Jan 2026 01:00:42 +0800 Subject: [PATCH 01/10] feat: smart provider failover --- assets/oh-my-opencode.schema.json | 514 +++++++++++++++++++++++- docs/features/smart-failover.md | 31 ++ docs/features/smart-failover.zh-CN.md | 37 ++ src/agents/sisyphus-junior.ts | 5 +- src/agents/types.ts | 3 +- src/agents/utils.ts | 37 +- src/config/schema.ts | 16 +- src/features/failover/diagnoser.ts | 54 +++ src/features/failover/resolver.ts | 24 ++ src/features/failover/status-manager.ts | 66 +++ src/features/failover/types.ts | 29 ++ src/hooks/index.ts | 1 + src/hooks/smart-failover/index.test.ts | 122 ++++++ src/hooks/smart-failover/index.ts | 243 +++++++++++ src/index.ts | 7 + src/plugin-handlers/config-handler.ts | 11 +- src/tools/delegate-task/tools.ts | 7 +- 17 files changed, 1174 insertions(+), 33 deletions(-) create mode 100644 docs/features/smart-failover.md create mode 100644 docs/features/smart-failover.zh-CN.md create mode 100644 src/features/failover/diagnoser.ts create mode 100644 src/features/failover/resolver.ts create mode 100644 src/features/failover/status-manager.ts create mode 100644 src/features/failover/types.ts create mode 100644 src/hooks/smart-failover/index.test.ts create mode 100644 src/hooks/smart-failover/index.ts diff --git a/assets/oh-my-opencode.schema.json b/assets/oh-my-opencode.schema.json index 8636df6a5c..769f3c33cc 100644 --- a/assets/oh-my-opencode.schema.json +++ b/assets/oh-my-opencode.schema.json @@ -8,6 +8,19 @@ "$schema": { "type": "string" }, + "model": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, "disabled_mcps": { "type": "array", "items": { @@ -76,7 +89,8 @@ "delegate-task-retry", "prometheus-md-only", "start-work", - "atlas" + "atlas", + "smart-failover" ] } }, @@ -97,7 +111,39 @@ "type": "object", "properties": { 
"model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -223,7 +269,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -349,7 +427,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -475,7 +585,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + 
"minimum": 0 + } + } }, "variant": { "type": "string" @@ -601,7 +743,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -727,7 +901,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -853,7 +1059,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -979,7 +1217,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": 
"string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -1105,7 +1375,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -1231,7 +1533,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -1357,7 +1691,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -1483,7 +1849,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + 
"properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -1609,7 +2007,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -1742,7 +2172,39 @@ "type": "object", "properties": { "model": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } }, "variant": { "type": "string" @@ -2166,6 +2628,28 @@ "type": "boolean" } } + }, + "failover": { + "type": "object", + "properties": { + "enabled": { + "default": true, + "type": "boolean" + }, + "strategy": { + "default": "auto", + "type": "string", + "enum": [ + "auto", + "aggressive", + "conservative" + ] + }, + "cooldown_ms": { + "type": "number", + "minimum": 0 + } + } } } } \ No newline at end of file diff --git a/docs/features/smart-failover.md b/docs/features/smart-failover.md new file mode 100644 index 0000000000..b7f6701ce6 --- /dev/null +++ b/docs/features/smart-failover.md @@ -0,0 +1,31 @@ +# Smart Provider Failover + +## 1. 
Overview +In multi-model environments, providers often hit **429 (Rate Limits)**, **Insufficient Balance**, or **Quota Exhaustion**. +The Smart Failover system provides an automated detection and recovery mechanism, ensuring uninterrupted service by switching to healthy fallback models instantly. + +## 2. Key Features +- **Pipe Syntax (`|`)**: Minimalist fallback chain definitions. +- **Instant Failover**: Aborts OpenCode's internal retry loops to trigger immediate model swapping. +- **Error Diagnosis**: Parses `Retry-After` headers and vendor-specific error payloads. +- **Guardrails**: + - **Context Compatibility**: Skips fallbacks with insufficient context windows. + - **Half-Open Probation**: Tests cooling providers with a single probe before full recovery. + - **Memory Safety**: Automatic cleanup upon session deletion. + +## 3. Configuration +Modify the `model` field in your `oh-my-opencode.json`. + +### Example +```jsonc +{ + "model": "openai/gpt-5.2-codex | google/gemini-3-pro", + "failover": { + "strategy": "auto" + } +} +``` + +## 4. UI/UX +- **Notification**: A yellow toast appears: `⚠️ Switched to google/gemini-3-pro`. +- **Throttling**: Toasts are shown only once per session to prevent UI spam. diff --git a/docs/features/smart-failover.zh-CN.md b/docs/features/smart-failover.zh-CN.md new file mode 100644 index 0000000000..c6e71577bd --- /dev/null +++ b/docs/features/smart-failover.zh-CN.md @@ -0,0 +1,37 @@ +# 智能供应商故障切换 (Smart Provider Failover) + +## 1. 简介 +在多模型协作环境下,API 供应商经常会遇到 **429 (频率限制)**、**余额不足** 或 **订阅配额耗尽** 的情况。 +Smart Failover 系统为 `oh-my-opencode` 引入了一套自动化的故障检测与恢复机制。它能确保在主模型不可用时,系统瞬间接管请求并切换到备用模型,实现“永不掉线”的 AI 助手体验。 + +## 2. 核心特性 +- **管道符配置 (`|`)**: 极简的备选链定义方式。 +- **秒级无感切换**: 自动强行终止 OpenCode 原生的卡顿重试循环,秒切备用线路。 +- **工业级错误诊断**: 智能解析 `Retry-After` 头部及各种供应商特有的错误语义。 +- **安全防护栏**: + - **上下文窗口对齐**: 自动跳过窗口过小的备用模型。 + - **半开探测 (Half-Open)**: 冷却结束后先进行单次探测,防止盲目恢复。 + - **内存管理**: 随会话销毁自动清理缓存,无内存泄露风险。 + +## 3. 
使用方法 +在 `oh-my-opencode.json` 中配置 `model` 字段即可。 + +### 示例配置 +```jsonc +{ + "model": "openai/gpt-5.2-codex | google/gemini-3-pro", + "failover": { + "strategy": "auto" // 可选: auto (默认), aggressive, conservative + } +} +``` + +## 4. 故障状态说明 +- **HEALTHY (健康)**: 正常使用。 +- **COOLING (冷却中)**: 触发 429 或 5xx 错误,根据指数退避算法进入等待期。 +- **LOCKED (锁定)**: 触发余额不足或配额耗尽,除非重启或修改配置,否则不再尝试。 +- **PROBATION (试用期)**: 冷却结束,允许一个请求尝试“探路”,成功则转为健康。 + +## 5. UI 交互 +- **故障提示**: 切换时会弹出黄色 Toast,内容如 `⚠️ Switched to google/gemini-3-pro`。 +- **静默机制**: 每个会话仅提示一次,后续切换保持静默,不干扰工作。 diff --git a/src/agents/sisyphus-junior.ts b/src/agents/sisyphus-junior.ts index 45b4102ddd..a8e212d904 100644 --- a/src/agents/sisyphus-junior.ts +++ b/src/agents/sisyphus-junior.ts @@ -91,6 +91,7 @@ export function createSisyphusJuniorAgentWithOverrides( } const model = override?.model ?? systemDefaultModel ?? SISYPHUS_JUNIOR_DEFAULTS.model + const primaryModel = Array.isArray(model) ? model[0] : model const temperature = override?.temperature ?? SISYPHUS_JUNIOR_DEFAULTS.temperature const promptAppend = override?.prompt_append @@ -111,7 +112,7 @@ export function createSisyphusJuniorAgentWithOverrides( description: override?.description ?? "Sisyphus-Junior - Focused task executor. 
Same discipline, no delegation.", mode: "subagent" as const, - model, + model: primaryModel, temperature, maxTokens: 64000, prompt, @@ -123,7 +124,7 @@ export function createSisyphusJuniorAgentWithOverrides( base.top_p = override.top_p } - if (isGptModel(model)) { + if (isGptModel(primaryModel)) { return { ...base, reasoningEffort: "medium" } as AgentConfig } diff --git a/src/agents/types.ts b/src/agents/types.ts index f22683a277..1b89873c57 100644 --- a/src/agents/types.ts +++ b/src/agents/types.ts @@ -72,7 +72,8 @@ export type OverridableAgentName = export type AgentName = BuiltinAgentName -export type AgentOverrideConfig = Partial & { +export type AgentOverrideConfig = Partial> & { + model?: string | string[] prompt_append?: string variant?: string } diff --git a/src/agents/utils.ts b/src/agents/utils.ts index 6bc3c5d86a..56a897bdd9 100644 --- a/src/agents/utils.ts +++ b/src/agents/utils.ts @@ -47,21 +47,24 @@ function isFactory(source: AgentSource): source is AgentFactory { export function buildAgent( source: AgentSource, - model: string, + model: string | string[], categories?: CategoriesConfig, gitMasterConfig?: GitMasterConfig ): AgentConfig { - const base = isFactory(source) ? source(model) : source const categoryConfigs: Record = categories ? { ...DEFAULT_CATEGORIES, ...categories } : DEFAULT_CATEGORIES + const primaryModel = Array.isArray(model) ? model[0] : model + const base = isFactory(source) ? source(primaryModel) : source + const agentWithCategory = base as AgentConfig & { category?: string; skills?: string[]; variant?: string } if (agentWithCategory.category) { const categoryConfig = categoryConfigs[agentWithCategory.category] if (categoryConfig) { if (!base.model) { - base.model = categoryConfig.model + const categoryModel = categoryConfig.model + base.model = Array.isArray(categoryModel) ? 
categoryModel[0] : categoryModel } if (base.temperature === undefined && categoryConfig.temperature !== undefined) { base.temperature = categoryConfig.temperature @@ -121,9 +124,25 @@ function mergeAgentConfig( base: AgentConfig, override: AgentOverrideConfig ): AgentConfig { - const { prompt_append, ...rest } = override + const { prompt_append, model, ...rest } = override + + let sanitizedModel: string | undefined + if (model) { + if (Array.isArray(model)) { + sanitizedModel = model[0] + } else if (model.includes("|")) { + sanitizedModel = model.split("|")[0].trim() + } else { + sanitizedModel = model + } + } + const merged = deepMerge(base, rest as Partial) + if (sanitizedModel) { + merged.model = sanitizedModel + } + if (prompt_append && merged.prompt) { merged.prompt = merged.prompt + "\n" + prompt_append } @@ -171,8 +190,9 @@ export function createBuiltinAgents( const override = agentOverrides[agentName] const model = override?.model ?? systemDefaultModel + const primaryModel = Array.isArray(model) ? model[0] : model - let config = buildAgent(source, model, mergedCategories, gitMasterConfig) + let config = buildAgent(source, primaryModel, mergedCategories, gitMasterConfig) if (agentName === "librarian" && directory && config.prompt) { const envContext = createEnvContext() @@ -198,9 +218,10 @@ export function createBuiltinAgents( if (!disabledAgents.includes("Sisyphus")) { const sisyphusOverride = agentOverrides["Sisyphus"] const sisyphusModel = sisyphusOverride?.model ?? systemDefaultModel + const sisyphusPrimaryModel = Array.isArray(sisyphusModel) ? sisyphusModel[0] : sisyphusModel let sisyphusConfig = createSisyphusAgent( - sisyphusModel, + sisyphusPrimaryModel, availableAgents, undefined, availableSkills, @@ -222,8 +243,10 @@ export function createBuiltinAgents( if (!disabledAgents.includes("Atlas")) { const orchestratorOverride = agentOverrides["Atlas"] const orchestratorModel = orchestratorOverride?.model ?? 
systemDefaultModel + const orchestratorPrimaryModel = Array.isArray(orchestratorModel) ? orchestratorModel[0] : orchestratorModel + let orchestratorConfig = createAtlasAgent({ - model: orchestratorModel, + model: orchestratorPrimaryModel, availableAgents, availableSkills, userCategories: categories, diff --git a/src/config/schema.ts b/src/config/schema.ts index 2fdde902c9..8b2009ea82 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -84,6 +84,7 @@ export const HookNameSchema = z.enum([ "prometheus-md-only", "start-work", "atlas", + "smart-failover", ]) export const BuiltinCommandNameSchema = z.enum([ @@ -91,9 +92,16 @@ export const BuiltinCommandNameSchema = z.enum([ "start-work", ]) +export const FailoverConfigSchema = z.object({ + enabled: z.boolean().default(true), + strategy: z.enum(["auto", "aggressive", "conservative"]).default("auto"), + cooldown_ms: z.number().min(0).optional(), +}) + export const AgentOverrideConfigSchema = z.object({ /** @deprecated Use `category` instead. Model is inherited from category defaults. 
*/ - model: z.string().optional(), + model: z.union([z.string(), z.array(z.string())]).optional(), + failover: FailoverConfigSchema.optional(), variant: z.string().optional(), /** Category name to inherit model and other settings from CategoryConfig */ category: z.string().optional(), @@ -148,7 +156,8 @@ export const SisyphusAgentConfigSchema = z.object({ }) export const CategoryConfigSchema = z.object({ - model: z.string().optional(), + model: z.union([z.string(), z.array(z.string())]).optional(), + failover: FailoverConfigSchema.optional(), variant: z.string().optional(), temperature: z.number().min(0).max(2).optional(), top_p: z.number().min(0).max(1).optional(), @@ -296,6 +305,7 @@ export const GitMasterConfigSchema = z.object({ export const OhMyOpenCodeConfigSchema = z.object({ $schema: z.string().optional(), + model: z.union([z.string(), z.array(z.string())]).optional(), disabled_mcps: z.array(AnyMcpNameSchema).optional(), disabled_agents: z.array(BuiltinAgentNameSchema).optional(), disabled_skills: z.array(BuiltinSkillNameSchema).optional(), @@ -313,6 +323,7 @@ export const OhMyOpenCodeConfigSchema = z.object({ background_task: BackgroundTaskConfigSchema.optional(), notification: NotificationConfigSchema.optional(), git_master: GitMasterConfigSchema.optional(), + failover: FailoverConfigSchema.optional(), }) export type OhMyOpenCodeConfig = z.infer @@ -332,6 +343,7 @@ export type SkillDefinition = z.infer export type RalphLoopConfig = z.infer export type NotificationConfig = z.infer export type CategoryConfig = z.infer +export type FailoverConfig = z.infer export type CategoriesConfig = z.infer export type BuiltinCategoryName = z.infer export type GitMasterConfig = z.infer diff --git a/src/features/failover/diagnoser.ts b/src/features/failover/diagnoser.ts new file mode 100644 index 0000000000..c51a3b0ce7 --- /dev/null +++ b/src/features/failover/diagnoser.ts @@ -0,0 +1,54 @@ +import type { DiagnoseResult, RecoveryAction } from "./types" + +const PATTERNS: 
/**
 * Ordered table of error signatures. The first entry whose regex matches the
 * error text decides the recovery action, so order is significant.
 *
 * HTTP status codes are matched only when they are not embedded in a longer
 * digit run. A bare `/500/` would also match "150000" in a message such as
 * "maximum context length is 150000 tokens" and misclassify it as a server
 * error before the context-length patterns are reached.
 */
const PATTERNS: Array<{ regex: RegExp; action: RecoveryAction; type: string }> = [
  { regex: /insufficient balance/i, action: "LOCKED", type: "balance" },
  { regex: /usage limit reached/i, action: "LOCKED", type: "balance" },
  { regex: /quota exceeded/i, action: "COOLING", type: "quota" },
  { regex: /rate limit/i, action: "COOLING", type: "rate_limit" },
  { regex: /(?<!\d)429(?!\d)/, action: "COOLING", type: "rate_limit" },
  { regex: /overloaded/i, action: "COOLING", type: "overloaded" },
  { regex: /(?<!\d)503(?!\d)/, action: "COOLING", type: "server_error" },
  { regex: /(?<!\d)502(?!\d)/, action: "COOLING", type: "server_error" },
  { regex: /(?<!\d)500(?!\d)/, action: "COOLING", type: "server_error" },
  { regex: /unavailable/i, action: "COOLING", type: "availability" },
  { regex: /not found/i, action: "COOLING", type: "availability" },
  { regex: /does not exist/i, action: "COOLING", type: "availability" },
  { regex: /unsupported/i, action: "COOLING", type: "availability" },
  { regex: /context length/i, action: "SKIP", type: "context_length" },
  { regex: /maximum context/i, action: "SKIP", type: "context_length" },
  { regex: /token limit/i, action: "SKIP", type: "context_length" },
]

/**
 * Produces the text that PATTERNS is matched against.
 *
 * Strings and Error instances keep their `String()` form. Other objects are
 * additionally serialized with JSON.stringify, because `String()` alone yields
 * "[object Object]" and hides any nested message or status fields.
 */
function errorToText(error: unknown): string {
  const plain = String(error)
  if (typeof error !== "object" || error === null || error instanceof Error) {
    return plain
  }

  let json: string | undefined
  try {
    json = JSON.stringify(error)
  } catch {
    // Circular structures cannot be serialized; fall back to String().
    json = undefined
  }
  if (!json || json === "{}") return plain

  // Keep a custom toString() result when the object provides one.
  return plain === "[object Object]" ? json : `${plain} ${json}`
}

/** Looks up a header by its lower-case name, ignoring the case of the stored key. */
function lookupHeader(headers: Record<string, string>, lowerName: string): string | undefined {
  const exact = headers[lowerName]
  if (exact) return exact
  for (const [key, value] of Object.entries(headers)) {
    if (value && key.toLowerCase() === lowerName) return value
  }
  return undefined
}

/**
 * Converts a retry hint into a non-negative number of seconds.
 *
 * Accepts an integer delay (parsed with `parseInt`, as before) or a date
 * string understood by `Date.parse`, which is then measured against the
 * current time. Returns undefined when neither form parses.
 */
function parseDelaySeconds(value: string): number | undefined {
  const seconds = parseInt(value, 10)
  if (!Number.isNaN(seconds)) return Math.max(0, seconds)

  const when = Date.parse(value)
  if (!Number.isNaN(when)) {
    return Math.max(0, Math.ceil((when - Date.now()) / 1000))
  }
  return undefined
}

export class ErrorDiagnoser {
  /**
   * Classifies a provider failure into a recovery action.
   *
   * A parsable `retry-after` header (or, failing that, `x-ratelimit-reset`)
   * takes precedence and yields COOLING with an explicit `cooldownMs`.
   * Otherwise the error text is tested against PATTERNS in order, and anything
   * unmatched falls back to RETRY.
   *
   * NOTE(review): `x-ratelimit-reset` is treated as a delay in seconds, exactly
   * as before. Confirm that no provider in use sends an absolute timestamp here.
   *
   * @param error - The raw error: a string, an Error, or a payload object.
   * @param headers - Optional response headers; keys are matched case-insensitively.
   * @returns The action to take, a human-readable reason, and optionally a cooldown.
   */
  static diagnose(error: unknown, headers?: Record<string, string>): DiagnoseResult {
    if (headers) {
      const hint = lookupHeader(headers, "retry-after") || lookupHeader(headers, "x-ratelimit-reset")
      if (hint) {
        const seconds = parseDelaySeconds(hint)
        if (seconds !== undefined) {
          return {
            action: "COOLING",
            reason: `Retry-After header: ${seconds}s`,
            cooldownMs: seconds * 1000
          }
        }
      }
    }

    const errorStr = errorToText(error)
    const matched = PATTERNS.find(pattern => pattern.regex.test(errorStr))
    if (matched) {
      return {
        action: matched.action,
        reason: `Matched pattern: ${matched.type}`
      }
    }

    return {
      action: "RETRY",
      reason: "Unknown error, default retry"
    }
  }
}
// ---- src/features/failover/resolver.ts ----
// In the split-file layout this module imports `ModelChain` from "./types".

/**
 * Turns a model setting into a primary model plus ordered fallbacks.
 *
 * Accepts either an array of model ids or a single string in which ids are
 * separated by `|`. Every entry is split on `|`, trimmed, and dropped when
 * empty, so `" a/x "`, `["a/x", ""]` and `"a/x |"` all resolve to a clean
 * `a/x` instead of a blank or whitespace-padded id.
 *
 * @param modelConfig - A model id, a pipe-separated chain, or an array of ids.
 * @returns The chain, or null when no usable model id is present.
 */
export function resolveModelChain(modelConfig?: string | string[]): ModelChain | null {
  if (!modelConfig) return null

  const entries = Array.isArray(modelConfig) ? modelConfig : [modelConfig]
  const models = entries
    .flatMap(entry => (typeof entry === "string" ? entry.split("|") : []))
    .map(entry => entry.trim())
    .filter(Boolean)

  if (models.length === 0) return null

  return {
    primary: models[0],
    fallbacks: models.slice(1)
  }
}

// ---- src/features/failover/status-manager.ts ----
// In the split-file layout this module imports `ProviderState` and
// `ProviderStatus` from "./types".

/**
 * Process-wide registry of per-model health, keyed by the model string that
 * callers pass in. Models without an entry are treated as HEALTHY.
 */
export class ProviderStatusManager {
  private static instance: ProviderStatusManager | undefined
  private states = new Map<string, ProviderState>()

  private constructor() {}

  /** Returns the shared instance, creating it on first use. */
  static getInstance(): ProviderStatusManager {
    if (!ProviderStatusManager.instance) {
      ProviderStatusManager.instance = new ProviderStatusManager()
    }
    return ProviderStatusManager.instance
  }

  /** Returns the raw stored state for a model, or undefined when none is recorded. */
  getState(model: string): ProviderState | undefined {
    return this.states.get(model)
  }

  /**
   * Returns the effective status of a model.
   *
   * A stored COOLING entry whose `resumeAt` has passed is reported as
   * PROBATION. The stored entry itself is not modified.
   */
  getStatus(model: string): ProviderStatus {
    const state = this.states.get(model)
    if (!state) return "HEALTHY"

    if (state.status === "COOLING") {
      return Date.now() >= state.resumeAt ? "PROBATION" : "COOLING"
    }

    return state.status
  }

  /** True when the model may receive a request, i.e. it is HEALTHY or PROBATION. */
  isAvailable(model: string): boolean {
    const status = this.getStatus(model)
    return status === "HEALTHY" || status === "PROBATION"
  }

  /**
   * Marks a model as cooling for `durationMs` and increments its retry count.
   *
   * A NaN duration is treated as 0. Otherwise `resumeAt` would be NaN,
   * `Date.now() >= NaN` would never hold, and the model would stay in COOLING
   * forever.
   */
  markCooling(model: string, durationMs: number, reason: string) {
    const current = this.states.get(model)
    const safeDuration = Number.isNaN(durationMs) ? 0 : durationMs
    this.states.set(model, {
      status: "COOLING",
      resumeAt: Date.now() + safeDuration,
      reason,
      retryCount: (current?.retryCount ?? 0) + 1
    })
  }

  /** Marks a model as locked with no resume time and increments its retry count. */
  markLocked(model: string, reason: string) {
    const current = this.states.get(model)
    this.states.set(model, {
      status: "LOCKED",
      resumeAt: Infinity,
      reason,
      retryCount: (current?.retryCount ?? 0) + 1
    })
  }

  /** Removes any recorded state, which makes the model HEALTHY again. */
  markHealthy(model: string) {
    this.states.delete(model)
  }

  /** Removes the recorded state of every model. */
  reset() {
    this.states.clear()
  }
}

// ---- src/features/failover/types.ts ----

/** Failover strategy names accepted in configuration. */
export type FailoverStrategy = 'auto' | 'aggressive' | 'conservative';

/** Effective health of a model as reported by ProviderStatusManager. */
export type ProviderStatus = 'HEALTHY' | 'COOLING' | 'LOCKED' | 'PROBATION';

/** Stored health record for one model. */
export interface ProviderState {
  status: ProviderStatus;
  /** Timestamp in `Date.now()` milliseconds; Infinity for LOCKED entries. */
  resumeAt: number;
  reason?: string;
  /** Number of times the model has been marked cooling or locked. */
  retryCount: number;
}

/**
 * Failover settings.
 *
 * NOTE(review): the config schema names its cooldown field `cooldown_ms`,
 * while this interface uses `cooldown`. Confirm which one consumers read.
 */
export interface FailoverConfig {
  enabled: boolean;
  strategy: FailoverStrategy;
  cooldown?: number;
}

/** Action chosen by ErrorDiagnoser for a failed request. */
export type RecoveryAction = 'COOLING' | 'LOCKED' | 'RETRY' | 'SKIP';

/** Result of diagnosing an error. */
export interface DiagnoseResult {
  action: RecoveryAction;
  reason: string;
  /** Explicit cooldown in milliseconds, when one could be determined. */
  cooldownMs?: number;
}

/** A primary model followed by its ordered fallbacks. */
export interface ModelChain {
  primary: string;
  fallbacks: string[];
}
ProviderStatusManager } from "../../features/failover/status-manager" +import type { PluginInput } from "@opencode-ai/plugin" +import type { OhMyOpenCodeConfig } from "../../config" +import type { ModelCacheState } from "../../plugin-state" + +describe("smart-failover hook", () => { + let ctx: PluginInput + let config: OhMyOpenCodeConfig + let statusManager: ProviderStatusManager + let modelCacheState: ModelCacheState + + beforeEach(() => { + ProviderStatusManager.getInstance().reset() + statusManager = ProviderStatusManager.getInstance() + + ctx = { + client: { + tui: { + showToast: mock(() => Promise.resolve()) + }, + session: { + abort: mock(() => Promise.resolve()), + prompt: mock(() => Promise.resolve()) + } + } + } as unknown as PluginInput + + config = { + model: "primary/model", + agents: { + Sisyphus: { + model: "primary/model | fallback/model" + } + } + } + + modelCacheState = { + modelContextLimitsCache: new Map(), + anthropicContext1MEnabled: false + } + }) + + test("chat.message should allow healthy primary", async () => { + const hook = createSmartFailoverHook(ctx, config, modelCacheState) + const output = { message: {} } as any + + await hook["chat.message"]( + { sessionID: "ses-1", agent: "Sisyphus", model: { providerID: "primary", modelID: "model" } }, + output + ) + + expect(output.message.model).toBeUndefined() + }) + + test("chat.message should swap cooling primary", async () => { + statusManager.markCooling("primary/model", 10000, "test") + + const hook = createSmartFailoverHook(ctx, config, modelCacheState) + const output = { message: {} } as any + + await hook["chat.message"]( + { sessionID: "ses-1", agent: "Sisyphus", model: { providerID: "primary", modelID: "model" } }, + output + ) + + expect(output.message.model).toEqual({ providerID: "fallback", modelID: "model" }) + expect(ctx.client.tui.showToast).toHaveBeenCalled() + }) + + test("session.error should mark model cooling with backoff", async () => { + const hook = 
createSmartFailoverHook(ctx, config, modelCacheState) + const output = { message: {} } as any + + await hook["chat.message"]( + { sessionID: "ses-1", agent: "Sisyphus", model: { providerID: "primary", modelID: "model" } }, + output + ) + + await hook.event({ + event: { + type: "session.error", + properties: { + sessionID: "ses-1", + error: "Rate limit reached" + } + } + }) + + const state1 = statusManager.getState("primary/model") + expect(state1?.status).toBe("COOLING") + expect(state1?.retryCount).toBe(1) + + expect(ctx.client.session.abort).toHaveBeenCalled() + }) + + test("session.idle should recover probation model", async () => { + statusManager.markCooling("primary/model", -1000, "test") + + const hook = createSmartFailoverHook(ctx, config, modelCacheState) + const output = { message: {} } as any + + await hook["chat.message"]( + { sessionID: "ses-1", agent: "Sisyphus", model: { providerID: "primary", modelID: "model" } }, + output + ) + + expect(statusManager.getStatus("primary/model")).toBe("PROBATION") + + await hook.event({ + event: { + type: "session.idle", + properties: { + sessionID: "ses-1" + } + } + }) + + expect(statusManager.getStatus("primary/model")).toBe("HEALTHY") + }) +}) diff --git a/src/hooks/smart-failover/index.ts b/src/hooks/smart-failover/index.ts new file mode 100644 index 0000000000..b3e1da7ac9 --- /dev/null +++ b/src/hooks/smart-failover/index.ts @@ -0,0 +1,243 @@ +import type { PluginInput } from "@opencode-ai/plugin" +import type { OhMyOpenCodeConfig } from "../../config" +import { ProviderStatusManager } from "../../features/failover/status-manager" +import { ErrorDiagnoser } from "../../features/failover/diagnoser" +import { resolveModelChain } from "../../features/failover/resolver" +import { log } from "../../shared" +import type { ModelCacheState } from "../../plugin-state" + +// Store both model key and agent name for context during events +const sessionContext = new Map() +const toastedSessions = new Set() +const 
pendingFailovers = new Set()

/**
 * Creates the smart-failover hook.
 *
 * The returned object exposes two handlers:
 * - `"chat.message"`: records the session's model/agent and, when the status manager reports the
 *   requested model as unavailable, rewrites `output.message.model` to the first usable fallback.
 * - `"event"`: handles `session.deleted` (drops per-session state), `session.idle` (marks a
 *   PROBATION model healthy), `session.status` of type `retry` (cools the model and fails over)
 *   and `session.error` (diagnoses the error, then cools or locks the model and fails over).
 *
 * @param ctx - Plugin input; only `ctx.client` (session abort/prompt and TUI toasts) is used.
 * @param config - Plugin config; the fallback chain is read from `agents[name].model`, then
 *   `model`, then the "Sisyphus" agent's `model`.
 * @param modelCacheState - Supplies `modelContextLimitsCache`, used to skip fallbacks whose
 *   context window is less than half of the failing model's.
 * @returns The hook handlers, keyed by hook name.
 */
export function createSmartFailoverHook(
  ctx: PluginInput,
  config: OhMyOpenCodeConfig,
  modelCacheState: ModelCacheState
) {
  // Tunables are named here so the call sites below read as intent instead of magic numbers.
  const DEFAULT_AGENT = "Sisyphus"
  const TOAST_DELAY_MS = 1500
  const FAILOVER_TOAST_DURATION_MS = 5000
  const SWAP_TOAST_DURATION_MS = 10000
  const PROVIDER_LOCKED_TOAST_DURATION_MS = 6000
  const SESSION_PROMPT_INITIAL_DELAY_MS = 500
  const SESSION_PROMPT_BUSY_RETRY_DELAY_MS = 300
  const SESSION_PROMPT_MAX_RETRIES = 5
  const RETRY_LOOP_COOLDOWN_MS = 300000
  const DEFAULT_COOLDOWN_MS = 300000
  const MAX_BACKOFF_EXPONENT = 14
  // Upper bound (6 h) for a single cooldown. Without it, 2^14 x 5 min is roughly 57 days.
  const MAX_COOLDOWN_MS = 21600000
  const CONTEXT_WINDOW_MIN_RATIO = 0.5

  const statusManager = ProviderStatusManager.getInstance()

  /**
   * Splits a "provider/model" key on the FIRST slash only, so model IDs that themselves contain
   * slashes (e.g. "openrouter/anthropic/claude-sonnet-4") keep their full model part.
   * Returns undefined when either half would be empty.
   */
  const parseModelKey = (modelKey: string): { providerID: string; modelID: string } | undefined => {
    const slash = modelKey.indexOf("/")
    if (slash <= 0 || slash === modelKey.length - 1) return undefined
    return { providerID: modelKey.slice(0, slash), modelID: modelKey.slice(slash + 1) }
  }

  /**
   * Renders an unknown error as searchable text. Identical to `String(error)` except when that
   * would be the uninformative "[object Object]", in which case the JSON form is used instead.
   * NOTE(review): assumes plain-object error payloads are JSON-serialisable; on failure the
   * `String()` form is returned unchanged.
   */
  const describeError = (error: unknown): string => {
    const text = String(error)
    if (text !== "[object Object]") return text
    try {
      return JSON.stringify(error) ?? text
    } catch {
      return text
    }
  }

  /** Shows the "Failover Active" toast at most once per session (delayed, fire-and-forget). */
  const showFailoverToastOnce = (sessionID: string, from: string, to: string, duration: number) => {
    if (toastedSessions.has(sessionID)) return
    toastedSessions.add(sessionID)
    setTimeout(() => {
      ctx.client.tui.showToast({
        body: {
          title: "Failover Active",
          message: `⚠️ ${from} unavailable. Switched to ${to}.`,
          variant: "warning",
          duration
        }
      }).catch(() => {})
    }, TOAST_DELAY_MS)
  }

  /**
   * Picks the first fallback in the agent's model chain that is currently available and whose
   * known context window is not drastically smaller than the failing model's.
   */
  const findFallback = (agentName: string, currentModelKey: string, sessionID: string) => {
    const getModelConfig = (agent: string) => {
      const agents = config.agents
      if (!agents) return undefined
      return agents[agent as keyof typeof agents]?.model
    }

    let modelConfig = getModelConfig(agentName) ?? config.model

    // Agents without their own chain inherit the default agent's chain.
    if (!modelConfig && agentName !== DEFAULT_AGENT) {
      modelConfig = getModelConfig(DEFAULT_AGENT)
    }

    const chain = resolveModelChain(modelConfig as string | string[])
    if (!chain) return undefined

    // Loop-invariant: the failing model's limit does not change per candidate.
    const primaryLimit = modelCacheState.modelContextLimitsCache.get(currentModelKey)

    return chain.fallbacks.find(m => {
      if (!statusManager.isAvailable(m)) return false

      const fallbackLimit = modelCacheState.modelContextLimitsCache.get(m)
      if (primaryLimit && fallbackLimit && fallbackLimit < primaryLimit * CONTEXT_WINDOW_MIN_RATIO) {
        log(`[SmartFailover] Skipping fallback ${m} due to small context window (${fallbackLimit} < ${primaryLimit})`, { sessionID })
        return false
      }
      return true
    })
  }

  /**
   * Aborts the session's current run and re-prompts it on the next usable fallback model.
   * Only one failover per session may be in flight; the guard is released once the re-prompt
   * has been accepted or has definitively failed.
   *
   * @returns true when a failover was started, false when none was possible or one is pending.
   */
  const performFailover = async (sessionID: string, currentModelKey: string, agent: string, reason: string) => {
    if (pendingFailovers.has(sessionID)) return false
    pendingFailovers.add(sessionID)

    try {
      const fallback = findFallback(agent, currentModelKey, sessionID)
      const target = fallback ? parseModelKey(fallback) : undefined
      if (!fallback || !target) {
        pendingFailovers.delete(sessionID)
        return false
      }

      log(`[SmartFailover] Failover triggered: ${currentModelKey} -> ${fallback}. Reason: ${reason}`, { sessionID })
      showFailoverToastOnce(sessionID, currentModelKey, fallback, FAILOVER_TOAST_DURATION_MS)

      sessionContext.set(sessionID, { modelKey: fallback, agent })

      // Best effort: the run may already have stopped by the time the abort arrives.
      await ctx.client.session.abort({ path: { id: sessionID } }).catch(() => {})

      let retryAttempt = 0
      const checkAndPrompt = async () => {
        try {
          await ctx.client.session.prompt({
            path: { id: sessionID },
            body: {
              model: target,
              agent,
              parts: [{ type: "text", text: "System: Previous model failed. Please continue from exactly where you left off." }]
            }
          })
          pendingFailovers.delete(sessionID)
        } catch (e: unknown) {
          // The session can still be winding down after the abort; retry a bounded number of times.
          if (describeError(e).includes("busy") && retryAttempt < SESSION_PROMPT_MAX_RETRIES) {
            retryAttempt++
            setTimeout(checkAndPrompt, SESSION_PROMPT_BUSY_RETRY_DELAY_MS)
          } else {
            log("[SmartFailover] Retry prompt failed", e)
            pendingFailovers.delete(sessionID)
          }
        }
      }

      setTimeout(checkAndPrompt, SESSION_PROMPT_INITIAL_DELAY_MS)
      return true
    } catch (e) {
      pendingFailovers.delete(sessionID)
      throw e
    }
  }

  return {
    "chat.message": async (
      input: {
        sessionID: string
        agent?: string
        model?: { providerID: string; modelID: string }
      },
      output: {
        message: Record<string, unknown>
      }
    ) => {
      if (!input.model) return
      const currentModelKey = `${input.model.providerID}/${input.model.modelID}`

      const agentName = input.agent || DEFAULT_AGENT
      sessionContext.set(input.sessionID, { modelKey: currentModelKey, agent: agentName })

      if (statusManager.isAvailable(currentModelKey)) return

      const fallback = findFallback(agentName, currentModelKey, input.sessionID)
      const target = fallback ? parseModelKey(fallback) : undefined
      if (!fallback || !target || !output.message) return

      output.message.model = target
      sessionContext.set(input.sessionID, { modelKey: fallback, agent: agentName })

      log(`[SmartFailover] Swapped ${currentModelKey} -> ${fallback}`, { sessionID: input.sessionID })
      showFailoverToastOnce(input.sessionID, currentModelKey, fallback, SWAP_TOAST_DURATION_MS)
    },

    "event": async (input: { event: { type: string; properties?: unknown } }) => {
      const { type, properties } = input.event

      if (type === "session.deleted") {
        const id = (properties as { info?: { id?: string } } | undefined)?.info?.id
        if (id) {
          sessionContext.delete(id)
          toastedSessions.delete(id)
          pendingFailovers.delete(id)
        }
        return
      }

      if (type === "session.idle") {
        const sessionID = (properties as { sessionID?: string } | undefined)?.sessionID
        if (!sessionID) return

        // A run that reached idle on a PROBATION model counts as a successful probe.
        const sessionCtx = sessionContext.get(sessionID)
        if (sessionCtx && statusManager.getStatus(sessionCtx.modelKey) === "PROBATION") {
          statusManager.markHealthy(sessionCtx.modelKey)
          log(`[SmartFailover] ${sessionCtx.modelKey} recovered from PROBATION`, { sessionID })
        }
        return
      }

      if (type === "session.status") {
        // `properties` is untyped, so every level is optional; a throw here would abort the
        // caller's remaining event handling.
        const props = properties as
          | { status?: { type?: string; message?: string }; sessionID?: string }
          | undefined
        if (props?.status?.type !== "retry" || !props.sessionID) return

        const sessionID = props.sessionID
        const sessionCtx = sessionContext.get(sessionID)
        if (sessionCtx && statusManager.getStatus(sessionCtx.modelKey) !== "COOLING") {
          const reason = props.status.message || "Retry loop detected"
          statusManager.markCooling(sessionCtx.modelKey, RETRY_LOOP_COOLDOWN_MS, reason)
          await performFailover(sessionID, sessionCtx.modelKey, sessionCtx.agent, reason)
        }
        return
      }

      if (type === "session.error") {
        const props = properties as { error?: unknown; sessionID?: string } | undefined
        const sessionID = props?.sessionID
        if (!sessionID) return

        // NOTE(review): presumably the host can deliver structured error objects here - confirm
        // against the SDK event types. `describeError` keeps those searchable; for strings and
        // Error instances it is identical to `String(error)`.
        const errorText = describeError(props?.error)

        // Aborts include the ones issued by performFailover itself; they must not cool a model.
        if (errorText.includes("AbortError") || errorText.includes("Aborted")) return

        const sessionCtx = sessionContext.get(sessionID)
        if (!sessionCtx) return

        const result = ErrorDiagnoser.diagnose(errorText)

        if (result.action === "COOLING") {
          const retryCount = statusManager.getState(sessionCtx.modelKey)?.retryCount ?? 0
          const backoffMultiplier = Math.pow(2, Math.min(retryCount, MAX_BACKOFF_EXPONENT))
          const duration = Math.min(
            (result.cooldownMs ?? DEFAULT_COOLDOWN_MS) * backoffMultiplier,
            MAX_COOLDOWN_MS
          )

          statusManager.markCooling(sessionCtx.modelKey, duration, result.reason)
          await performFailover(sessionID, sessionCtx.modelKey, sessionCtx.agent, result.reason)
        } else if (result.action === "LOCKED") {
          statusManager.markLocked(sessionCtx.modelKey, result.reason)
          await performFailover(sessionID, sessionCtx.modelKey, sessionCtx.agent, result.reason)

          ctx.client.tui.showToast({
            body: {
              title: "Provider Locked",
              message: `🛑 ${sessionCtx.modelKey} locked (Balance/Quota). Update config to reset.`,
              variant: "error",
              duration: PROVIDER_LOCKED_TOAST_DURATION_MS
            }
          }).catch(() => {})
        }
      }
    }
  }
}
configModel[0] : configModel + + if (!systemDefaultModel?.trim()) { const paths = getOpenCodeConfigPaths({ binary: "opencode", version: null }) throw new Error( 'oh-my-opencode requires a default model.\n\n' + @@ -120,7 +123,7 @@ export function createConfigHandler(deps: ConfigHandlerDeps) { migratedDisabledAgents, pluginConfig.agents, ctx.directory, - config.model as string | undefined, + systemDefaultModel, pluginConfig.categories, pluginConfig.git_master ); @@ -174,7 +177,7 @@ export function createConfigHandler(deps: ConfigHandlerDeps) { agentConfig["Sisyphus-Junior"] = createSisyphusJuniorAgentWithOverrides( pluginConfig.agents?.["Sisyphus-Junior"], - config.model as string | undefined + systemDefaultModel ); if (builderEnabled) { @@ -205,7 +208,7 @@ export function createConfigHandler(deps: ConfigHandlerDeps) { pluginConfig.agents?.["Prometheus (Planner)"] as | (Record & { category?: string; model?: string }) | undefined; - const defaultModel = config.model as string | undefined; + const defaultModel = systemDefaultModel; // Resolve full category config (model, temperature, top_p, tools, etc.) // Apply all category properties when category is specified, but explicit diff --git a/src/tools/delegate-task/tools.ts b/src/tools/delegate-task/tools.ts index 854d1dacd8..69add39427 100644 --- a/src/tools/delegate-task/tools.ts +++ b/src/tools/delegate-task/tools.ts @@ -126,9 +126,12 @@ export function resolveCategoryConfig( // Model priority for categories: user override > category default > system default // Categories have explicit models - no inheritance from parent session + const userModel = userConfig?.model + const inheritedModelVal = defaultConfig?.model + const model = resolveModel({ - userModel: userConfig?.model, - inheritedModel: defaultConfig?.model, // Category's built-in model takes precedence over system default + userModel: Array.isArray(userModel) ? userModel[0] : userModel, + inheritedModel: Array.isArray(inheritedModelVal) ? 
inheritedModelVal[0] : inheritedModelVal, // Category's built-in model takes precedence over system default systemDefault: systemDefaultModel, }) const config: CategoryConfig = { From e44a4c0de22cb26b608dfe8bc445f5ce5a149be0 Mon Sep 17 00:00:00 2001 From: pipi-1997 <46177323+pipi-1997@users.noreply.github.com> Date: Thu, 22 Jan 2026 01:11:41 +0800 Subject: [PATCH 02/10] docs: clarify failover strategies and model array config --- docs/features/smart-failover.md | 27 ++++++++++++++++++++++++- docs/features/smart-failover.zh-CN.md | 29 +++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/docs/features/smart-failover.md b/docs/features/smart-failover.md index b7f6701ce6..6f29deb528 100644 --- a/docs/features/smart-failover.md +++ b/docs/features/smart-failover.md @@ -14,7 +14,15 @@ The Smart Failover system provides an automated detection and recovery mechanism - **Memory Safety**: Automatic cleanup upon session deletion. ## 3. Configuration -Modify the `model` field in your `oh-my-opencode.json`. +Smart Failover is enabled by defining a fallback chain in `model`. + +### 3.1 Model Fallback Chain +You can define the fallback chain using either: + +- **Pipe syntax** (string) +- **Array syntax** (string[]) + +Both forms are equivalent: the first entry is the primary model, and the rest are fallbacks. ### Example ```jsonc @@ -26,6 +34,23 @@ Modify the `model` field in your `oh-my-opencode.json`. } ``` +### Array Example +```jsonc +{ + "model": ["openai/gpt-5.2-codex", "google/gemini-3-pro"] +} +``` + +`model` can also be configured per-agent (e.g. `agents.Sisyphus.model`) and in category configs. Those locations also accept either pipe syntax or an array. + +### 3.2 Failover Strategy +`failover.strategy` currently does not change runtime behavior. It is accepted by the config schema for forward-compatibility. + +Current behavior matches an “auto” style flow: +- Detects retry loops and certain provider errors (e.g. 
429 / 5xx, quota/rate-limit signals). +- Moves providers into COOLING with exponential backoff, then PROBATION, and finally HEALTHY after a successful probe. +- Locks providers on balance/quota exhaustion signals and avoids them until reset. + ## 4. UI/UX - **Notification**: A yellow toast appears: `⚠️ Switched to google/gemini-3-pro`. - **Throttling**: Toasts are shown only once per session to prevent UI spam. diff --git a/docs/features/smart-failover.zh-CN.md b/docs/features/smart-failover.zh-CN.md index c6e71577bd..befbe3c6a6 100644 --- a/docs/features/smart-failover.zh-CN.md +++ b/docs/features/smart-failover.zh-CN.md @@ -14,18 +14,43 @@ Smart Failover 系统为 `oh-my-opencode` 引入了一套自动化的故障检 - **内存管理**: 随会话销毁自动清理缓存,无内存泄露风险。 ## 3. 使用方法 -在 `oh-my-opencode.json` 中配置 `model` 字段即可。 +Smart Failover 通过 `model` 定义“主模型 + 备用模型链”来启用。 + +### 3.1 模型备用链写法 +支持两种等价写法: + +- **管道符写法**(string) +- **数组写法**(string[]) + +两者语义一致:第一个是主模型,后续是备用模型。 ### 示例配置 ```jsonc { "model": "openai/gpt-5.2-codex | google/gemini-3-pro", "failover": { - "strategy": "auto" // 可选: auto (默认), aggressive, conservative + "strategy": "auto" } } ``` +### 数组示例 +```jsonc +{ + "model": ["openai/gpt-5.2-codex", "google/gemini-3-pro"] +} +``` + +`model` 也可以写在单个 agent(例如 `agents.Sisyphus.model`)或 category 配置里,上述两种写法同样支持。 + +### 3.2 策略说明 +`failover.strategy` 目前不会改变运行时行为,仅作为配置 schema 的前向兼容字段保留(后续可用于区分更激进/更保守的恢复策略)。 + +当前行为等同于 “auto 风格”: +- 识别 retry loop 与部分供应商错误(如 429 / 5xx、配额/限流信号)。 +- 进入 COOLING 并做指数退避,冷却结束进入 PROBATION,探测成功后恢复 HEALTHY。 +- 余额不足/配额耗尽等信号会进入 LOCKED,并避免继续使用直到重置。 + ## 4. 
故障状态说明 - **HEALTHY (健康)**: 正常使用。 - **COOLING (冷却中)**: 触发 429 或 5xx 错误,根据指数退避算法进入等待期。 From b71028a1e8c0bca536a334a0593b21c14db38e03 Mon Sep 17 00:00:00 2001 From: pipi-1997 <46177323+pipi-1997@users.noreply.github.com> Date: Thu, 22 Jan 2026 01:36:30 +0800 Subject: [PATCH 03/10] refactor(config): remove unused failover config --- assets/oh-my-opencode.schema.json | 330 ------------------------------ src/config/schema.ts | 10 - src/features/failover/types.ts | 8 - 3 files changed, 348 deletions(-) diff --git a/assets/oh-my-opencode.schema.json b/assets/oh-my-opencode.schema.json index 769f3c33cc..b4dbf1bc17 100644 --- a/assets/oh-my-opencode.schema.json +++ b/assets/oh-my-opencode.schema.json @@ -123,28 +123,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -281,28 +259,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -439,28 +395,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -597,28 +531,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - 
"auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -755,28 +667,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -913,28 +803,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -1071,28 +939,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -1229,28 +1075,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -1387,28 +1211,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -1545,28 +1347,6 @@ } ] }, - 
"failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -1703,28 +1483,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -1861,28 +1619,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -2019,28 +1755,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -2184,28 +1898,6 @@ } ] }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": "string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } - }, "variant": { "type": "string" }, @@ -2628,28 +2320,6 @@ "type": "boolean" } } - }, - "failover": { - "type": "object", - "properties": { - "enabled": { - "default": true, - "type": "boolean" - }, - "strategy": { - "default": "auto", - "type": 
"string", - "enum": [ - "auto", - "aggressive", - "conservative" - ] - }, - "cooldown_ms": { - "type": "number", - "minimum": 0 - } - } } } } \ No newline at end of file diff --git a/src/config/schema.ts b/src/config/schema.ts index 8b2009ea82..b4caff83cc 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -92,16 +92,9 @@ export const BuiltinCommandNameSchema = z.enum([ "start-work", ]) -export const FailoverConfigSchema = z.object({ - enabled: z.boolean().default(true), - strategy: z.enum(["auto", "aggressive", "conservative"]).default("auto"), - cooldown_ms: z.number().min(0).optional(), -}) - export const AgentOverrideConfigSchema = z.object({ /** @deprecated Use `category` instead. Model is inherited from category defaults. */ model: z.union([z.string(), z.array(z.string())]).optional(), - failover: FailoverConfigSchema.optional(), variant: z.string().optional(), /** Category name to inherit model and other settings from CategoryConfig */ category: z.string().optional(), @@ -157,7 +150,6 @@ export const SisyphusAgentConfigSchema = z.object({ export const CategoryConfigSchema = z.object({ model: z.union([z.string(), z.array(z.string())]).optional(), - failover: FailoverConfigSchema.optional(), variant: z.string().optional(), temperature: z.number().min(0).max(2).optional(), top_p: z.number().min(0).max(1).optional(), @@ -323,7 +315,6 @@ export const OhMyOpenCodeConfigSchema = z.object({ background_task: BackgroundTaskConfigSchema.optional(), notification: NotificationConfigSchema.optional(), git_master: GitMasterConfigSchema.optional(), - failover: FailoverConfigSchema.optional(), }) export type OhMyOpenCodeConfig = z.infer @@ -343,7 +334,6 @@ export type SkillDefinition = z.infer export type RalphLoopConfig = z.infer export type NotificationConfig = z.infer export type CategoryConfig = z.infer -export type FailoverConfig = z.infer export type CategoriesConfig = z.infer export type BuiltinCategoryName = z.infer export type GitMasterConfig = z.infer 
diff --git a/src/features/failover/types.ts b/src/features/failover/types.ts index 7e178c1737..38b5adce7c 100644 --- a/src/features/failover/types.ts +++ b/src/features/failover/types.ts @@ -1,5 +1,3 @@ -export type FailoverStrategy = 'auto' | 'aggressive' | 'conservative'; - export type ProviderStatus = 'HEALTHY' | 'COOLING' | 'LOCKED' | 'PROBATION'; export interface ProviderState { @@ -9,12 +7,6 @@ export interface ProviderState { retryCount: number; } -export interface FailoverConfig { - enabled: boolean; - strategy: FailoverStrategy; - cooldown?: number; -} - export type RecoveryAction = 'COOLING' | 'LOCKED' | 'RETRY' | 'SKIP'; export interface DiagnoseResult { From 140fa011d322ba336bb5a9e84ccd48fb352404cf Mon Sep 17 00:00:00 2001 From: pipi-1997 <46177323+pipi-1997@users.noreply.github.com> Date: Thu, 22 Jan 2026 01:36:39 +0800 Subject: [PATCH 04/10] docs: tighten smart failover docs --- docs/features/smart-failover.md | 26 +++++++++++------------ docs/features/smart-failover.zh-CN.md | 30 +++++++++++++-------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/docs/features/smart-failover.md b/docs/features/smart-failover.md index 6f29deb528..be00fb8add 100644 --- a/docs/features/smart-failover.md +++ b/docs/features/smart-failover.md @@ -6,11 +6,12 @@ The Smart Failover system provides an automated detection and recovery mechanism ## 2. Key Features - **Pipe Syntax (`|`)**: Minimalist fallback chain definitions. +- **Array Syntax (`string[]`)**: Equivalent to pipe syntax, easier to edit. - **Instant Failover**: Aborts OpenCode's internal retry loops to trigger immediate model swapping. -- **Error Diagnosis**: Parses `Retry-After` headers and vendor-specific error payloads. +- **Error Diagnosis (Best-Effort)**: Classifies common failures (rate-limit, quota, balance) via pattern matching. - **Guardrails**: - **Context Compatibility**: Skips fallbacks with insufficient context windows. 
- - **Half-Open Probation**: Tests cooling providers with a single probe before full recovery. + - **Probation Recovery**: After a cooldown elapses, a model becomes eligible again (PROBATION) and is cleared back to healthy after the session becomes idle. - **Memory Safety**: Automatic cleanup upon session deletion. ## 3. Configuration @@ -27,10 +28,7 @@ Both forms are equivalent: the first entry is the primary model, and the rest ar ### Example ```jsonc { - "model": "openai/gpt-5.2-codex | google/gemini-3-pro", - "failover": { - "strategy": "auto" - } + "model": "openai/gpt-5.2-codex | google/gemini-3-pro" } ``` @@ -43,14 +41,16 @@ Both forms are equivalent: the first entry is the primary model, and the rest ar `model` can also be configured per-agent (e.g. `agents.Sisyphus.model`) and in category configs. Those locations also accept either pipe syntax or an array. -### 3.2 Failover Strategy -`failover.strategy` currently does not change runtime behavior. It is accepted by the config schema for forward-compatibility. +## 4. Default Behavior +- **Triggers**: Retry-loop detection (`session.status: retry`) and certain session errors (`session.error`) mark the current `provider/model` as unavailable and switch to the next available fallback. +- **Cooling + Backoff**: A cooling period is applied with exponential backoff based on repeated failures. +- **Locking**: Balance/quota exhaustion signals lock a specific `provider/model` pair (model key) until reset. +- **Fallback Selection**: Only HEALTHY/PROBATION models are eligible; fallbacks with too-small context windows are skipped. -Current behavior matches an “auto” style flow: -- Detects retry loops and certain provider errors (e.g. 429 / 5xx, quota/rate-limit signals). -- Moves providers into COOLING with exponential backoff, then PROBATION, and finally HEALTHY after a successful probe. -- Locks providers on balance/quota exhaustion signals and avoids them until reset. +## 5. 
Limitations +- **Retry-After**: The implementation does not reliably receive response headers in events, so header-based cooldown is best-effort. +- **Probation**: Recovery is approximated by the session becoming idle, not a dedicated health-check request. -## 4. UI/UX +## 6. UI/UX - **Notification**: A yellow toast appears: `⚠️ Switched to google/gemini-3-pro`. - **Throttling**: Toasts are shown only once per session to prevent UI spam. diff --git a/docs/features/smart-failover.zh-CN.md b/docs/features/smart-failover.zh-CN.md index befbe3c6a6..b7c52e9970 100644 --- a/docs/features/smart-failover.zh-CN.md +++ b/docs/features/smart-failover.zh-CN.md @@ -6,11 +6,12 @@ Smart Failover 系统为 `oh-my-opencode` 引入了一套自动化的故障检 ## 2. 核心特性 - **管道符配置 (`|`)**: 极简的备选链定义方式。 +- **数组配置(string[])**: 与管道符等价,更易维护。 - **秒级无感切换**: 自动强行终止 OpenCode 原生的卡顿重试循环,秒切备用线路。 -- **工业级错误诊断**: 智能解析 `Retry-After` 头部及各种供应商特有的错误语义。 +- **错误诊断(Best-Effort)**: 主要通过模式匹配识别常见失败原因(限流、配额、余额等)。 - **安全防护栏**: - **上下文窗口对齐**: 自动跳过窗口过小的备用模型。 - - **半开探测 (Half-Open)**: 冷却结束后先进行单次探测,防止盲目恢复。 + - **PROBATION 恢复**: 冷却结束后模型进入 PROBATION,可再次被选用;会话进入 idle 后清理回健康状态。 - **内存管理**: 随会话销毁自动清理缓存,无内存泄露风险。 ## 3. 使用方法 @@ -27,10 +28,7 @@ Smart Failover 通过 `model` 定义“主模型 + 备用模型链”来启用 ### 示例配置 ```jsonc { - "model": "openai/gpt-5.2-codex | google/gemini-3-pro", - "failover": { - "strategy": "auto" - } + "model": "openai/gpt-5.2-codex | google/gemini-3-pro" } ``` @@ -43,20 +41,22 @@ Smart Failover 通过 `model` 定义“主模型 + 备用模型链”来启用 `model` 也可以写在单个 agent(例如 `agents.Sisyphus.model`)或 category 配置里,上述两种写法同样支持。 -### 3.2 策略说明 -`failover.strategy` 目前不会改变运行时行为,仅作为配置 schema 的前向兼容字段保留(后续可用于区分更激进/更保守的恢复策略)。 +## 4. 
默认行为说明 +- **触发条件**:检测到 retry loop(`session.status: retry`)或部分会话错误(`session.error`)后,会把当前 `provider/model` 标记为不可用,并切换到下一个可用的备用模型。 +- **冷却与退避**:进入 COOLING 并按失败次数做指数退避。 +- **锁定**:余额不足/配额耗尽等信号会锁定特定的 `provider/model` 组合(modelKey),直到重置。 +- **fallback 选择**:只会选择 HEALTHY/PROBATION 的模型;上下文窗口过小的 fallback 会被跳过。 -当前行为等同于 “auto 风格”: -- 识别 retry loop 与部分供应商错误(如 429 / 5xx、配额/限流信号)。 -- 进入 COOLING 并做指数退避,冷却结束进入 PROBATION,探测成功后恢复 HEALTHY。 -- 余额不足/配额耗尽等信号会进入 LOCKED,并避免继续使用直到重置。 +## 5. 限制说明 +- **Retry-After**:事件里不一定能拿到响应头,因此基于 header 的冷却时间属于 best-effort。 +- **PROBATION**:当前以会话进入 idle 作为“恢复健康”的近似信号,并非专门的 health-check 请求闭环。 -## 4. 故障状态说明 +## 6. 故障状态说明 - **HEALTHY (健康)**: 正常使用。 - **COOLING (冷却中)**: 触发 429 或 5xx 错误,根据指数退避算法进入等待期。 - **LOCKED (锁定)**: 触发余额不足或配额耗尽,除非重启或修改配置,否则不再尝试。 -- **PROBATION (试用期)**: 冷却结束,允许一个请求尝试“探路”,成功则转为健康。 +- **PROBATION (试用期)**: 冷却结束后可再次被选用;会话进入 idle 后清理回健康状态。 -## 5. UI 交互 +## 7. UI 交互 - **故障提示**: 切换时会弹出黄色 Toast,内容如 `⚠️ Switched to google/gemini-3-pro`。 - **静默机制**: 每个会话仅提示一次,后续切换保持静默,不干扰工作。 From aaeb7c18a853ed7e6c7a45ce1f7a5b70c602fcd8 Mon Sep 17 00:00:00 2001 From: pipi-1997 <46177323+pipi-1997@users.noreply.github.com> Date: Thu, 22 Jan 2026 01:36:49 +0800 Subject: [PATCH 05/10] fix(failover): parse rate limit reset correctly --- src/features/failover/diagnoser.test.ts | 42 +++++++++++++++++++ src/features/failover/diagnoser.ts | 55 ++++++++++++++++++++----- 2 files changed, 87 insertions(+), 10 deletions(-) create mode 100644 src/features/failover/diagnoser.test.ts diff --git a/src/features/failover/diagnoser.test.ts b/src/features/failover/diagnoser.test.ts new file mode 100644 index 0000000000..84843f2de4 --- /dev/null +++ b/src/features/failover/diagnoser.test.ts @@ -0,0 +1,42 @@ +import { describe, expect, test } from "bun:test" +import { ErrorDiagnoser } from "./diagnoser" + +describe("ErrorDiagnoser header parsing", () => { + test("Retry-After seconds", () => { + const res = ErrorDiagnoser.diagnose("any", { "retry-after": "120" }) + 
expect(res.action).toBe("COOLING") + expect(res.cooldownMs).toBe(120_000) + }) + + test("Retry-After HTTP-date", () => { + const target = new Date(Date.now() + 60_000).toUTCString() + const res = ErrorDiagnoser.diagnose("any", { "retry-after": target }) + expect(res.action).toBe("COOLING") + expect(res.cooldownMs ?? 0).toBeGreaterThan(40_000) + expect(res.cooldownMs ?? 0).toBeLessThan(80_000) + }) + + test("x-ratelimit-reset epoch seconds", () => { + const epochSeconds = Math.floor((Date.now() + 90_000) / 1000) + const res = ErrorDiagnoser.diagnose("any", { "x-ratelimit-reset": String(epochSeconds) }) + expect(res.action).toBe("COOLING") + expect(res.cooldownMs ?? 0).toBeGreaterThan(70_000) + expect(res.cooldownMs ?? 0).toBeLessThan(120_000) + }) + + test("x-ratelimit-reset delta seconds", () => { + const res = ErrorDiagnoser.diagnose("any", { "x-ratelimit-reset": "45" }) + expect(res.action).toBe("COOLING") + expect(res.cooldownMs ?? 0).toBeGreaterThan(40_000) + expect(res.cooldownMs ?? 0).toBeLessThan(60_000) + }) + + test("x-ratelimit-reset epoch milliseconds", () => { + const epochMs = Date.now() + 80_000 + const res = ErrorDiagnoser.diagnose("any", { "x-ratelimit-reset": String(epochMs) }) + expect(res.action).toBe("COOLING") + expect(res.cooldownMs ?? 0).toBeGreaterThan(60_000) + expect(res.cooldownMs ?? 
/**
 * Converts a rate-limit header value into a wait time in milliseconds.
 *
 * Numeric values:
 * - `retry-after`: always a number of seconds (fractions allowed), returned as-is in ms.
 * - `x-ratelimit-reset`: >= 1e12 is read as an epoch in milliseconds, >= 1e9 as an epoch in
 *   seconds, anything smaller as a number of seconds from now.
 *
 * Non-numeric values are handed to `Date.parse` and treated as an absolute point in time.
 *
 * @param value - Raw header value.
 * @param headerName - Which header the value came from; selects the numeric interpretation.
 * @returns The wait in milliseconds, or null when the value is blank, unparseable, or refers
 *   to a point in time that is not in the future.
 */
function parseRetryAfterMs(value: string, headerName: "retry-after" | "x-ratelimit-reset"): number | null {
  const raw = value.trim()
  if (raw.length === 0) return null

  const nowMs = Date.now()

  // Absolute targets only count when they are strictly in the future.
  const msUntil = (targetMs: number): number | null => {
    const remaining = targetMs - nowMs
    return remaining > 0 ? remaining : null
  }

  const isPlainNumber = /^\d+(\.\d+)?$/.test(raw)
  if (!isPlainNumber) {
    const parsedDateMs = Date.parse(raw)
    return Number.isNaN(parsedDateMs) ? null : msUntil(parsedDateMs)
  }

  const amount = Number(raw)
  if (!Number.isFinite(amount) || amount < 0) return null

  // Retry-After is a plain delay, so no comparison against the clock is needed.
  if (headerName === "retry-after") {
    return Math.round(amount * 1000)
  }

  // The magnitude decides the unit: epoch milliseconds, epoch seconds, or a relative delay.
  if (amount >= 1_000_000_000_000) {
    return msUntil(Math.round(amount))
  }
  if (amount >= 1_000_000_000) {
    return msUntil(Math.round(amount * 1000))
  }
  return msUntil(nowMs + Math.round(amount * 1000))
}
parseRetryAfterMs(retryAfter, "retry-after") : null + if (retryAfterMs !== null) { + return { action: "COOLING", reason: `Retry-After header`, cooldownMs: retryAfterMs } + } + + const resetMs = rateLimitReset ? parseRetryAfterMs(rateLimitReset, "x-ratelimit-reset") : null + if (resetMs !== null) { + return { action: "COOLING", reason: `x-ratelimit-reset header`, cooldownMs: resetMs } } } From e3e16b1ff66789ab8373c5275a226d5b2e4262fe Mon Sep 17 00:00:00 2001 From: pipi-1997 <46177323+pipi-1997@users.noreply.github.com> Date: Thu, 22 Jan 2026 01:36:59 +0800 Subject: [PATCH 06/10] test(smart-failover): await toast scheduling --- src/hooks/smart-failover/index.test.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hooks/smart-failover/index.test.ts b/src/hooks/smart-failover/index.test.ts index 93035c63fb..5b689c68fc 100644 --- a/src/hooks/smart-failover/index.test.ts +++ b/src/hooks/smart-failover/index.test.ts @@ -66,6 +66,7 @@ describe("smart-failover hook", () => { ) expect(output.message.model).toEqual({ providerID: "fallback", modelID: "model" }) + await new Promise(resolve => setTimeout(resolve, 1600)) expect(ctx.client.tui.showToast).toHaveBeenCalled() }) From e806b630d2cdc27341d1966086a9182e5aeaa880 Mon Sep 17 00:00:00 2001 From: pipi-1997 <46177323+pipi-1997@users.noreply.github.com> Date: Thu, 22 Jan 2026 08:31:59 +0800 Subject: [PATCH 07/10] refactor(smart-failover): address copilot review --- src/features/failover/diagnoser.ts | 25 +++++++-- src/features/failover/resolver.ts | 7 +-- src/hooks/smart-failover/index.ts | 76 +++++++++++++++++++-------- src/plugin-handlers/config-handler.ts | 6 ++- 4 files changed, 82 insertions(+), 32 deletions(-) diff --git a/src/features/failover/diagnoser.ts b/src/features/failover/diagnoser.ts index e1a20f0bd5..d10bdb32d1 100644 --- a/src/features/failover/diagnoser.ts +++ b/src/features/failover/diagnoser.ts @@ -10,10 +10,27 @@ const PATTERNS: Array<{ regex: RegExp; action: RecoveryAction; type: string }> = { 
regex: /503/i, action: "COOLING", type: "server_error" }, { regex: /502/i, action: "COOLING", type: "server_error" }, { regex: /500/i, action: "COOLING", type: "server_error" }, - { regex: /unavailable/i, action: "COOLING", type: "availability" }, - { regex: /not found/i, action: "COOLING", type: "availability" }, - { regex: /does not exist/i, action: "COOLING", type: "availability" }, - { regex: /unsupported/i, action: "COOLING", type: "availability" }, + { regex: /\bservice unavailable\b/i, action: "COOLING", type: "server_error" }, + { + regex: /(?:model|deployment|endpoint|provider)\b[\s\S]{0,40}\bunavailable\b/i, + action: "COOLING", + type: "availability", + }, + { + regex: /(?:model|deployment|endpoint|provider)\b[\s\S]{0,40}\bnot found/i, + action: "COOLING", + type: "availability", + }, + { + regex: /(?:model|deployment|endpoint|provider)\b[\s\S]{0,40}\bdoes not exist/i, + action: "COOLING", + type: "availability", + }, + { + regex: /(?:model|deployment|endpoint|provider)\b[\s\S]{0,40}\bunsupported/i, + action: "COOLING", + type: "availability", + }, { regex: /context length/i, action: "SKIP", type: "context_length" }, { regex: /maximum context/i, action: "SKIP", type: "context_length" }, { regex: /token limit/i, action: "SKIP", type: "context_length" }, diff --git a/src/features/failover/resolver.ts b/src/features/failover/resolver.ts index e8c8fbd9f5..0565d7fca8 100644 --- a/src/features/failover/resolver.ts +++ b/src/features/failover/resolver.ts @@ -6,12 +6,13 @@ export function resolveModelChain(modelConfig?: string | string[]): ModelChain | let models: string[] = [] if (Array.isArray(modelConfig)) { - models = modelConfig + models = modelConfig.map(m => m.trim()).filter(m => m.length > 0) } else if (typeof modelConfig === "string") { if (modelConfig.includes("|")) { - models = modelConfig.split("|").map(m => m.trim()).filter(Boolean) + models = modelConfig.split("|").map(m => m.trim()).filter(m => m.length > 0) } else { - models = [modelConfig] + 
const trimmed = modelConfig.trim() + models = trimmed.length > 0 ? [trimmed] : [] } } diff --git a/src/hooks/smart-failover/index.ts b/src/hooks/smart-failover/index.ts index b3e1da7ac9..a1a0327be8 100644 --- a/src/hooks/smart-failover/index.ts +++ b/src/hooks/smart-failover/index.ts @@ -6,10 +6,19 @@ import { resolveModelChain } from "../../features/failover/resolver" import { log } from "../../shared" import type { ModelCacheState } from "../../plugin-state" -// Store both model key and agent name for context during events -const sessionContext = new Map() -const toastedSessions = new Set() -const pendingFailovers = new Set() +const TOAST_DELAY_MS = 1500 +const FAILOVER_TOAST_DURATION_MS = 5000 +const SWAP_TOAST_DURATION_MS = 10000 +const PROVIDER_LOCKED_TOAST_DURATION_MS = 6000 +const SESSION_PROMPT_INITIAL_DELAY_MS = 500 +const SESSION_PROMPT_BUSY_RETRY_DELAY_MS = 300 +const SESSION_PROMPT_MAX_RETRIES = 5 +const SESSION_PROMPT_BUSY_RETRY_BACKOFF_FACTOR = 1 +const RETRY_LOOP_COOLDOWN_MS = 300000 +const DEFAULT_COOLDOWN_MS = 300000 +const MAX_BACKOFF_EXPONENT = 14 +const MAX_COOLDOWN_MS = 21600000 +const CONTEXT_WINDOW_MIN_RATIO = 0.5 export function createSmartFailoverHook( ctx: PluginInput, @@ -17,11 +26,16 @@ export function createSmartFailoverHook( modelCacheState: ModelCacheState ) { const statusManager = ProviderStatusManager.getInstance() + const sessionContext = new Map() + const toastedSessions = new Set() + const pendingFailovers = new Set() const findFallback = (agentName: string, currentModelKey: string, sessionID: string) => { const getModelConfig = (agent: string) => { - // @ts-ignore - return config.agents?.[agent]?.model + const agents = config.agents + if (!agents) return undefined + const agentConfig = agents[agent as keyof typeof agents] + return agentConfig?.model } let modelConfig = getModelConfig(agentName) ?? 
config.model @@ -39,7 +53,11 @@ export function createSmartFailoverHook( const primaryLimit = modelCacheState.modelContextLimitsCache.get(currentModelKey) const fallbackLimit = modelCacheState.modelContextLimitsCache.get(m) - if (primaryLimit && fallbackLimit && fallbackLimit < primaryLimit * 0.5) { + if ( + primaryLimit && + fallbackLimit && + fallbackLimit < primaryLimit * CONTEXT_WINDOW_MIN_RATIO + ) { log(`[SmartFailover] Skipping fallback ${m} due to small context window (${fallbackLimit} < ${primaryLimit})`, { sessionID }) return false } @@ -47,7 +65,12 @@ export function createSmartFailoverHook( }) } - const performFailover = async (sessionID: string, currentModelKey: string, agent: string, reason: string) => { + const performFailover = async ( + sessionID: string, + currentModelKey: string, + agent: string, + reason: string + ) => { if (pendingFailovers.has(sessionID)) return false pendingFailovers.add(sessionID) @@ -66,10 +89,10 @@ export function createSmartFailoverHook( title: "Failover Active", message: `⚠️ ${currentModelKey} unavailable. 
Switched to ${fallback}.`, variant: "warning", - duration: 5000 + duration: FAILOVER_TOAST_DURATION_MS } }).catch(() => {}) - }, 1500) + }, TOAST_DELAY_MS) toastedSessions.add(sessionID) } @@ -78,7 +101,6 @@ export function createSmartFailoverHook( await ctx.client.session.abort({ path: { id: sessionID } }).catch(() => {}) let retryAttempt = 0 - const maxRetries = 5 const checkAndPrompt = async () => { try { await ctx.client.session.prompt({ @@ -91,9 +113,12 @@ export function createSmartFailoverHook( }) pendingFailovers.delete(sessionID) } catch (e: any) { - if (e.message?.includes("busy") && retryAttempt < maxRetries) { + if (e.message?.includes("busy") && retryAttempt < SESSION_PROMPT_MAX_RETRIES) { retryAttempt++ - setTimeout(checkAndPrompt, 300) + const backoffDelay = + SESSION_PROMPT_BUSY_RETRY_DELAY_MS + + retryAttempt * SESSION_PROMPT_BUSY_RETRY_BACKOFF_FACTOR * 100 + setTimeout(checkAndPrompt, backoffDelay) } else { log("[SmartFailover] Retry prompt failed", e) pendingFailovers.delete(sessionID) @@ -101,15 +126,16 @@ export function createSmartFailoverHook( } } - setTimeout(checkAndPrompt, 500) + setTimeout(checkAndPrompt, SESSION_PROMPT_INITIAL_DELAY_MS) return true } } pendingFailovers.delete(sessionID) return false } catch (e) { + log("[SmartFailover] Failover error", e) pendingFailovers.delete(sessionID) - throw e + return false } } @@ -155,10 +181,10 @@ export function createSmartFailoverHook( title: "Failover Active", message: `⚠️ ${currentModelKey} unavailable. 
Switched to ${fallback}.`, variant: "warning", - duration: 10000 + duration: SWAP_TOAST_DURATION_MS } }).catch(() => {}) - }, 1500) + }, TOAST_DELAY_MS) toastedSessions.add(input.sessionID) } } @@ -188,14 +214,15 @@ export function createSmartFailoverHook( } if (input.event.type === "session.status") { - const props = input.event.properties as { status: { type: string; message?: string }, sessionID: string } - if (props.status.type === "retry") { + const props = input.event.properties as { status?: { type?: string; message?: string }, sessionID?: string } + if (props.status?.type === "retry") { const sessionID = props.sessionID + if (!sessionID) return const sessionCtx = sessionContext.get(sessionID) if (sessionCtx && statusManager.getStatus(sessionCtx.modelKey) !== "COOLING") { const reason = props.status.message || "Retry loop detected" - statusManager.markCooling(sessionCtx.modelKey, 300000, reason) + statusManager.markCooling(sessionCtx.modelKey, RETRY_LOOP_COOLDOWN_MS, reason) await performFailover(sessionID, sessionCtx.modelKey, sessionCtx.agent, reason) } } @@ -217,8 +244,11 @@ export function createSmartFailoverHook( const currentState = statusManager.getState(sessionCtx.modelKey) const retryCount = currentState?.retryCount ?? 0 - const backoffMultiplier = Math.pow(2, Math.min(retryCount, 14)) - const duration = (result.cooldownMs ?? 300000) * backoffMultiplier + const backoffMultiplier = Math.pow(2, Math.min(retryCount, MAX_BACKOFF_EXPONENT)) + const duration = Math.min( + (result.cooldownMs ?? DEFAULT_COOLDOWN_MS) * backoffMultiplier, + MAX_COOLDOWN_MS + ) statusManager.markCooling(sessionCtx.modelKey, duration, result.reason) @@ -233,7 +263,7 @@ export function createSmartFailoverHook( title: "Provider Locked", message: `🛑 ${sessionCtx.modelKey} locked (Balance/Quota). 
Update config to reset.`, variant: "error", - duration: 6000 + duration: PROVIDER_LOCKED_TOAST_DURATION_MS } }).catch(() => {}) } diff --git a/src/plugin-handlers/config-handler.ts b/src/plugin-handlers/config-handler.ts index 17d3804c5c..fa77569a3a 100644 --- a/src/plugin-handlers/config-handler.ts +++ b/src/plugin-handlers/config-handler.ts @@ -101,8 +101,10 @@ export function createConfigHandler(deps: ConfigHandlerDeps) { log(`Plugin load errors`, { errors: pluginComponents.errors }); } - const configModel = config.model as string | string[] | undefined - const systemDefaultModel = Array.isArray(configModel) ? configModel[0] : configModel + const configModel = config.model as string | string[] | undefined; + const systemDefaultModel = Array.isArray(configModel) + ? configModel[0] + : configModel; if (!systemDefaultModel?.trim()) { const paths = getOpenCodeConfigPaths({ binary: "opencode", version: null }) From 5d46bb2faa6c044e07cd20687c5920815d8c2b48 Mon Sep 17 00:00:00 2001 From: pipi-1997 <46177323+pipi-1997@users.noreply.github.com> Date: Thu, 22 Jan 2026 08:32:07 +0800 Subject: [PATCH 08/10] docs(smart-failover): clarify enablement and cooldown --- docs/features/smart-failover.md | 7 +++++-- docs/features/smart-failover.zh-CN.md | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/features/smart-failover.md b/docs/features/smart-failover.md index be00fb8add..b10a826610 100644 --- a/docs/features/smart-failover.md +++ b/docs/features/smart-failover.md @@ -15,7 +15,7 @@ The Smart Failover system provides an automated detection and recovery mechanism - **Memory Safety**: Automatic cleanup upon session deletion. ## 3. Configuration -Smart Failover is enabled by defining a fallback chain in `model`. +Smart Failover is enabled by default (unless you disable the `smart-failover` hook). Configure a fallback chain in `model` to use it. 
### 3.1 Model Fallback Chain You can define the fallback chain using either: @@ -25,6 +25,9 @@ You can define the fallback chain using either: Both forms are equivalent: the first entry is the primary model, and the rest are fallbacks. +### 3.2 Hook Toggle +If you need to disable it, add `smart-failover` to `disabled_hooks` in your `oh-my-opencode.json`. + ### Example ```jsonc { @@ -43,7 +46,7 @@ Both forms are equivalent: the first entry is the primary model, and the rest ar ## 4. Default Behavior - **Triggers**: Retry-loop detection (`session.status: retry`) and certain session errors (`session.error`) mark the current `provider/model` as unavailable and switch to the next available fallback. -- **Cooling + Backoff**: A cooling period is applied with exponential backoff based on repeated failures. +- **Cooling + Backoff**: Retry-loop cooling uses a fixed 5-minute cooldown. Session-error cooling uses exponential backoff based on repeated failures. - **Locking**: Balance/quota exhaustion signals lock a specific `provider/model` pair (model key) until reset. - **Fallback Selection**: Only HEALTHY/PROBATION models are eligible; fallbacks with too-small context windows are skipped. diff --git a/docs/features/smart-failover.zh-CN.md b/docs/features/smart-failover.zh-CN.md index b7c52e9970..ac4023c6fc 100644 --- a/docs/features/smart-failover.zh-CN.md +++ b/docs/features/smart-failover.zh-CN.md @@ -15,7 +15,7 @@ Smart Failover 系统为 `oh-my-opencode` 引入了一套自动化的故障检 - **内存管理**: 随会话销毁自动清理缓存,无内存泄露风险。 ## 3. 使用方法 -Smart Failover 通过 `model` 定义“主模型 + 备用模型链”来启用。 +Smart Failover 默认启用(除非你显式禁用 `smart-failover` hook)。只需要通过 `model` 定义“主模型 + 备用模型链”即可使用。 ### 3.1 模型备用链写法 支持两种等价写法: @@ -25,6 +25,9 @@ Smart Failover 通过 `model` 定义“主模型 + 备用模型链”来启用 两者语义一致:第一个是主模型,后续是备用模型。 +### 3.2 Hook 开关 +如需禁用,可在 `oh-my-opencode.json` 中把 `smart-failover` 加入 `disabled_hooks`。 + ### 示例配置 ```jsonc { @@ -43,7 +46,7 @@ Smart Failover 通过 `model` 定义“主模型 + 备用模型链”来启用 ## 4. 
默认行为说明 - **触发条件**:检测到 retry loop(`session.status: retry`)或部分会话错误(`session.error`)后,会把当前 `provider/model` 标记为不可用,并切换到下一个可用的备用模型。 -- **冷却与退避**:进入 COOLING 并按失败次数做指数退避。 +- **冷却与退避**:retry loop 的冷却时间固定为 5 分钟;会话错误触发的冷却会按失败次数做指数退避。 - **锁定**:余额不足/配额耗尽等信号会锁定特定的 `provider/model` 组合(modelKey),直到重置。 - **fallback 选择**:只会选择 HEALTHY/PROBATION 的模型;上下文窗口过小的 fallback 会被跳过。 From 2fc548ee1166457c7360b573e57ac96f8b0770d1 Mon Sep 17 00:00:00 2001 From: pipi-1997 <46177323+pipi-1997@users.noreply.github.com> Date: Fri, 23 Jan 2026 21:53:40 +0800 Subject: [PATCH 09/10] fix(sisyphus-junior): handle empty model override arrays --- src/agents/sisyphus-junior.test.ts | 35 ++++++++++++++++++++++++++++++ src/agents/sisyphus-junior.ts | 11 ++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/src/agents/sisyphus-junior.test.ts b/src/agents/sisyphus-junior.test.ts index 43d75610ac..2e638e3034 100644 --- a/src/agents/sisyphus-junior.test.ts +++ b/src/agents/sisyphus-junior.test.ts @@ -14,6 +14,18 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { expect(result.model).toBe("openai/gpt-5.2") }) + test("applies model override from array (uses first non-empty)", () => { + // #given + const override = { model: ["", "openai/gpt-5.2", "google/gemini-3-pro"] } + + // #when + const result = createSisyphusJuniorAgentWithOverrides(override) + + // #then + expect(result.model).toBe("openai/gpt-5.2") + }) + + test("applies temperature override", () => { // #given const override = { temperature: 0.5 } @@ -83,6 +95,29 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { expect(result.model).toBe(SISYPHUS_JUNIOR_DEFAULTS.model) }) + test("uses systemDefaultModel when override.model is empty array", () => { + // #given + const override = { model: [] as string[] } + + // #when + const result = createSisyphusJuniorAgentWithOverrides(override, "openai/gpt-5.2") + + // #then + expect(result.model).toBe("openai/gpt-5.2") + }) + + test("falls back to defaults when override.model 
is empty array and systemDefaultModel missing", () => { + // #given + const override = { model: [] as string[] } + + // #when + const result = createSisyphusJuniorAgentWithOverrides(override) + + // #then + expect(result.model).toBe(SISYPHUS_JUNIOR_DEFAULTS.model) + }) + + test("uses default temperature when no override", () => { // #given const override = {} diff --git a/src/agents/sisyphus-junior.ts b/src/agents/sisyphus-junior.ts index a8e212d904..b454b7a9eb 100644 --- a/src/agents/sisyphus-junior.ts +++ b/src/agents/sisyphus-junior.ts @@ -90,8 +90,15 @@ export function createSisyphusJuniorAgentWithOverrides( override = undefined } - const model = override?.model ?? systemDefaultModel ?? SISYPHUS_JUNIOR_DEFAULTS.model - const primaryModel = Array.isArray(model) ? model[0] : model + const overrideModel = override?.model + const primaryOverrideModel = Array.isArray(overrideModel) + ? overrideModel.map((m) => m.trim()).find((m) => m.length > 0) + : typeof overrideModel === "string" + ? overrideModel.trim() || undefined + : undefined + + const trimmedSystemDefaultModel = systemDefaultModel?.trim() || undefined + const primaryModel = primaryOverrideModel ?? trimmedSystemDefaultModel ?? SISYPHUS_JUNIOR_DEFAULTS.model const temperature = override?.temperature ?? 
SISYPHUS_JUNIOR_DEFAULTS.temperature const promptAppend = override?.prompt_append From 3324ea02d633d71da6b1484f568c135c8bc3034f Mon Sep 17 00:00:00 2001 From: pipi-1997 <46177323+pipi-1997@users.noreply.github.com> Date: Sat, 24 Jan 2026 22:07:38 +0800 Subject: [PATCH 10/10] fix: resolve dev merge type issues~ --- src/agents/utils.ts | 17 ++++++++++++++--- src/hooks/smart-failover/index.test.ts | 2 +- src/hooks/smart-failover/index.ts | 11 ++++------- src/tools/delegate-task/tools.ts | 14 +++++++++++++- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/agents/utils.ts b/src/agents/utils.ts index 5980127343..e2beccbb7d 100644 --- a/src/agents/utils.ts +++ b/src/agents/utils.ts @@ -151,6 +151,17 @@ function mergeAgentConfig( return merged } +function extractPrimaryModel(model?: string | string[]): string | undefined { + if (Array.isArray(model)) { + return model.map((m) => m.trim()).find((m) => m.length > 0) + } + if (typeof model === "string") { + if (model.includes("|")) return model.split("|")[0].trim() || undefined + return model.trim() || undefined + } + return undefined +} + function mapScopeToLocation(scope: SkillScope): AvailableSkill["location"] { if (scope === "user" || scope === "opencode") return "user" if (scope === "project" || scope === "opencode-project") return "project" @@ -216,7 +227,7 @@ export async function createBuiltinAgents( const requirement = AGENT_MODEL_REQUIREMENTS[agentName] const { model, variant: resolvedVariant } = resolveModelWithFallback({ - userModel: override?.model, + userModel: extractPrimaryModel(override?.model), fallbackChain: requirement?.fallbackChain, availableModels, systemDefaultModel, @@ -256,7 +267,7 @@ export async function createBuiltinAgents( const sisyphusRequirement = AGENT_MODEL_REQUIREMENTS["sisyphus"] const { model: sisyphusModel, variant: sisyphusResolvedVariant } = resolveModelWithFallback({ - userModel: sisyphusOverride?.model, + userModel: extractPrimaryModel(sisyphusOverride?.model), 
fallbackChain: sisyphusRequirement?.fallbackChain, availableModels, systemDefaultModel, @@ -293,7 +304,7 @@ export async function createBuiltinAgents( const atlasRequirement = AGENT_MODEL_REQUIREMENTS["atlas"] const { model: atlasModel, variant: atlasResolvedVariant } = resolveModelWithFallback({ - userModel: orchestratorOverride?.model, + userModel: extractPrimaryModel(orchestratorOverride?.model), fallbackChain: atlasRequirement?.fallbackChain, availableModels, systemDefaultModel, diff --git a/src/hooks/smart-failover/index.test.ts b/src/hooks/smart-failover/index.test.ts index 5b689c68fc..84ae255357 100644 --- a/src/hooks/smart-failover/index.test.ts +++ b/src/hooks/smart-failover/index.test.ts @@ -30,7 +30,7 @@ describe("smart-failover hook", () => { config = { model: "primary/model", agents: { - Sisyphus: { + sisyphus: { model: "primary/model | fallback/model" } } diff --git a/src/hooks/smart-failover/index.ts b/src/hooks/smart-failover/index.ts index a1a0327be8..b469240d97 100644 --- a/src/hooks/smart-failover/index.ts +++ b/src/hooks/smart-failover/index.ts @@ -32,17 +32,14 @@ export function createSmartFailoverHook( const findFallback = (agentName: string, currentModelKey: string, sessionID: string) => { const getModelConfig = (agent: string) => { - const agents = config.agents + const agents = config.agents as unknown as + | Record + | undefined if (!agents) return undefined - const agentConfig = agents[agent as keyof typeof agents] - return agentConfig?.model + return agents[agent]?.model ?? agents[agent.toLowerCase()]?.model } let modelConfig = getModelConfig(agentName) ?? 
config.model - - if (!modelConfig && agentName !== "Sisyphus") { - modelConfig = getModelConfig("Sisyphus") - } const chain = resolveModelChain(modelConfig as string | string[]) if (!chain) return undefined diff --git a/src/tools/delegate-task/tools.ts b/src/tools/delegate-task/tools.ts index 11f72af4d7..f478fad18a 100644 --- a/src/tools/delegate-task/tools.ts +++ b/src/tools/delegate-task/tools.ts @@ -28,6 +28,17 @@ function parseModelString(model: string): { providerID: string; modelID: string return undefined } +function extractPrimaryModel(model?: string | string[]): string | undefined { + if (Array.isArray(model)) { + return model.map((m) => m.trim()).find((m) => m.length > 0) + } + if (typeof model === "string") { + if (model.includes("|")) return model.split("|")[0].trim() || undefined + return model.trim() || undefined + } + return undefined +} + function getMessageDir(sessionID: string): string | null { if (!existsSync(MESSAGE_STORAGE)) return null @@ -516,8 +527,9 @@ To resume this session: resume="${args.resume}"` actualModel = resolved.model modelInfo = { model: actualModel, type: "system-default", source: "system-default" } } else { + const userModel = extractPrimaryModel(userCategories?.[args.category]?.model) ?? sisyphusJuniorModel const { model: resolvedModel, source, variant: resolvedVariant } = resolveModelWithFallback({ - userModel: userCategories?.[args.category]?.model ?? sisyphusJuniorModel, + userModel, fallbackChain: requirement.fallbackChain, availableModels, systemDefaultModel,