Skip to content

Commit 002517f

Browse files
committed
error handling: improve diagnostics and add retry logic for OpenAI API requests
- Add RetryConfig and exponential backoff for transient API failures - Enhance error messages and logging for command execution, plan validation, and API interactions - Propagate context and step information in errors for better traceability - Ensure log file handles are closed on runtime shutdown - Update TUI to disable mouse reporting for improved text selection - Add safeguards to history compaction to prevent infinite loops
1 parent 367d258 commit 002517f

15 files changed

+367
-82
lines changed

internal/core/runtime/command_executor.go

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -181,27 +181,39 @@ func (e *CommandExecutor) Execute(ctx context.Context, step PlanStep) (PlanObser
181181
observation.Details = runErr.Error()
182182
}
183183

184-
// If the command failed, persist a detailed failure report for inspection.
185-
if runErr != nil {
186-
_ = writeFailureLog(step, stdout, stderr, runErr)
187-
}
188-
189184
duration := time.Since(start)
190-
e.metrics.RecordCommandExecution(step.ID, duration, runErr == nil)
185+
186+
// If the command failed, persist a detailed failure report for inspection.
191187
if runErr != nil {
188+
if err := writeFailureLog(step, stdout, stderr, runErr); err != nil {
189+
// Log warning but don't fail execution - failure logging is best-effort
190+
e.logger.Warn(ctx, "Failed to write failure log",
191+
Field("step_id", step.ID),
192+
Field("error", err.Error()),
193+
)
194+
}
195+
e.metrics.RecordCommandExecution(step.ID, duration, false)
192196
e.logger.Error(ctx, "Command execution failed", runErr,
193197
Field("step_id", step.ID),
194198
Field("shell", step.Command.Shell),
195199
Field("duration_ms", duration.Milliseconds()),
196200
)
197-
} else {
198-
e.logger.Debug(ctx, "Command execution completed",
199-
Field("step_id", step.ID),
200-
Field("duration_ms", duration.Milliseconds()),
201-
)
201+
// Return error with step context
202+
if exitErr == nil {
203+
return observation, fmt.Errorf("command[%s]: execution failed: %w", step.ID, runErr)
204+
}
205+
// Exit errors include exit code in the wrapped error
206+
return observation, fmt.Errorf("command[%s]: exited with code %d: %w", step.ID, *observation.ExitCode, runErr)
202207
}
203208

204-
return observation, runErr
209+
e.metrics.RecordCommandExecution(step.ID, duration, true)
210+
e.logger.Debug(ctx, "Command execution completed",
211+
Field("step_id", step.ID),
212+
Field("duration_ms", duration.Milliseconds()),
213+
)
214+
215+
// Success - no error to return
216+
return observation, nil
205217
}
206218

207219
// writeFailureLog persists a diagnostic file under .goagent/ whenever a command
@@ -266,26 +278,41 @@ func writeFailureLog(step PlanStep, fullStdout, fullStderr []byte, runErr error)
266278
_, _ = b.Write([]byte("\n"))
267279
}
268280

269-
return os.WriteFile(path, b.Bytes(), 0o644)
281+
if err := os.WriteFile(path, b.Bytes(), 0o644); err != nil {
282+
return fmt.Errorf("writeFailureLog: failed to write file %q: %w", path, err)
283+
}
284+
return nil
270285
}
271286

272287
func (e *CommandExecutor) executeInternal(ctx context.Context, step PlanStep) (PlanObservationPayload, error) {
273288
invocation, err := parseInternalInvocation(step)
274289
if err != nil {
275-
return PlanObservationPayload{}, fmt.Errorf("command: %w", err)
290+
e.logger.Error(ctx, "Failed to parse internal command invocation", err,
291+
Field("step_id", step.ID),
292+
Field("command_run", step.Command.Run),
293+
)
294+
return PlanObservationPayload{}, fmt.Errorf("command[%s]: parse internal invocation: %w", step.ID, err)
276295
}
277296

278297
handler, ok := e.internal[invocation.Name]
279298
if !ok {
280-
return PlanObservationPayload{}, fmt.Errorf("command: unknown internal command %q", invocation.Name)
299+
e.logger.Error(ctx, "Unknown internal command", nil,
300+
Field("step_id", step.ID),
301+
Field("command_name", invocation.Name),
302+
)
303+
return PlanObservationPayload{}, fmt.Errorf("command[%s]: unknown internal command %q", step.ID, invocation.Name)
281304
}
282305

283306
payload, execErr := handler(ctx, invocation)
284307
if execErr != nil {
308+
e.logger.Error(ctx, "Internal command execution failed", execErr,
309+
Field("step_id", step.ID),
310+
Field("command_name", invocation.Name),
311+
)
285312
if payload.Details == "" {
286313
payload.Details = execErr.Error()
287314
}
288-
return payload, execErr
315+
return payload, fmt.Errorf("command[%s]: internal command %q failed: %w", step.ID, invocation.Name, execErr)
289316
}
290317
if payload.ExitCode == nil {
291318
zero := 0
@@ -298,7 +325,7 @@ func parseInternalInvocation(step PlanStep) (InternalCommandRequest, error) {
298325
run := strings.TrimSpace(step.Command.Run)
299326
tokens, err := tokenizeInternalCommand(run)
300327
if err != nil {
301-
return InternalCommandRequest{}, err
328+
return InternalCommandRequest{}, fmt.Errorf("parse internal command %q: %w", run, err)
302329
}
303330
if len(tokens) == 0 {
304331
return InternalCommandRequest{}, errors.New("internal command: missing command name")

internal/core/runtime/execution.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,10 +250,20 @@ func (r *Runtime) executePendingCommands(ctx context.Context, toolCall ToolCall)
250250
PlanObservation: []StepObservation{stepResult},
251251
}}
252252
if updateErr := r.plan.UpdateStatus(step.ID, status, planObservation); updateErr != nil {
253+
updateErr = fmt.Errorf("execution: failed to update plan status for step %q: %w", step.ID, updateErr)
254+
r.options.Logger.Error(ctx, "Failed to update plan status", updateErr,
255+
Field("step_id", step.ID),
256+
Field("status", string(status)),
257+
)
253258
r.emit(RuntimeEvent{
254259
Type: EventTypeError,
255260
Message: fmt.Sprintf("Failed to update plan status for step %s: %v", step.ID, updateErr),
256261
Level: StatusLevelError,
262+
Metadata: map[string]any{
263+
"step_id": step.ID,
264+
"status": string(status),
265+
"error": updateErr.Error(),
266+
},
257267
})
258268
if finalErr == nil {
259269
finalErr = updateErr

internal/core/runtime/history.go

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package runtime
22

33
import (
4+
"context"
45
"encoding/json"
56
"fmt"
67
"os"
@@ -37,11 +38,32 @@ func (r *Runtime) planningHistorySnapshot() []ChatMessage {
3738
total, per := estimateHistoryTokenUsage(r.history)
3839
if total > limit {
3940
beforeLen := len(r.history)
40-
compactHistory(r.history, per, total, limit)
41+
// Add safeguard: limit iterations to prevent infinite loops
42+
// If summarization doesn't reduce tokens enough, we'll stop after max iterations
43+
const maxCompactionIterations = 10
44+
iterations := 0
45+
for total > limit && iterations < maxCompactionIterations {
46+
var changed bool
47+
total, per, changed = compactHistory(r.history, per, total, limit)
48+
iterations++
49+
if !changed {
50+
// No progress made - all eligible messages already summarized
51+
// or we can't make progress. Break to avoid infinite loop.
52+
break
53+
}
54+
}
4155
afterLen := len(r.history)
4256
removed := beforeLen - afterLen
4357
// Note: removed might be 0 if we just summarized without removing entries
4458
r.options.Metrics.RecordContextCompaction(removed, afterLen)
59+
60+
if iterations >= maxCompactionIterations && total > limit {
61+
r.options.Logger.Warn(context.Background(), "History compaction reached max iterations without meeting budget",
62+
Field("total_tokens", total),
63+
Field("limit", limit),
64+
Field("iterations", iterations),
65+
)
66+
}
4567
}
4668
}
4769

internal/core/runtime/loop.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,6 @@ func (r *Runtime) handlePrompt(ctx context.Context, evt InputEvent) error {
154154
Field("prompt_length", len(prompt)),
155155
)
156156

157-
r.options.Logger.Info(ctx, "Processing user prompt",
158-
Field("prompt_length", len(prompt)),
159-
)
160-
161157
r.emit(RuntimeEvent{
162158
Type: EventTypeStatus,
163159
Message: fmt.Sprintf("Processing prompt with model %s…", r.options.Model),
@@ -217,12 +213,16 @@ func (r *Runtime) requestPlan(ctx context.Context) (*PlanResponse, ToolCall, err
217213
toolCall, err = r.client.RequestPlan(ctx, history)
218214
}
219215
if err != nil {
220-
return nil, ToolCall{}, err
216+
r.options.Logger.Error(ctx, "Failed to request plan from OpenAI", err)
217+
return nil, ToolCall{}, fmt.Errorf("requestPlan: API request failed: %w", err)
221218
}
222219

223220
plan, retry, validationErr := r.validatePlanToolCall(toolCall)
224221
if validationErr != nil {
225-
return nil, ToolCall{}, validationErr
222+
r.options.Logger.Error(ctx, "Plan validation failed", validationErr,
223+
Field("tool_call_id", toolCall.ID),
224+
)
225+
return nil, ToolCall{}, fmt.Errorf("requestPlan: validation failed: %w", validationErr)
226226
}
227227
if retry {
228228
retryCount++

internal/core/runtime/loop_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ func TestPlanExecutionLoopPausesForHumanInput(t *testing.T) {
140140
"data: [DONE]\n\n"
141141
transport := &stubTransport{body: []byte(sse), statusCode: http.StatusOK}
142142

143-
client, err := NewOpenAIClient("test-key", "gpt-4o", "", "", nil, nil)
143+
client, err := NewOpenAIClient("test-key", "gpt-4o", "", "", nil, nil, nil, 120*time.Second)
144144
if err != nil {
145145
t.Fatalf("failed to create client: %v", err)
146146
}
@@ -226,7 +226,7 @@ func TestPlanExecutionLoopHandsFreeCompletes(t *testing.T) {
226226
"data: [DONE]\n\n"
227227
transport := &stubTransport{body: []byte(sse), statusCode: http.StatusOK}
228228

229-
client, err := NewOpenAIClient("test-key", "gpt-4o", "", "", nil, nil)
229+
client, err := NewOpenAIClient("test-key", "gpt-4o", "", "", nil, nil, nil, 120*time.Second)
230230
if err != nil {
231231
t.Fatalf("failed to create client: %v", err)
232232
}
@@ -323,7 +323,7 @@ func TestPlanExecutionLoopHandsFreeStopsAtPassLimit(t *testing.T) {
323323
"data: [DONE]\n\n"
324324
transport := &stubTransport{body: []byte(sse), statusCode: http.StatusOK}
325325

326-
client, err := NewOpenAIClient("test-key", "gpt-4o", "", "", nil, nil)
326+
client, err := NewOpenAIClient("test-key", "gpt-4o", "", "", nil, nil, nil, 120*time.Second)
327327
if err != nil {
328328
t.Fatalf("failed to create client: %v", err)
329329
}

internal/core/runtime/openai_client.go

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,13 @@ type OpenAIClient struct {
2424
baseURL string
2525
logger Logger
2626
metrics Metrics
27+
retryConfig *RetryConfig
2728
}
2829

2930
const defaultOpenAIBaseURL = "https://api.openai.com/v1"
3031

3132
// NewOpenAIClient configures the client with the provided API key and model identifier.
32-
func NewOpenAIClient(apiKey, model, reasoningEffort, baseURL string, logger Logger, metrics Metrics) (*OpenAIClient, error) {
33+
func NewOpenAIClient(apiKey, model, reasoningEffort, baseURL string, logger Logger, metrics Metrics, retryConfig *RetryConfig, httpTimeout time.Duration) (*OpenAIClient, error) {
3334
if apiKey == "" {
3435
return nil, errors.New("openai: API key is required")
3536
}
@@ -55,12 +56,13 @@ func NewOpenAIClient(apiKey, model, reasoningEffort, baseURL string, logger Logg
5556
model: model,
5657
reasoningEffort: reasoningEffort,
5758
httpClient: &http.Client{
58-
Timeout: 120 * time.Second,
59+
Timeout: httpTimeout,
5960
},
60-
tool: tool,
61-
baseURL: baseURL,
62-
logger: logger,
63-
metrics: metrics,
61+
tool: tool,
62+
baseURL: baseURL,
63+
logger: logger,
64+
metrics: metrics,
65+
retryConfig: retryConfig,
6466
}, nil
6567
}
6668

@@ -94,13 +96,17 @@ func (c *OpenAIClient) RequestPlanStreamingResponses(ctx context.Context, histor
9496
inputMsgs := buildMessagesFromHistory(history)
9597
payload, err := c.buildRequestBody(inputMsgs)
9698
if err != nil {
97-
return ToolCall{}, fmt.Errorf("openai(responses): encode request: %w", err)
99+
c.logger.Error(ctx, "Failed to build OpenAI request body", err,
100+
Field("model", c.model),
101+
Field("history_length", len(history)),
102+
)
103+
return ToolCall{}, fmt.Errorf("openai: build request body: %w", err)
98104
}
99105

100-
// Execute request
101-
resp, err := c.executeRequest(ctx, payload, start)
106+
// Execute request with retry logic
107+
resp, err := c.executeRequest(ctx, payload, start, c.retryConfig)
102108
if err != nil {
103-
return ToolCall{}, err
109+
return ToolCall{}, fmt.Errorf("openai: request failed after retries: %w", err)
104110
}
105111
defer func() { _ = resp.Body.Close() }()
106112

@@ -115,8 +121,9 @@ func (c *OpenAIClient) RequestPlanStreamingResponses(ctx context.Context, histor
115121
c.metrics.RecordAPICall(duration, false)
116122
c.logger.Error(ctx, "OpenAI API stream parsing failed", err,
117123
Field("duration_ms", duration.Milliseconds()),
124+
Field("model", c.model),
118125
)
119-
return ToolCall{}, err
126+
return ToolCall{}, fmt.Errorf("openai: stream parsing failed: %w", err)
120127
}
121128

122129
if toolCall.Name != "" {

internal/core/runtime/openai_client_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"net/http/httptest"
88
"net/url"
99
"testing"
10+
"time"
1011

1112
"github.com/asynkron/goagent/internal/core/schema"
1213
)
@@ -51,7 +52,7 @@ func TestRequestPlanUsesFunctionToolShape(t *testing.T) {
5152
}))
5253
defer server.Close()
5354

54-
client, err := NewOpenAIClient("test-key", "test-model", "", server.URL, nil, nil)
55+
client, err := NewOpenAIClient("test-key", "test-model", "", server.URL, nil, nil, nil, 120*time.Second)
5556
if err != nil {
5657
t.Fatalf("unexpected client error: %v", err)
5758
}

0 commit comments

Comments
 (0)