Skip to content

Commit eb01dc5

Browse files
committed
save
1 parent e831eac commit eb01dc5

File tree

4 files changed

+191
-16
lines changed

4 files changed

+191
-16
lines changed

vite-app/src/App.tsx

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,35 @@
1-
import { useEffect, useState, useRef } from "react";
1+
import { useEffect, useRef } from "react";
2+
import { makeAutoObservable } from "mobx";
3+
import { observer } from "mobx-react";
4+
import Dashboard from "./components/Dashboard";
5+
import type { EvaluationRow } from "./types/eval-protocol";
26
/**
 * A file-change notification: the kind of change, the affected path and a
 * timestamp string.
 * NOTE(review): presumably the message payload received over the file-watcher
 * WebSocket — the handler that consumes it is not visible here; confirm.
 */
interface FileUpdate {
37
type: "file_changed" | "file_created" | "file_deleted";
48
path: string;
59
timestamp: string;
610
}
711

8-
function App() {
9-
const [isConnected, setIsConnected] = useState(false);
12+
/**
 * Application-wide MobX store.
 *
 * `makeAutoObservable` makes the fields observable and the methods actions,
 * so state should be changed through the setters below rather than by
 * assigning to the fields from outside the class.
 */
class GlobalState {
  /** Whether the file-watcher WebSocket is currently open. */
  isConnected = false;

  /** Evaluation rows currently held by the app. */
  dataset: EvaluationRow[] = [];

  constructor() {
    makeAutoObservable(this);
  }

  /**
   * Records the WebSocket connection status.
   * @param isConnected - `true` once the socket is open, `false` after it closes or errors.
   */
  setIsConnected(isConnected: boolean) {
    this.isConnected = isConnected;
  }

  /**
   * Replaces the stored dataset.
   * @param dataset - The new list of evaluation rows.
   */
  setDataset(dataset: EvaluationRow[]) {
    this.dataset = dataset;
  }
}
23+
24+
// Single shared store instance, read and updated by the App component below.
const state = new GlobalState();
25+
26+
// Base reconnect delay in milliseconds; multiplied by 2^attempts when a reconnect is scheduled.
const BASE_DELAY = 1000; // 1 second
27+
// Upper bound on reconnect attempts; no further reconnect is scheduled once it is reached.
const MAX_RECONNECT_ATTEMPTS = 5;
28+
29+
const App = observer(() => {
1030
const wsRef = useRef<WebSocket | null>(null);
1131
const reconnectTimeoutRef = useRef<number | null>(null);
1232
const reconnectAttemptsRef = useRef(0);
13-
const maxReconnectAttempts = 5;
14-
const baseDelay = 1000; // 1 second
1533

1634
const connectWebSocket = () => {
1735
if (wsRef.current?.readyState === WebSocket.OPEN) {
@@ -23,7 +41,7 @@ function App() {
2341

2442
ws.onopen = () => {
2543
console.log("Connected to file watcher");
26-
setIsConnected(true);
44+
state.isConnected = true;
2745
reconnectAttemptsRef.current = 0; // Reset reconnect attempts on successful connection
2846
};
2947

@@ -38,20 +56,20 @@ function App() {
3856

3957
ws.onclose = (event) => {
4058
console.log("Disconnected from file watcher", event.code, event.reason);
41-
setIsConnected(false);
59+
state.isConnected = false;
4260

4361
// Attempt to reconnect if not a normal closure
4462
if (
4563
event.code !== 1000 &&
46-
reconnectAttemptsRef.current < maxReconnectAttempts
64+
reconnectAttemptsRef.current < MAX_RECONNECT_ATTEMPTS
4765
) {
4866
scheduleReconnect();
4967
}
5068
};
5169

5270
ws.onerror = (error) => {
5371
console.error("WebSocket error:", error);
54-
setIsConnected(false);
72+
state.isConnected = false;
5573
};
5674
};
5775

@@ -60,7 +78,7 @@ function App() {
6078
clearTimeout(reconnectTimeoutRef.current);
6179
}
6280

63-
const delay = baseDelay * Math.pow(2, reconnectAttemptsRef.current); // Exponential backoff
81+
const delay = BASE_DELAY * Math.pow(2, reconnectAttemptsRef.current); // Exponential backoff
6482
console.log(
6583
`Scheduling reconnect attempt ${
6684
reconnectAttemptsRef.current + 1
@@ -70,7 +88,7 @@ function App() {
7088
reconnectTimeoutRef.current = setTimeout(() => {
7189
reconnectAttemptsRef.current++;
7290
console.log(
73-
`Attempting to reconnect (attempt ${reconnectAttemptsRef.current}/${maxReconnectAttempts})`
91+
`Attempting to reconnect (attempt ${reconnectAttemptsRef.current}/${MAX_RECONNECT_ATTEMPTS})`
7492
);
7593
connectWebSocket();
7694
}, delay);
@@ -95,15 +113,17 @@ function App() {
95113
<div>
96114
<div>
97115
<h1>Eval Protocol Logs</h1>
98-
<div>{isConnected ? "Connected" : "Disconnected"}</div>
116+
<div>{state.isConnected ? "Connected" : "Disconnected"}</div>
117+
</div>
118+
<div>
119+
<Dashboard />
99120
</div>
100-
<div>TODO</div>
101121
</div>
102122
</nav>
103123

104124
<main>TODO</main>
105125
</div>
106126
);
107-
}
127+
});
108128

109129
export default App;
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import { observer } from "mobx-react";
2+
3+
/** Placeholder dashboard view; wrapped in `observer` from mobx-react. */
const Dashboard = observer(() => <div>Dashboard</div>);
6+
7+
export default Dashboard;

vite-app/src/types/eval-protocol-utils.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { EvaluationRow, Message } from './eval-protocol';
1+
import type { EvaluationRow, Message } from './eval-protocol';
22

33
/**
44
* Utility functions for working with EvaluationRow data
@@ -173,4 +173,4 @@ export const evaluateResultUtils = {
173173
getStepCount: (result: any): number => {
174174
return result.step_outputs?.length || 0;
175175
}
176-
};
176+
};
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
import { z } from 'zod';
2+
3+
4+
// Base schemas
5+
/** Text content part: a required `text` string and a `type` literal that defaults to 'text'. */
export const ChatCompletionContentPartTextParamSchema = z.object({
6+
text: z.string().describe('The text content.'),
7+
type: z.literal('text').default('text').describe('The type of the content part.')
8+
});
9+
10+
/** Function call: the function `name` and its `arguments`, both strings. */
export const FunctionCallSchema = z.object({
11+
name: z.string(),
12+
arguments: z.string()
13+
});
14+
15+
/** Tool call attached to a message: an `id`, the literal type 'function', and the function call itself. */
export const ChatCompletionMessageToolCallSchema = z.object({
16+
id: z.string(),
17+
type: z.literal('function'),
18+
function: FunctionCallSchema
19+
});
20+
21+
/**
 * A single chat message.
 *
 * Only `role` is required. `content` may be a plain string or an array of
 * text parts and falls back to '' when omitted; all other fields are optional.
 */
export const MessageSchema = z.object({
22+
role: z.string().describe('assistant, user, system, tool'),
23+
content: z.union([z.string(), z.array(ChatCompletionContentPartTextParamSchema)]).optional().default('').describe('The content of the message.'),
24+
name: z.string().optional(),
25+
tool_call_id: z.string().optional(),
26+
tool_calls: z.array(ChatCompletionMessageToolCallSchema).optional(),
27+
function_call: FunctionCallSchema.optional(),
28+
control_plane_step: z.record(z.string(), z.any()).optional()
29+
});
30+
31+
/**
 * Result of one component metric: a `score` constrained to the range 0.0–1.0,
 * a `reason` string, and an `is_score_valid` flag that defaults to true.
 */
export const MetricResultSchema = z.object({
32+
is_score_valid: z.boolean().default(true),
33+
score: z.number().min(0.0).max(1.0),
34+
reason: z.string()
35+
});
36+
37+
/**
 * Output for a single step.
 *
 * `step_index` (number or string) and `base_reward` are required;
 * `terminated` defaults to false and `metrics` to an empty object.
 */
export const StepOutputSchema = z.object({
38+
step_index: z.union([z.number(), z.string()]).describe('User-defined index for the step (e.g., assistant message index, turn number). This is used by the system to map this output to the internal StepData.'),
39+
base_reward: z.number().describe('Base reward calculated by the user\'s reward function for this step.'),
40+
terminated: z.boolean().default(false).describe('Whether the environment signaled termination at this step.'),
41+
control_plane_info: z.record(z.string(), z.any()).optional().describe('Structured info from the environment\'s control plane.'),
42+
metrics: z.record(z.string(), z.any()).default({}).describe('Optional dictionary of custom metrics for this step.'),
43+
reason: z.string().optional().describe('Optional explanation for the step\'s base reward or metrics.')
44+
});
45+
46+
/**
 * Overall evaluation result.
 *
 * `score` is required and, unlike MetricResultSchema, is not range-checked
 * here. `is_score_valid` defaults to true and `metrics` (keyed by string,
 * each value a MetricResultSchema) defaults to an empty object.
 */
export const EvaluateResultSchema = z.object({
47+
score: z.number().describe('The overall evaluation score, typically between 0.0 and 1.0.'),
48+
is_score_valid: z.boolean().default(true).describe('Whether the overall score is valid.'),
49+
reason: z.string().optional().describe('Optional explanation for the overall score.'),
50+
metrics: z.record(z.string(), MetricResultSchema).default({}).describe('Dictionary of component metrics for detailed breakdown.'),
51+
step_outputs: z.array(StepOutputSchema).optional().describe('For RL, a list of outputs for each conceptual step, providing base rewards.'),
52+
error: z.string().optional().describe('Optional error message if the evaluation itself encountered an issue.'),
53+
trajectory_info: z.record(z.string(), z.any()).optional().describe('Additional trajectory-level information (duration, steps, termination_reason, etc.).'),
54+
final_control_plane_info: z.record(z.string(), z.any()).optional().describe('The final control plane state that led to termination.')
55+
});
56+
57+
/** Completion parameters: a required `model` identifier plus optional numeric generation limits. */
export const CompletionParamsSchema = z.object({
58+
model: z.string().describe('Model identifier (e.g., \'gpt-4.1\', \'fireworks/llama\')'),
59+
temperature: z.number().optional().describe('Temperature setting for model generation'),
60+
max_tokens: z.number().optional().describe('Maximum tokens to generate'),
61+
max_tool_calls: z.number().optional().describe('Maximum tool calls per turn')
62+
});
63+
64+
/**
 * Metadata describing the input of a row. Only `row_id` is required.
 * The trailing `.loose()` lets keys not declared here pass through validation.
 */
export const InputMetadataSchema = z.object({
65+
row_id: z.string().describe('Unique string to ID the row'),
66+
completion_params: CompletionParamsSchema.optional().describe('Completion endpoint parameters used'),
67+
dataset_info: z.record(z.string(), z.any()).optional().describe('Dataset row details: seed, system_prompt, environment_context, etc'),
68+
session_data: z.record(z.string(), z.any()).optional().describe('Session metadata like timestamp (input only, no duration/usage)')
69+
}).loose();
70+
71+
/** Token usage counters; all three numeric fields are required. */
export const CompletionUsageSchema = z.object({
72+
prompt_tokens: z.number(),
73+
completion_tokens: z.number(),
74+
total_tokens: z.number()
75+
});
76+
77+
/**
 * A single evaluation row: the conversation messages and input metadata, plus
 * optional tools, ground truth, evaluation result and token usage.
 *
 * `created_at` uses `z.coerce.date()` so that timestamp strings (the form a
 * date takes after JSON serialization) are accepted as well as `Date`
 * instances. When the field is absent it defaults to the time of parsing.
 * The inferred output type of `created_at` is still `Date`.
 */
export const EvaluationRowSchema = z.object({
  messages: z.array(MessageSchema).describe('List of messages in the conversation/trajectory.'),
  tools: z.array(z.record(z.string(), z.any())).optional().describe('Available tools/functions that were provided to the agent.'),
  input_metadata: InputMetadataSchema.describe('Metadata related to the input (dataset info, model config, session data, etc.).'),
  ground_truth: z.string().optional().describe('Optional ground truth reference for this evaluation.'),
  evaluation_result: EvaluateResultSchema.optional().describe('The evaluation result for this row/trajectory.'),
  usage: CompletionUsageSchema.optional().describe('Token usage statistics from LLM calls during execution.'),
  // Coerce instead of z.date(): a plain z.date() rejects ISO timestamp strings.
  created_at: z.coerce.date().default(() => new Date()).describe('The timestamp when the row was created.')
});
86+
87+
// Agent Evaluation Framework (V2) schemas
88+
/** Background resource server settings: the start command and the health-check URL, both required strings. */
export const ResourceServerConfigSchema = z.object({
89+
start_command: z.string().describe('The command to start the server. The string \'{port}\' will be replaced with a dynamically allocated free port.'),
90+
health_check_url: z.string().describe('The URL to poll to check if the server is ready. The string \'{port}\' will be replaced with the allocated port.')
91+
});
92+
93+
/** Evaluation criteria for a task; every field is optional. */
export const EvaluationCriteriaModelSchema = z.object({
94+
final_state_query: z.string().optional().describe('A query (e.g., SQL) to run on the final state of the resource.'),
95+
expected_query_result_transform: z.string().optional().describe('A Python lambda string (e.g., \'lambda x: x > 0\') to transform and evaluate the query result to a boolean.'),
96+
ground_truth_function_calls: z.array(z.array(z.string())).optional().describe('Ground truth function calls for BFCL evaluation.'),
97+
ground_truth_comparable_state: z.record(z.string(), z.any()).optional().describe('Ground truth comparable state for BFCL evaluation.')
98+
});
99+
100+
/**
 * Task definition.
 *
 * Required fields are `name`, `resource_type` and `reward_function_path`.
 * `base_resource_config` defaults to an empty object; `poc_max_turns`
 * defaults to 3, and `num_rollouts` and `num_rollouts_per_sample` to 1
 * (all three must be integers >= 1). Undeclared keys are allowed via `.loose()`.
 */
export const TaskDefinitionModelSchema = z.object({
101+
name: z.string().describe('Unique name for the task.'),
102+
description: z.string().optional().describe('A brief description of the task.'),
103+
resource_type: z.string().describe('The type of ForkableResource to use (e.g., \'SQLResource\', \'PythonStateResource\', \'FileSystemResource\', \'DockerResource\').'),
104+
base_resource_config: z.record(z.string(), z.any()).default({}).describe('Configuration dictionary passed to the base resource\'s setup() method.'),
105+
tools_module_path: z.string().optional().describe('Optional Python import path to a module containing custom tool functions for this task.'),
106+
reward_function_path: z.string().describe('Python import path to the reward function (e.g., \'my_module.my_reward_func\').'),
107+
goal_description: z.string().optional().describe('A human-readable description of the agent\'s goal for this task.'),
108+
evaluation_criteria: EvaluationCriteriaModelSchema.optional().describe('Criteria used by the Orchestrator to determine if the primary goal was achieved.'),
109+
initial_user_prompt: z.string().optional().describe('The initial prompt or message to start the agent interaction. Deprecated if \'messages\' field is used for multi-turn.'),
110+
messages: z.array(z.record(z.string(), z.any())).optional().describe('A list of messages to start the conversation, can represent multiple user turns for sequential processing.'),
111+
poc_max_turns: z.number().int().min(1).default(3).describe('For PoC Orchestrator, the maximum number of interaction turns.'),
112+
resource_server: ResourceServerConfigSchema.optional().describe('Configuration for a background server required for the task.'),
113+
num_rollouts: z.number().int().min(1).default(1).describe('Number of parallel rollouts to execute for this task definition.'),
114+
dataset_path: z.string().optional().describe('Path to dataset file (JSONL) containing experimental conditions for data-driven evaluation.'),
115+
num_rollouts_per_sample: z.number().int().min(1).default(1).describe('Number of rollouts to execute per sample from the dataset.')
116+
}).loose(); // equivalent to extra="allow" in Pydantic
117+
118+
// MCP Configuration schemas
119+
/** MCP server launched via a command: a required `command`, with `args` and `env` string arrays defaulting to empty. */
export const MCPConfigurationServerStdioSchema = z.object({
120+
command: z.string().describe('command to run the MCP server'),
121+
args: z.array(z.string()).default([]).describe('to pass to the command'),
122+
env: z.array(z.string()).default([]).describe('List of environment variables to verify exist in the environment')
123+
});
124+
125+
/** MCP server reached by URL: a single required `url` string. */
export const MCPConfigurationServerUrlSchema = z.object({
126+
url: z.string().describe('url to the MCP server')
127+
});
128+
129+
/** Multi-server MCP configuration: `mcpServers` maps string keys to either a stdio or a URL server configuration. */
export const MCPMultiClientConfigurationSchema = z.object({
130+
mcpServers: z.record(z.string(), z.union([MCPConfigurationServerStdioSchema, MCPConfigurationServerUrlSchema]))
131+
});
132+
133+
// Export TypeScript types derived from the schemas
134+
export type ChatCompletionContentPartTextParam = z.infer<typeof ChatCompletionContentPartTextParamSchema>;
135+
export type Message = z.infer<typeof MessageSchema>;
136+
export type MetricResult = z.infer<typeof MetricResultSchema>;
137+
export type StepOutput = z.infer<typeof StepOutputSchema>;
138+
export type EvaluateResult = z.infer<typeof EvaluateResultSchema>;
139+
export type CompletionParams = z.infer<typeof CompletionParamsSchema>;
140+
export type InputMetadata = z.infer<typeof InputMetadataSchema>;
141+
export type CompletionUsage = z.infer<typeof CompletionUsageSchema>;
142+
export type EvaluationRow = z.infer<typeof EvaluationRowSchema>;
143+
export type ResourceServerConfig = z.infer<typeof ResourceServerConfigSchema>;
144+
export type EvaluationCriteriaModel = z.infer<typeof EvaluationCriteriaModelSchema>;
145+
export type TaskDefinitionModel = z.infer<typeof TaskDefinitionModelSchema>;
146+
export type MCPConfigurationServerStdio = z.infer<typeof MCPConfigurationServerStdioSchema>;
147+
export type MCPConfigurationServerUrl = z.infer<typeof MCPConfigurationServerUrlSchema>;
148+
export type MCPMultiClientConfiguration = z.infer<typeof MCPMultiClientConfigurationSchema>;

0 commit comments

Comments
 (0)