|
| 1 | +import { z } from 'zod'; |
| 2 | + |
| 3 | + |
| 4 | +// Base schemas |
| 5 | +export const ChatCompletionContentPartTextParamSchema = z.object({ |
| 6 | + text: z.string().describe('The text content.'), |
| 7 | + type: z.literal('text').default('text').describe('The type of the content part.') |
| 8 | +}); |
| 9 | + |
| 10 | +export const FunctionCallSchema = z.object({ |
| 11 | + name: z.string(), |
| 12 | + arguments: z.string() |
| 13 | +}); |
| 14 | + |
| 15 | +export const ChatCompletionMessageToolCallSchema = z.object({ |
| 16 | + id: z.string(), |
| 17 | + type: z.literal('function'), |
| 18 | + function: FunctionCallSchema |
| 19 | +}); |
| 20 | + |
| 21 | +export const MessageSchema = z.object({ |
| 22 | + role: z.string().describe('assistant, user, system, tool'), |
| 23 | + content: z.union([z.string(), z.array(ChatCompletionContentPartTextParamSchema)]).optional().default('').describe('The content of the message.'), |
| 24 | + name: z.string().optional(), |
| 25 | + tool_call_id: z.string().optional(), |
| 26 | + tool_calls: z.array(ChatCompletionMessageToolCallSchema).optional(), |
| 27 | + function_call: FunctionCallSchema.optional(), |
| 28 | + control_plane_step: z.record(z.string(), z.any()).optional() |
| 29 | +}); |
| 30 | + |
| 31 | +export const MetricResultSchema = z.object({ |
| 32 | + is_score_valid: z.boolean().default(true), |
| 33 | + score: z.number().min(0.0).max(1.0), |
| 34 | + reason: z.string() |
| 35 | +}); |
| 36 | + |
// Per-step output produced by a user reward function (used for RL-style
// credit assignment); each entry is mapped back to internal StepData
// via its step_index.
export const StepOutputSchema = z.object({
  step_index: z.union([z.number(), z.string()]).describe('User-defined index for the step (e.g., assistant message index, turn number). This is used by the system to map this output to the internal StepData.'),
  base_reward: z.number().describe('Base reward calculated by the user\'s reward function for this step.'),
  terminated: z.boolean().default(false).describe('Whether the environment signaled termination at this step.'),
  control_plane_info: z.record(z.string(), z.any()).optional().describe('Structured info from the environment\'s control plane.'),
  metrics: z.record(z.string(), z.any()).default({}).describe('Optional dictionary of custom metrics for this step.'),
  reason: z.string().optional().describe('Optional explanation for the step\'s base reward or metrics.')
});
| 45 | + |
// The overall result of evaluating one row/trajectory: a top-level score,
// optional per-metric breakdown, and (for RL) optional per-step outputs.
export const EvaluateResultSchema = z.object({
  score: z.number().describe('The overall evaluation score, typically between 0.0 and 1.0.'),
  is_score_valid: z.boolean().default(true).describe('Whether the overall score is valid.'),
  reason: z.string().optional().describe('Optional explanation for the overall score.'),
  metrics: z.record(z.string(), MetricResultSchema).default({}).describe('Dictionary of component metrics for detailed breakdown.'),
  step_outputs: z.array(StepOutputSchema).optional().describe('For RL, a list of outputs for each conceptual step, providing base rewards.'),
  error: z.string().optional().describe('Optional error message if the evaluation itself encountered an issue.'),
  trajectory_info: z.record(z.string(), z.any()).optional().describe('Additional trajectory-level information (duration, steps, termination_reason, etc.).'),
  final_control_plane_info: z.record(z.string(), z.any()).optional().describe('The final control plane state that led to termination.')
});
| 56 | + |
| 57 | +export const CompletionParamsSchema = z.object({ |
| 58 | + model: z.string().describe('Model identifier (e.g., \'gpt-4.1\', \'fireworks/llama\')'), |
| 59 | + temperature: z.number().optional().describe('Temperature setting for model generation'), |
| 60 | + max_tokens: z.number().optional().describe('Maximum tokens to generate'), |
| 61 | + max_tool_calls: z.number().optional().describe('Maximum tool calls per turn') |
| 62 | +}); |
| 63 | + |
// Metadata attached to an evaluation input row.
// .loose(): unknown extra keys are allowed and passed through
// (cf. the Pydantic extra="allow" note on TaskDefinitionModelSchema).
export const InputMetadataSchema = z.object({
  row_id: z.string().describe('Unique string to ID the row'),
  completion_params: CompletionParamsSchema.optional().describe('Completion endpoint parameters used'),
  dataset_info: z.record(z.string(), z.any()).optional().describe('Dataset row details: seed, system_prompt, environment_context, etc'),
  session_data: z.record(z.string(), z.any()).optional().describe('Session metadata like timestamp (input only, no duration/usage)')
}).loose();
| 70 | + |
| 71 | +export const CompletionUsageSchema = z.object({ |
| 72 | + prompt_tokens: z.number(), |
| 73 | + completion_tokens: z.number(), |
| 74 | + total_tokens: z.number() |
| 75 | +}); |
| 76 | + |
// One complete evaluation record: the conversation, its input metadata,
// and (once evaluated) the result plus usage statistics.
export const EvaluationRowSchema = z.object({
  messages: z.array(MessageSchema).describe('List of messages in the conversation/trajectory.'),
  tools: z.array(z.record(z.string(), z.any())).optional().describe('Available tools/functions that were provided to the agent.'),
  input_metadata: InputMetadataSchema.describe('Metadata related to the input (dataset info, model config, session data, etc.).'),
  ground_truth: z.string().optional().describe('Optional ground truth reference for this evaluation.'),
  evaluation_result: EvaluateResultSchema.optional().describe('The evaluation result for this row/trajectory.'),
  usage: CompletionUsageSchema.optional().describe('Token usage statistics from LLM calls during execution.'),
  // Default is a factory so each parsed row gets its own timestamp.
  created_at: z.date().default(() => new Date()).describe('The timestamp when the row was created.')
});
| 86 | + |
| 87 | +// Agent Evaluation Framework (V2) schemas |
// Background server required by a task; '{port}' placeholders in both
// fields are substituted with a dynamically allocated free port.
export const ResourceServerConfigSchema = z.object({
  start_command: z.string().describe('The command to start the server. The string \'{port}\' will be replaced with a dynamically allocated free port.'),
  health_check_url: z.string().describe('The URL to poll to check if the server is ready. The string \'{port}\' will be replaced with the allocated port.')
});
| 92 | + |
// Criteria the Orchestrator uses to decide whether a task's primary goal
// was achieved; all fields are optional alternatives.
export const EvaluationCriteriaModelSchema = z.object({
  final_state_query: z.string().optional().describe('A query (e.g., SQL) to run on the final state of the resource.'),
  expected_query_result_transform: z.string().optional().describe('A Python lambda string (e.g., \'lambda x: x > 0\') to transform and evaluate the query result to a boolean.'),
  ground_truth_function_calls: z.array(z.array(z.string())).optional().describe('Ground truth function calls for BFCL evaluation.'),
  ground_truth_comparable_state: z.record(z.string(), z.any()).optional().describe('Ground truth comparable state for BFCL evaluation.')
});
| 99 | + |
// Full definition of an agent-evaluation (V2) task: which resource to run,
// how to reach the reward function, and how many rollouts to execute.
export const TaskDefinitionModelSchema = z.object({
  name: z.string().describe('Unique name for the task.'),
  description: z.string().optional().describe('A brief description of the task.'),
  resource_type: z.string().describe('The type of ForkableResource to use (e.g., \'SQLResource\', \'PythonStateResource\', \'FileSystemResource\', \'DockerResource\').'),
  base_resource_config: z.record(z.string(), z.any()).default({}).describe('Configuration dictionary passed to the base resource\'s setup() method.'),
  tools_module_path: z.string().optional().describe('Optional Python import path to a module containing custom tool functions for this task.'),
  reward_function_path: z.string().describe('Python import path to the reward function (e.g., \'my_module.my_reward_func\').'),
  goal_description: z.string().optional().describe('A human-readable description of the agent\'s goal for this task.'),
  evaluation_criteria: EvaluationCriteriaModelSchema.optional().describe('Criteria used by the Orchestrator to determine if the primary goal was achieved.'),
  initial_user_prompt: z.string().optional().describe('The initial prompt or message to start the agent interaction. Deprecated if \'messages\' field is used for multi-turn.'),
  messages: z.array(z.record(z.string(), z.any())).optional().describe('A list of messages to start the conversation, can represent multiple user turns for sequential processing.'),
  poc_max_turns: z.number().int().min(1).default(3).describe('For PoC Orchestrator, the maximum number of interaction turns.'),
  resource_server: ResourceServerConfigSchema.optional().describe('Configuration for a background server required for the task.'),
  num_rollouts: z.number().int().min(1).default(1).describe('Number of parallel rollouts to execute for this task definition.'),
  dataset_path: z.string().optional().describe('Path to dataset file (JSONL) containing experimental conditions for data-driven evaluation.'),
  num_rollouts_per_sample: z.number().int().min(1).default(1).describe('Number of rollouts to execute per sample from the dataset.')
}).loose(); // equivalent to extra="allow" in Pydantic
| 117 | + |
| 118 | +// MCP Configuration schemas |
| 119 | +export const MCPConfigurationServerStdioSchema = z.object({ |
| 120 | + command: z.string().describe('command to run the MCP server'), |
| 121 | + args: z.array(z.string()).default([]).describe('to pass to the command'), |
| 122 | + env: z.array(z.string()).default([]).describe('List of environment variables to verify exist in the environment') |
| 123 | +}); |
| 124 | + |
| 125 | +export const MCPConfigurationServerUrlSchema = z.object({ |
| 126 | + url: z.string().describe('url to the MCP server') |
| 127 | +}); |
| 128 | + |
| 129 | +export const MCPMultiClientConfigurationSchema = z.object({ |
| 130 | + mcpServers: z.record(z.string(), z.union([MCPConfigurationServerStdioSchema, MCPConfigurationServerUrlSchema])) |
| 131 | +}); |
| 132 | + |
| 133 | +// Export TypeScript types derived from the schemas |
| 134 | +export type ChatCompletionContentPartTextParam = z.infer<typeof ChatCompletionContentPartTextParamSchema>; |
| 135 | +export type Message = z.infer<typeof MessageSchema>; |
| 136 | +export type MetricResult = z.infer<typeof MetricResultSchema>; |
| 137 | +export type StepOutput = z.infer<typeof StepOutputSchema>; |
| 138 | +export type EvaluateResult = z.infer<typeof EvaluateResultSchema>; |
| 139 | +export type CompletionParams = z.infer<typeof CompletionParamsSchema>; |
| 140 | +export type InputMetadata = z.infer<typeof InputMetadataSchema>; |
| 141 | +export type CompletionUsage = z.infer<typeof CompletionUsageSchema>; |
| 142 | +export type EvaluationRow = z.infer<typeof EvaluationRowSchema>; |
| 143 | +export type ResourceServerConfig = z.infer<typeof ResourceServerConfigSchema>; |
| 144 | +export type EvaluationCriteriaModel = z.infer<typeof EvaluationCriteriaModelSchema>; |
| 145 | +export type TaskDefinitionModel = z.infer<typeof TaskDefinitionModelSchema>; |
| 146 | +export type MCPConfigurationServerStdio = z.infer<typeof MCPConfigurationServerStdioSchema>; |
| 147 | +export type MCPConfigurationServerUrl = z.infer<typeof MCPConfigurationServerUrlSchema>; |
| 148 | +export type MCPMultiClientConfiguration = z.infer<typeof MCPMultiClientConfigurationSchema>; |
0 commit comments