Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion frontend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ dist/
static/tsdocs/
static/ui_builder/
ui_builder.tar.gz
ui_builder_serve/
ui_builder_serve/
src/lib/components/copilot/chat/flow/__tests__/eval/results/
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import OpenAI from 'openai'

/**
 * Result of an LLM-based comparison between a generated flow and a reference flow.
 */
export interface EvalComparisonResult {
  /** False when the evaluation itself failed (empty response, API error, bad JSON). */
  success: boolean
  /** 0-100 integer: how closely the generated flow matches the request and reference. */
  resemblanceScore: number
  /** Brief qualitative summary produced by the evaluator model. */
  statement: string
  /** Requirements from the user prompt that are missing or incorrectly implemented. */
  missingRequirements?: string[]
  /** Populated when `success` is false; describes why the evaluation failed. */
  error?: string
}

/**
 * Minimal flow shape used for comparison. Fields are intentionally loose
 * (`unknown`) because the flow is only serialized via JSON.stringify into
 * the evaluator prompt — no structural access is performed here.
 */
interface ExpectedFlow {
  /** Optional human-readable flow summary. */
  summary?: string
  value: {
    /** Flow steps; opaque to the evaluator, rendered as JSON in the prompt. */
    modules: unknown[]
  }
  /** Optional flow input schema; opaque, rendered as JSON in the prompt. */
  schema?: unknown
}

// System prompt for the judge model. It pins down Windmill flow semantics,
// the evaluation rubric, and — critically — a strict JSON-only response
// format ({resemblanceScore, statement, missingRequirements}) that
// evaluateFlowComparison() parses below. Keep the response-format section in
// sync with the parsing code if either changes.
const EVALUATOR_SYSTEM_PROMPT = `You are an expert evaluator for Windmill flow definitions. Your task is to evaluate a generated flow against:
1. The original user request/prompt
2. An expected reference flow

## Windmill Flow Context
- Flows consist of modules (steps) that execute sequentially
- Module types include: rawscript, forloopflow, branchone, branchall, script, flow, aiagent
- Each module has an id, value (containing type and config), and may have input_transforms
- input_transforms connect modules using expressions like "results.previous_step". Valid input_transforms are: static, javascript. Valid variables in javascript expressions are: results, flow_input, flow_input.iter.value (for forloopflow), flow_input.iter.index (for forloopflow).
- forloopflow contains nested modules that execute per iteration with access to flow_input.iter.value
- branchone executes first matching branch, branchall executes all matching branches
- Branches have conditional expressions (expr) that determine execution
- aiagent modules contain tools array with tool definitions

## Evaluation Criteria
1. **User Request Fulfillment**: Does the generated flow address ALL requirements from the user's original prompt?
- Are all requested steps present?
- Are the requested features implemented (loops, branches, specific logic)?
- Does the schema match what the user requested for inputs?
2. **Structure**: Are the module types and nesting structure appropriate for the task?
3. **Logic**: Does the flow accomplish the intended logical task?
4. **Connections**: Are input_transforms connecting data correctly between steps?
5. **Completeness**: Are all required steps present with no major omissions?
6. **Code Quality**: Is the code functionally correct (exact syntax doesn't need to match)?

## Important Notes
- Minor differences in variable names, code formatting, or exact wording are acceptable
- Focus on functional equivalence, not character-by-character matching
- The generated flow should achieve the same outcome as described in the user request
- Extra helper steps or slightly different approaches can still score high if they accomplish the goal
- If the user requested specific module types (like aiagent), verify they are used correctly

## Response Format
You MUST respond with valid JSON only, no additional text:
{
"resemblanceScore": <0-100 integer>,
"statement": "<brief 1-2 sentence summary of how well the flow matches the user request and expected flow>",
"missingRequirements": ["<list any requirements from user prompt that are missing or incorrectly implemented>"]
}

Score guidelines:
- 90-100: Fully addresses user request, functionally equivalent to expected flow
- 70-89: Addresses most user requirements, same overall structure with minor differences
- 50-69: Partially addresses user request, achieves similar goal but different approach
- 30-49: Missing significant requirements from user request
- 0-29: Does not address user request or significantly incorrect`

/**
 * Evaluates how well a generated flow matches an expected flow and user request using an LLM.
 * Returns a resemblance score (0-100), a qualitative statement, and any missing requirements.
 *
 * Never throws: all failures (missing API key, empty response, malformed JSON,
 * network/API errors) are reported via `success: false` plus `error`.
 *
 * @param generatedFlow - flow produced by the system under test
 * @param expectedFlow - reference flow the generation is compared against
 * @param userPrompt - original user request both flows are judged against
 */
export async function evaluateFlowComparison(
  generatedFlow: ExpectedFlow,
  expectedFlow: ExpectedFlow,
  userPrompt: string
): Promise<EvalComparisonResult> {
  const model = 'anthropic/claude-sonnet-4.5'
  // @ts-ignore
  const apiKey = process.env.OPENROUTER_API_KEY
  if (!apiKey) {
    // Fail fast with an actionable message instead of an opaque 401 from the API.
    return {
      success: false,
      resemblanceScore: 0,
      statement: 'Evaluation not run',
      error: 'OPENROUTER_API_KEY environment variable is not set'
    }
  }
  const client = new OpenAI({ baseURL: 'https://openrouter.ai/api/v1', apiKey })

  const userMessage = `## User's Original Request
${userPrompt}

## Expected Reference Flow
\`\`\`json
${JSON.stringify(expectedFlow, null, 2)}
\`\`\`

## Generated Flow
\`\`\`json
${JSON.stringify(generatedFlow, null, 2)}
\`\`\`

Please evaluate how well the generated flow:
1. Fulfills ALL requirements from the user's original request
2. Matches the structure and logic of the expected reference flow`

  try {
    const response = await client.chat.completions.create({
      model,
      messages: [
        { role: 'system', content: EVALUATOR_SYSTEM_PROMPT },
        { role: 'user', content: userMessage }
      ],
      // Deterministic scoring: no sampling randomness.
      temperature: 0
    })

    const content = response.choices[0]?.message?.content
    if (!content) {
      return {
        success: false,
        resemblanceScore: 0,
        statement: 'No response from evaluator',
        error: 'Empty response from LLM'
      }
    }

    // Parse JSON response - handle potential markdown code blocks
    let jsonContent = content.trim()
    if (jsonContent.startsWith('```')) {
      // Remove markdown code block wrapper
      jsonContent = jsonContent.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '')
    }

    const parsed = JSON.parse(jsonContent) as {
      resemblanceScore?: unknown
      statement?: unknown
      missingRequirements?: unknown
    }

    // Validate the model's output instead of trusting the `as` cast: a
    // non-numeric score would otherwise round to NaN and be reported as a
    // "successful" evaluation.
    if (
      typeof parsed.resemblanceScore !== 'number' ||
      !Number.isFinite(parsed.resemblanceScore)
    ) {
      return {
        success: false,
        resemblanceScore: 0,
        statement: 'Evaluation failed',
        error: `Evaluator returned malformed JSON (missing numeric resemblanceScore): ${jsonContent.slice(0, 200)}`
      }
    }

    return {
      success: true,
      // Clamp to the documented 0-100 integer range.
      resemblanceScore: Math.max(0, Math.min(100, Math.round(parsed.resemblanceScore))),
      statement: typeof parsed.statement === 'string' ? parsed.statement : '',
      missingRequirements: Array.isArray(parsed.missingRequirements)
        ? parsed.missingRequirements.filter((r): r is string => typeof r === 'string')
        : []
    }
  } catch (err) {
    const errorMessage = err instanceof Error ? err.message : String(err)
    return {
      success: false,
      resemblanceScore: 0,
      statement: 'Evaluation failed',
      error: errorMessage
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import type { FlowAIChatHelpers } from '../../core'
import type { FlowModule, InputTransform } from '$lib/gen'
import type { ExtendedOpenFlow } from '$lib/components/flows/types'
import { findModuleById } from '../../../shared'
import { inlineScriptStore, restoreInlineScriptReferences } from '../../inlineScriptsUtils'

/**
 * Creates mock FlowAIChatHelpers for eval testing.
 * Tracks flow state in memory and allows tool functions to modify it.
 *
 * @param initialModules - starting modules; deep-cloned so the caller's array is never mutated
 * @param initialSchema - optional flow input schema; defaults to an empty draft 2020-12 object schema
 * @returns the helpers plus accessors (`getFlow`, `getModules`) to inspect the in-memory flow
 */
export function createEvalHelpers(
  initialModules: FlowModule[] = [],
  initialSchema?: Record<string, any>
) {
  // Single in-memory flow, mutated in place by the helper callbacks below.
  // The binding itself is never reassigned, hence `const`.
  const flow: ExtendedOpenFlow = {
    value: { modules: structuredClone(initialModules) },
    summary: '',
    schema: initialSchema ?? {
      $schema: 'https://json-schema.org/draft/2020-12/schema',
      properties: {},
      required: [],
      type: 'object'
    }
  }

  const helpers: FlowAIChatHelpers = {
    // Eval runs have no UI selection, so report an empty selected id.
    getFlowAndSelectedId: () => ({ flow, selectedId: '' }),

    // Without an id: all top-level modules. With an id: the matching module
    // (searched recursively via findModuleById) wrapped in an array, or [].
    getModules: (id?: string) => {
      if (!id) return flow.value.modules
      const module = findModuleById(flow.value.modules, id)
      return module ? [module] : []
    },

    setSnapshot: () => {
      // No-op for eval - we don't need snapshot tracking
    },

    revertToSnapshot: () => {
      // No-op for eval
    },

    setCode: async (id: string, code: string) => {
      // Only rawscript modules carry inline content; other types are left untouched.
      const module = findModuleById(flow.value.modules, id)
      if (module && module.value.type === 'rawscript') {
        module.value.content = code
      }
      // Keep store coherent for subsequent set_flow_json calls with references
      inlineScriptStore.set(id, code)
    },

    setFlowJson: async (json: string) => {
      const parsed = JSON.parse(json)

      // Guard against non-object payloads: spreading e.g. a JSON string or
      // array below would merge its indices into flow.value and silently
      // corrupt the flow state.
      if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
        throw new Error('setFlowJson expects a JSON object')
      }

      // Restore inline script references back to full content (mirrors FlowAIChat.svelte)
      if (parsed.modules && Array.isArray(parsed.modules)) {
        parsed.modules = restoreInlineScriptReferences(parsed.modules)
      }

      flow.value = { ...flow.value, ...parsed }
      // Also update schema if provided
      if (parsed.schema !== undefined) {
        flow.schema = parsed.schema
      }
    },

    getFlowInputsSchema: async () => flow.schema ?? {},

    updateExprsToSet: (_id: string, _inputTransforms: Record<string, InputTransform>) => {
      // No-op for eval - UI-only functionality
    },

    acceptAllModuleActions: () => {
      // No-op for eval
    },

    rejectAllModuleActions: () => {
      // No-op for eval
    },

    hasPendingChanges: () => false,

    selectStep: (_id: string) => {
      // No-op for eval
    },

    testFlow: async () => {
      // Return mock job ID - we don't actually run flows in eval
      return 'mock-job-id-' + Date.now()
    }
  }

  return {
    helpers,
    getFlow: () => flow,
    getModules: () => flow.value.modules
  }
}
Loading