diff --git a/frontend/.gitignore b/frontend/.gitignore
index bc35cf2ec3557..f541fa8387df4 100644
--- a/frontend/.gitignore
+++ b/frontend/.gitignore
@@ -13,4 +13,5 @@ dist/
 static/tsdocs/
 static/ui_builder/
 ui_builder.tar.gz
-ui_builder_serve/
\ No newline at end of file
+ui_builder_serve/
+src/lib/components/copilot/chat/flow/__tests__/eval/results/
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalFlowComparison.ts b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalFlowComparison.ts
new file mode 100644
index 0000000000000..fcb8c317d45d1
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalFlowComparison.ts
@@ -0,0 +1,145 @@
+import OpenAI from 'openai'
+
+export interface EvalComparisonResult {
+	success: boolean
+	resemblanceScore: number
+	statement: string
+	missingRequirements?: string[]
+	error?: string
+}
+
+interface ExpectedFlow {
+	summary?: string
+	value: {
+		modules: unknown[]
+	}
+	schema?: unknown
+}
+
+const EVALUATOR_SYSTEM_PROMPT = `You are an expert evaluator for Windmill flow definitions. Your task is to evaluate a generated flow against:
+1. The original user request/prompt
+2. An expected reference flow
+
+## Windmill Flow Context
+- Flows consist of modules (steps) that execute sequentially
+- Module types include: rawscript, forloopflow, branchone, branchall, script, flow, aiagent
+- Each module has an id, value (containing type and config), and may have input_transforms
+- input_transforms connect modules using expressions like "results.previous_step". Valid input_transforms are: static, javascript. Valid variables in javascript expressions are: results, flow_input, flow_input.iter.value (for forloopflow), flow_input.iter.index (for forloopflow).
+- forloopflow contains nested modules that execute per iteration with access to flow_input.iter.value
+- branchone executes first matching branch, branchall executes all matching branches
+- Branches have conditional expressions (expr) that determine execution
+- aiagent modules contain tools array with tool definitions
+
+## Evaluation Criteria
+1. **User Request Fulfillment**: Does the generated flow address ALL requirements from the user's original prompt?
+   - Are all requested steps present?
+   - Are the requested features implemented (loops, branches, specific logic)?
+   - Does the schema match what the user requested for inputs?
+2. **Structure**: Are the module types and nesting structure appropriate for the task?
+3. **Logic**: Does the flow accomplish the intended logical task?
+4. **Connections**: Are input_transforms connecting data correctly between steps?
+5. **Completeness**: Are all required steps present with no major omissions?
+6. **Code Quality**: Is the code functionally correct (exact syntax doesn't need to match)?
+
+## Important Notes
+- Minor differences in variable names, code formatting, or exact wording are acceptable
+- Focus on functional equivalence, not character-by-character matching
+- The generated flow should achieve the same outcome as described in the user request
+- Extra helper steps or slightly different approaches can still score high if they accomplish the goal
+- If the user requested specific module types (like aiagent), verify they are used correctly
+
+## Response Format
+You MUST respond with valid JSON only, no additional text:
+{
+  "resemblanceScore": <0-100 integer>,
+  "statement": "<brief 1-2 sentence summary of how well the flow matches the user request and expected flow>",
+  "missingRequirements": ["<list any requirements from user prompt that are missing or incorrectly implemented>"]
+}
+
+Score guidelines:
+- 90-100: Fully addresses user request, functionally equivalent to expected flow
+- 70-89: Addresses most user requirements, same overall structure with minor differences
+- 50-69: Partially addresses user request, achieves similar goal but different approach
+- 30-49: Missing significant requirements from user request
+- 0-29: Does not address user request or significantly incorrect`
+
+/**
+ * Asks an LLM judge (via OpenRouter) how well a generated flow matches the user prompt and the
+ * expected flow. Client setup, API and parse failures are returned as `success: false` with `error`.
+ */
+export async function evaluateFlowComparison(
+	generatedFlow: ExpectedFlow,
+	expectedFlow: ExpectedFlow,
+	userPrompt: string
+): Promise<EvalComparisonResult> {
+	const model = 'anthropic/claude-sonnet-4.5'
+	// @ts-ignore
+	const apiKey = process.env.OPENROUTER_API_KEY
+
+	const userMessage = `## User's Original Request
+${userPrompt}
+
+## Expected Reference Flow
+\`\`\`json
+${JSON.stringify(expectedFlow, null, 2)}
+\`\`\`
+
+## Generated Flow
+\`\`\`json
+${JSON.stringify(generatedFlow, null, 2)}
+\`\`\`
+
+Please evaluate how well the generated flow:
+1. Fulfills ALL requirements from the user's original request
+2. Matches the structure and logic of the expected reference flow`
+
+	try {
+		// Built inside the try: the OpenAI client constructor throws when no API key is available
+		const client = new OpenAI({ baseURL: 'https://openrouter.ai/api/v1', apiKey })
+		const response = await client.chat.completions.create({
+			model,
+			messages: [
+				{ role: 'system', content: EVALUATOR_SYSTEM_PROMPT },
+				{ role: 'user', content: userMessage }
+			],
+			temperature: 0
+		})
+		const content = response.choices[0]?.message?.content
+		if (!content) {
+			return {
+				success: false,
+				resemblanceScore: 0,
+				statement: 'No response from evaluator',
+				error: 'Empty response from LLM'
+			}
+		}
+
+		// Parse JSON response - strip a markdown code block wrapper if present
+		let jsonContent = content.trim()
+		if (jsonContent.startsWith('```')) {
+			jsonContent = jsonContent.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '')
+		}
+
+		// Model output is untrusted: check field types instead of trusting a cast
+		const raw = (JSON.parse(jsonContent) ?? {}) as Record<string, unknown>
+		const { resemblanceScore: rawScore, statement, missingRequirements: missing } = raw
+		const score = typeof rawScore === 'string' ? Number(rawScore) : rawScore
+		if (typeof score !== 'number' || !Number.isFinite(score)) {
+			throw new Error('Evaluator response is missing a numeric resemblanceScore')
+		}
+		return {
+			success: true,
+			resemblanceScore: Math.max(0, Math.min(100, Math.round(score))),
+			statement: typeof statement === 'string' ? statement : '',
+			missingRequirements: Array.isArray(missing) ? missing.map(String) : []
+		}
+	} catch (err) {
+		const errorMessage = err instanceof Error ? err.message : String(err)
+		return {
+			success: false,
+			resemblanceScore: 0,
+			statement: 'Evaluation failed',
+			error: errorMessage
+		}
+	}
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalHelpers.ts b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalHelpers.ts
new file mode 100644
index 0000000000000..506a540c9588f
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalHelpers.ts
@@ -0,0 +1,98 @@
+import type { FlowAIChatHelpers } from '../../core'
+import type { FlowModule, InputTransform } from '$lib/gen'
+import type { ExtendedOpenFlow } from '$lib/components/flows/types'
+import { findModuleById } from '../../../shared'
+import { inlineScriptStore, restoreInlineScriptReferences } from '../../inlineScriptsUtils'
+
+/**
+ * Creates mock FlowAIChatHelpers for eval testing.
+ * Tracks flow state in memory and allows tool functions to modify it.
+ */
+export function createEvalHelpers(
+	initialModules: FlowModule[] = [],
+	initialSchema?: Record<string, any>
+) {
+	let flow: ExtendedOpenFlow = {
+		value: { modules: structuredClone(initialModules) },
+		summary: '',
+		schema: initialSchema ?? {
+			$schema: 'https://json-schema.org/draft/2020-12/schema',
+			properties: {},
+			required: [],
+			type: 'object'
+		}
+	}
+
+	const helpers: FlowAIChatHelpers = {
+		getFlowAndSelectedId: () => ({ flow, selectedId: '' }),
+
+		getModules: (id?: string) => {
+			if (!id) return flow.value.modules
+			const module = findModuleById(flow.value.modules, id)
+			return module ? [module] : []
+		},
+
+		setSnapshot: () => {
+			// No-op for eval - we don't need snapshot tracking
+		},
+
+		revertToSnapshot: () => {
+			// No-op for eval
+		},
+
+		setCode: async (id: string, code: string) => {
+			const module = findModuleById(flow.value.modules, id)
+			if (module && module.value.type === 'rawscript') {
+				module.value.content = code
+			}
+			// Keep store coherent for subsequent set_flow_json calls with references
+			inlineScriptStore.set(id, code)
+		},
+
+		setFlowJson: async (json: string) => {
+			const parsed = JSON.parse(json)
+
+			// Restore inline script references back to full content (mirrors FlowAIChat.svelte)
+			if (parsed.modules && Array.isArray(parsed.modules)) {
+				parsed.modules = restoreInlineScriptReferences(parsed.modules)
+			}
+
+			flow.value = { ...flow.value, ...parsed }
+			// Also update schema if provided
+			if (parsed.schema !== undefined) {
+				flow.schema = parsed.schema
+			}
+		},
+
+		getFlowInputsSchema: async () => flow.schema ?? {},
+
+		updateExprsToSet: (_id: string, _inputTransforms: Record<string, InputTransform>) => {
+			// No-op for eval - UI-only functionality
+		},
+
+		acceptAllModuleActions: () => {
+			// No-op for eval
+		},
+
+		rejectAllModuleActions: () => {
+			// No-op for eval
+		},
+
+		hasPendingChanges: () => false,
+
+		selectStep: (_id: string) => {
+			// No-op for eval
+		},
+
+		testFlow: async () => {
+			// Return mock job ID - we don't actually run flows in eval
+			return 'mock-job-id-' + Date.now()
+		}
+	}
+
+	return {
+		helpers,
+		getFlow: () => flow,
+		getModules: () => flow.value.modules
+	}
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalResultsWriter.ts b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalResultsWriter.ts
new file mode 100644
index 0000000000000..edc3150ecd8a2
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalResultsWriter.ts
@@ -0,0 +1,168 @@
+// @ts-ignore
+import { writeFile, mkdir } from 'fs/promises'
+// @ts-ignore
+import { join, dirname } from 'path'
+// @ts-ignore
+import { fileURLToPath } from 'url'
+import type { EvalResult } from './evalRunner'
+
+const __filename = fileURLToPath(import.meta.url)
+const __dirname = dirname(__filename)
+
+/**
+ * Generates a timestamp string suitable for filenames.
+ * Format: 2024-01-15T10-30-45.123Z (ISO with dashes instead of colons; the dot is kept)
+ */
+function generateTimestamp(): string {
+	return new Date().toISOString().replace(/:/g, '-')
+}
+
+/**
+ * Writes comparison results into a new `{timestamp}/` folder inside the results directory
+ * (`outputDir`, or `results/` next to this file). Creates `summary.md` (prompt, results table,
+ * evaluation details, errors) plus, per variant, `{variantName}.json` (run metadata) and
+ * `{variantName}_flow.json` (flow definition). `flowPaths` lists the `_flow.json` files.
+ */
+export async function writeComparisonResults(
+	userPrompt: string,
+	results: EvalResult[],
+	outputDir?: string
+): Promise<{ summaryPath: string; flowPaths: string[] }> {
+	const resultsDir = outputDir ?? join(__dirname, 'results')
+	const timestamp = generateTimestamp()
+
+	// Ensure results directory exists
+	await mkdir(resultsDir, { recursive: true })
+	const resultFolder = join(resultsDir, timestamp)
+	await mkdir(resultFolder, { recursive: true })
+
+	// Check if any results have evaluation data
+	const hasEvaluation = results.some((r) => r.evaluationResult)
+
+	// Build summary markdown
+	const summaryLines: string[] = [
+		`# Eval Results - ${timestamp}`,
+		'',
+		'## User Prompt',
+		'```',
+		userPrompt.trim(),
+		'```',
+		'',
+		'## Results',
+		''
+	]
+
+	// Add results table header based on whether evaluation data exists
+	if (hasEvaluation) {
+		summaryLines.push(
+			'| Variant | Success | Total Tokens | Tool Calls | Iterations | Resemblance Score |'
+		)
+		summaryLines.push(
+			'|---------|---------|--------------|------------|------------|-------------------|'
+		)
+	} else {
+		summaryLines.push('| Variant | Success | Total Tokens | Tool Calls | Iterations |')
+		summaryLines.push('|---------|---------|--------------|------------|------------|')
+	}
+
+	for (const result of results) {
+		const baseRow = `| ${result.variantName} | ${result.success} | ${result.tokenUsage.total} | ${result.toolsCalled.length} | ${result.iterations}`
+		if (hasEvaluation) {
+			const score = result.evaluationResult?.resemblanceScore ?? 'N/A'
+			summaryLines.push(`${baseRow} | ${score} |`)
+		} else {
+			summaryLines.push(`${baseRow} |`)
+		}
+	}
+
+	// Add evaluation details section if available
+	if (hasEvaluation) {
+		summaryLines.push('')
+		summaryLines.push('## Evaluation Details')
+		summaryLines.push('')
+		for (const result of results) {
+			if (result.evaluationResult) {
+				summaryLines.push(`### ${result.variantName}`)
+				summaryLines.push('')
+				summaryLines.push(`**Score:** ${result.evaluationResult.resemblanceScore}/100`)
+				summaryLines.push('')
+				summaryLines.push(`**Statement:** ${result.evaluationResult.statement}`)
+				summaryLines.push('')
+				if (
+					result.evaluationResult.missingRequirements &&
+					result.evaluationResult.missingRequirements.length > 0
+				) {
+					summaryLines.push('**Missing Requirements:**')
+					for (const req of result.evaluationResult.missingRequirements) {
+						summaryLines.push(`- ${req}`)
+					}
+					summaryLines.push('')
+				}
+				if (result.evaluationResult.error) {
+					summaryLines.push(`**Error:** ${result.evaluationResult.error}`)
+					summaryLines.push('')
+				}
+			}
+		}
+	}
+
+	// Add errors section for failed variants
+	const failedResults = results.filter((r) => !r.success && r.error)
+	if (failedResults.length > 0) {
+		summaryLines.push('')
+		summaryLines.push('## Errors')
+		summaryLines.push('')
+		for (const result of failedResults) {
+			summaryLines.push(`### ${result.variantName}`)
+			summaryLines.push('')
+			summaryLines.push('```')
+			summaryLines.push(result.error!)
+			summaryLines.push('```')
+			summaryLines.push('')
+		}
+	}
+
+	const flowPaths: string[] = []
+
+	for (const result of results) {
+		const resultFilename = `${result.variantName}.json`
+		const resultPath = join(resultFolder, resultFilename)
+		// flowPaths must list the flow definition files, not the run metadata files
+		const flowFilename = `${result.variantName}_flow.json`
+		const flowPath = join(resultFolder, flowFilename)
+		flowPaths.push(flowPath)
+
+		// Write result JSON file (with metadata)
+		const resultData = {
+			variantName: result.variantName,
+			success: result.success,
+			error: result.error,
+			evaluationResult: result.evaluationResult,
+			toolsCalled: result.toolsCalled,
+			toolCallDetails: result.toolCallDetails,
+			messages: result.messages
+		}
+		await writeFile(resultPath, JSON.stringify(resultData, null, 2))
+
+		// Write flow definition JSON file (clean flow format)
+		const flowData = {
+			summary: result.flow.summary ?? '',
+			value: {
+				modules: result.flow.value.modules
+			},
+			schema: result.flow.schema ?? {
+				$schema: 'https://json-schema.org/draft/2020-12/schema',
+				properties: {},
+				required: [],
+				type: 'object'
+			}
+		}
+		await writeFile(flowPath, JSON.stringify(flowData, null, 2))
+	}
+
+	// Write summary markdown file
+	const summaryPath = join(resultFolder, `summary.md`)
+	await writeFile(summaryPath, summaryLines.join('\n'))
+
+	return { summaryPath, flowPaths }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalRunner.ts b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalRunner.ts
new file mode 100644
index 0000000000000..d739c292943bf
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalRunner.ts
@@ -0,0 +1,272 @@
+import OpenAI, { APIError } from 'openai'
+import type { ChatCompletionMessageParam } from 'openai/resources/chat/completions.mjs'
+import { prepareFlowUserMessage } from '../../core'
+import { createEvalHelpers } from './evalHelpers'
+import type { FlowModule } from '$lib/gen'
+import type { ExtendedOpenFlow } from '$lib/components/flows/types'
+import type { ToolCallbacks } from '../../../shared'
+import { type VariantConfig, resolveSystemPrompt, resolveTools, resolveModel } from './evalVariants'
+import { evaluateFlowComparison, type EvalComparisonResult } from './evalFlowComparison'
+
+// Re-export for convenience
+export type { EvalComparisonResult } from './evalFlowComparison'
+
+export interface ToolCallDetail {
+	name: string
+	arguments: Record<string, any>
+}
+
+export interface EvalResult {
+	success: boolean
+	flow: ExtendedOpenFlow
+	error?: string
+	tokenUsage: {
+		prompt: number
+		completion: number
+		total: number
+	}
+	toolCallsCount: number
+	toolsCalled: string[]
+	toolCallDetails: ToolCallDetail[]
+	iterations: number
+	variantName: string
+	evaluationResult?: EvalComparisonResult
+	messages: ChatCompletionMessageParam[]
+}
+
+export interface ExpectedFlow {
+	summary?: string
+	value: {
+		modules: FlowModule[]
+	}
+	schema?: Record<string, any>
+}
+
+export interface EvalOptions {
+	initialModules?: FlowModule[]
+	initialSchema?: Record<string, any>
+	model?: string
+	customSystemPrompt?: string
+	maxIterations?: number
+	variant?: VariantConfig
+	expectedFlow?: ExpectedFlow
+}
+
+/**
+ * Runs a flow chat evaluation with real API calls, made via the OpenAI SDK against OpenRouter.
+ * Executes tool calls using the actual flowTools from core.ts or variant-configured tools.
+ */
+export async function runFlowEval(
+	userPrompt: string,
+	openaiApiKey: string,
+	options?: EvalOptions
+): Promise<EvalResult> {
+	const client = new OpenAI({ baseURL: 'https://openrouter.ai/api/v1', apiKey: openaiApiKey })
+	const { helpers, getFlow } = createEvalHelpers(
+		options?.initialModules ?? [],
+		options?.initialSchema
+	)
+
+	// Resolve variant configuration
+	const variantName = options?.variant?.name ?? 'baseline'
+	const systemMessage = resolveSystemPrompt(options?.variant, options?.customSystemPrompt)
+	const { toolDefs, tools } = resolveTools(options?.variant)
+	const model = resolveModel(options?.variant, options?.model)
+
+	// Build user message
+	const userMessage = prepareFlowUserMessage(userPrompt, helpers.getFlowAndSelectedId(), [])
+
+	const messages: ChatCompletionMessageParam[] = [systemMessage, userMessage]
+	const totalTokens = { prompt: 0, completion: 0, total: 0 }
+	let toolCallsCount = 0
+	const toolsCalled: string[] = []
+	const toolCallDetails: ToolCallDetail[] = []
+	let iterations = 0
+	const maxIterations = options?.maxIterations ?? 20
+
+	// No-op tool callbacks for eval
+	const toolCallbacks: ToolCallbacks = {
+		setToolStatus: () => {},
+		removeToolStatus: () => {}
+	}
+
+	try {
+		// Tool resolution loop
+		while (iterations < maxIterations) {
+			iterations++
+
+			const response = await client.chat.completions.create({
+				model,
+				messages,
+				tools: toolDefs,
+				temperature: 0
+			})
+
+			// Track token usage
+			if (response.usage) {
+				totalTokens.prompt += response.usage.prompt_tokens
+				totalTokens.completion += response.usage.completion_tokens
+				totalTokens.total += response.usage.total_tokens
+			}
+
+			if (!response.choices.length) {
+				throw new Error('No response from API')
+			}
+
+			const choice = response.choices[0]
+			const assistantMessage = choice.message
+
+			// Add assistant message to history
+			messages.push(assistantMessage)
+
+			// If no tool calls, we're done
+			if (!assistantMessage.tool_calls?.length) {
+				break
+			}
+
+			// Execute each tool call
+			for (const toolCall of assistantMessage.tool_calls) {
+				toolCallsCount++
+
+				// Type guard: only handle function tool calls
+				if (toolCall.type !== 'function') {
+					messages.push({
+						role: 'tool',
+						tool_call_id: toolCall.id,
+						content: `Unsupported tool type: ${toolCall.type}`
+					})
+					continue
+				}
+
+				toolsCalled.push(toolCall.function.name)
+
+				const tool = tools.find((t) => t.def.function.name === toolCall.function.name)
+				if (!tool) {
+					messages.push({
+						role: 'tool',
+						tool_call_id: toolCall.id,
+						content: `Unknown tool: ${toolCall.function.name}`
+					})
+					continue
+				}
+
+				try {
+					const args = JSON.parse(toolCall.function.arguments)
+					toolCallDetails.push({ name: toolCall.function.name, arguments: args })
+					const result = await tool.fn({
+						args,
+						workspace: 'test-workspace',
+						helpers,
+						toolCallbacks,
+						toolId: toolCall.id
+					})
+					messages.push({
+						role: 'tool',
+						tool_call_id: toolCall.id,
+						content: result
+					})
+				} catch (err) {
+					const errorMessage = err instanceof Error ? err.message : String(err)
+					messages.push({
+						role: 'tool',
+						tool_call_id: toolCall.id,
+						content: `Error: ${errorMessage}`
+					})
+				}
+			}
+		}
+
+		// Run evaluation if expected flow is provided
+		let evaluationResult: EvalComparisonResult | undefined
+		if (options?.expectedFlow) {
+			const generatedFlow = getFlow()
+			evaluationResult = await evaluateFlowComparison(
+				generatedFlow,
+				options.expectedFlow,
+				userPrompt
+			)
+		}
+
+		return {
+			success: true,
+			flow: getFlow(),
+			tokenUsage: totalTokens,
+			toolCallsCount,
+			toolsCalled,
+			toolCallDetails,
+			iterations,
+			variantName,
+			evaluationResult,
+			messages
+		}
+	} catch (err) {
+		// Build detailed error message
+		let errorMessage: string
+		if (err instanceof APIError) {
+			const details: string[] = [`${err.status} ${err.message}`]
+			if (err.code) details.push(`Code: ${err.code}`)
+			if (err.type) details.push(`Type: ${err.type}`)
+			if (err.param) details.push(`Param: ${err.param}`)
+			if (err.requestID) details.push(`Request ID: ${err.requestID}`)
+			if (err.error && typeof err.error === 'object') {
+				details.push(`Response: ${JSON.stringify(err.error, null, 2)}`)
+			}
+			errorMessage = details.join('\n')
+		} else if (err instanceof Error) {
+			errorMessage = err.stack ?? err.message
+		} else {
+			errorMessage = String(err)
+		}
+
+		// Still run evaluation on partial content if expected flow is provided
+		let evaluationResult: EvalComparisonResult | undefined
+		if (options?.expectedFlow) {
+			try {
+				const generatedFlow = getFlow()
+				evaluationResult = await evaluateFlowComparison(
+					generatedFlow,
+					options.expectedFlow,
+					userPrompt
+				)
+			} catch (evalErr) {
+				// If evaluation itself fails, just log it and continue
+				console.error('Evaluation failed:', evalErr)
+			}
+		}
+
+		return {
+			success: false,
+			flow: getFlow(),
+			error: errorMessage,
+			tokenUsage: totalTokens,
+			toolCallsCount,
+			toolsCalled,
+			toolCallDetails,
+			iterations,
+			variantName,
+			evaluationResult,
+			messages
+		}
+	}
+}
+
+/**
+ * Runs the same prompt against all variants concurrently (Promise.all); results keep input order.
+ * NOTE(review): runs share inlineScriptStore (written in evalHelpers) — confirm no cross-talk.
+ */
+export async function runVariantComparison(
+	userPrompt: string,
+	variants: VariantConfig[],
+	openaiApiKey: string,
+	baseOptions?: Omit<EvalOptions, 'variant'>
+): Promise<EvalResult[]> {
+	const results: EvalResult[] = await Promise.all(
+		variants.map(async (variant) => {
+			return await runFlowEval(userPrompt, openaiApiKey, {
+				...baseOptions,
+				variant
+			})
+		})
+	)
+	return results
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalVariants.ts b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalVariants.ts
new file mode 100644
index 0000000000000..f0bcd02f6656c
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/evalVariants.ts
@@ -0,0 +1,97 @@
+import type { ChatCompletionSystemMessageParam } from 'openai/resources/chat/completions.mjs'
+import type { ChatCompletionTool } from 'openai/resources/chat/completions.mjs'
+import { flowTools, prepareFlowSystemMessage } from '../../core'
+import type { Tool } from '../../../shared'
+import type { FlowAIChatHelpers } from '../../core'
+
+/**
+ * Configuration for a variant in eval testing.
+ * Allows customizing system prompt, tools, and model for comparison.
+ */
+export interface VariantConfig {
+	name: string
+	description?: string
+
+	/** System prompt configuration */
+	systemPrompt?:
+		| { type: 'default' }
+		| { type: 'default-with-custom'; custom: string }
+		| { type: 'custom'; content: string }
+
+	/** Tools configuration */
+	tools?:
+		| { type: 'default' }
+		| { type: 'subset'; include: string[] }
+		| { type: 'custom'; tools: Tool<FlowAIChatHelpers>[] }
+
+	/** Model to use (default: 'gpt-4o') */
+	model?: string
+}
+
+/**
+ * Builds the system message for an eval run from the variant's `systemPrompt` setting.
+ * `fallbackCustomPrompt` is used only when that setting is missing or of type 'default'.
+ */
+export function resolveSystemPrompt(
+	variant?: VariantConfig,
+	fallbackCustomPrompt?: string
+): ChatCompletionSystemMessageParam {
+	const promptConfig = variant?.systemPrompt
+	switch (promptConfig?.type) {
+		case 'custom':
+			// Replaces the default prompt entirely
+			return { role: 'system', content: promptConfig.content }
+
+		case 'default-with-custom':
+			return prepareFlowSystemMessage(promptConfig.custom)
+
+		default:
+			// No variant, no systemPrompt, or type === 'default'
+			return prepareFlowSystemMessage(fallbackCustomPrompt)
+	}
+}
+
+/**
+ * Resolves tools from variant config.
+ * Returns both the tool definitions (for API) and full tools (for execution):
+ * - no config or 'default': every tool in flowTools
+ * - 'subset': the flowTools whose function name is in `include` (unmatched names are ignored)
+ * - 'custom': exactly the tools supplied by the variant
+ */
+export function resolveTools(variant?: VariantConfig): {
+	toolDefs: ChatCompletionTool[]
+	tools: Tool<FlowAIChatHelpers>[]
+} {
+	if (!variant?.tools || variant.tools.type === 'default') {
+		return {
+			toolDefs: flowTools.map((t) => t.def),
+			tools: flowTools
+		}
+	}
+
+	if (variant.tools.type === 'subset') {
+		// Read `include` while the union is still narrowed; inside the callback the narrowing is
+		// lost, which would otherwise call for a non-null assertion and a cast
+		const { include } = variant.tools
+		const subset = flowTools.filter((t) => include.includes(t.def.function.name))
+		return {
+			toolDefs: subset.map((t) => t.def),
+			tools: subset
+		}
+	}
+
+	// type === 'custom'
+	const customTools = variant.tools.tools
+
+	return {
+		toolDefs: customTools.map((t) => t.def),
+		tools: customTools
+	}
+}
+
+/** Resolves the model id: `variant.model`, else `fallback`, else 'gpt-4o'.
+ * NOTE(review): evalRunner sends this id to OpenRouter, where ids are usually provider-prefixed
+ * (cf. 'anthropic/claude-sonnet-4.5' in the evaluator) — confirm bare 'gpt-4o' resolves. */
+export function resolveModel(variant?: VariantConfig, fallback?: string): string {
+	return variant?.model ?? fallback ?? 'gpt-4o'
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test1.json b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test1.json
new file mode 100644
index 0000000000000..bc0db45469d8c
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test1.json
@@ -0,0 +1,134 @@
+{
+    "summary": "",
+    "value": {
+        "modules": [
+            {
+                "id": "fetch_users",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main() {\n  return [\n    { id: 1, name: \"Alice\", role: \"admin\", active: true },\n    { id: 2, name: \"Bob\", role: \"user\", active: false },\n    { id: 4, name: \"Dana\", role: \"moderator\", active: true },\n    { id: 3, name: \"Charlie\", role: \"user\", active: true },\n  ];\n}",
+                    "input_transforms": {}
+                }
+            },
+            {
+                "id": "filter_active_users",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(users) {\n  return users.filter(user => user.active);\n}",
+                    "input_transforms": {
+                        "users": {
+                            "type": "javascript",
+                            "expr": "results.fetch_users"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "loop_users",
+                "value": {
+                    "type": "forloopflow",
+                    "iterator": {
+                        "type": "javascript",
+                        "expr": "results.filter_active_users"
+                    },
+                    "skip_failures": false,
+                    "modules": [
+                        {
+                            "id": "branch_user_role",
+                            "value": {
+                                "type": "branchone",
+                                "branches": [
+                                    {
+                                        "summary": "Admin Action",
+                                        "expr": "flow_input.iter.value.role === 'admin'",
+                                        "modules": [
+                                            {
+                                                "id": "admin_action",
+                                                "value": {
+                                                    "type": "rawscript",
+                                                    "language": "bun",
+                                                    "content": "export async function main(user) {\n  return `Admin action taken for ${user.name}`;\n}",
+                                                    "input_transforms": {
+                                                        "user": {
+                                                            "type": "javascript",
+                                                            "expr": "flow_input.iter.value"
+                                                        }
+                                                    }
+                                                }
+                                            }
+                                        ]
+                                    },
+                                    {
+                                        "summary": "User Action",
+                                        "expr": "flow_input.iter.value.role === 'user'",
+                                        "modules": [
+                                            {
+                                                "id": "user_action",
+                                                "value": {
+                                                    "type": "rawscript",
+                                                    "language": "bun",
+                                                    "content": "export async function main(user) {\n  return `User action taken for ${user.name}`;\n}",
+                                                    "input_transforms": {
+                                                        "user": {
+                                                            "type": "javascript",
+                                                            "expr": "flow_input.iter.value"
+                                                        }
+                                                    }
+                                                }
+                                            }
+                                        ]
+                                    },
+                                    {
+                                        "summary": "Moderator Action",
+                                        "expr": "flow_input.iter.value.role === 'moderator'",
+                                        "modules": [
+                                            {
+                                                "id": "moderator_action",
+                                                "value": {
+                                                    "type": "rawscript",
+                                                    "language": "bun",
+                                                    "content": "export async function main(user) { return `Moderator action taken for ${user.name}`; }",
+                                                    "input_transforms": {
+                                                        "user": {
+                                                            "type": "javascript",
+                                                            "expr": "flow_input.iter.value"
+                                                        }
+                                                    }
+                                                }
+                                            }
+                                        ]
+                                    }
+                                ],
+                                "default": []
+                            }
+                        }
+                    ],
+                    "parallel": false,
+                    "squash": false
+                }
+            },
+            {
+                "id": "return_actions",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(actions) {\n  return actions;\n}",
+                    "input_transforms": {
+                        "actions": {
+                            "type": "javascript",
+                            "expr": "results.loop_users"
+                        }
+                    }
+                }
+            }
+        ]
+    },
+    "schema": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "properties": {},
+        "required": [],
+        "type": "object"
+    }
+}
\ No newline at end of file
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test2.json b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test2.json
new file mode 100644
index 0000000000000..2cf2a5e5fa229
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test2.json
@@ -0,0 +1,183 @@
+{
+    "summary": "E-commerce Order Processing Pipeline",
+    "value": {
+        "modules": [
+            {
+                "id": "validate_order",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(items: { name: string; price: number; quantity: number }[]) {\n  const invalid = items.filter(item => item.price <= 0 || item.quantity <= 0);\n  return { valid: invalid.length === 0, invalidItems: invalid };\n}",
+                    "input_transforms": {
+                        "items": {
+                            "type": "javascript",
+                            "expr": "flow_input.items"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "calculate_total",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(items: { name: string; price: number; quantity: number }[]) {\n  const subtotal = items.reduce((sum, item) => sum + item.price * item.quantity, 0);\n  const tax = subtotal * 0.08;\n  return { subtotal, tax, total: subtotal + tax };\n}",
+                    "input_transforms": {
+                        "items": {
+                            "type": "javascript",
+                            "expr": "flow_input.items"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "check_inventory",
+                "value": {
+                    "type": "forloopflow",
+                    "iterator": {
+                        "type": "javascript",
+                        "expr": "flow_input.items"
+                    },
+                    "skip_failures": false,
+                    "parallel": true,
+                    "modules": [
+                        {
+                            "id": "check_item_stock",
+                            "value": {
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(item: { name: string; quantity: number }) {\n  // Mock inventory check - items with an even quantity, or a quantity below 5, are in stock\n  const inStock = item.quantity % 2 === 0 || item.quantity < 5;\n  return { name: item.name, requested: item.quantity, inStock, available: inStock ? item.quantity : 0 };\n}",
+                                "input_transforms": {
+                                    "item": {
+                                        "type": "javascript",
+                                        "expr": "flow_input.iter.value"
+                                    }
+                                }
+                            }
+                        }
+                    ]
+                }
+            },
+            {
+                "id": "process_inventory_result",
+                "value": {
+                    "type": "branchone",
+                    "branches": [
+                        {
+                            "summary": "All items in stock - create shipment",
+                            "expr": "results.check_inventory.every(item => item.inStock)",
+                            "modules": [
+                                {
+                                    "id": "create_shipment",
+                                    "value": {
+                                        "type": "rawscript",
+                                        "language": "bun",
+                                        "content": "export async function main(shipping_address: string, total: number) {\n  return {\n    shipment_id: `SHIP-${Date.now()}`,\n    status: 'created',\n    address: shipping_address,\n    total,\n    estimated_delivery: '3-5 business days'\n  };\n}",
+                                        "input_transforms": {
+                                            "shipping_address": {
+                                                "type": "javascript",
+                                                "expr": "flow_input.shipping_address"
+                                            },
+                                            "total": {
+                                                "type": "javascript",
+                                                "expr": "results.calculate_total.total"
+                                            }
+                                        }
+                                    }
+                                }
+                            ]
+                        }
+                    ],
+                    "default": [
+                        {
+                            "id": "create_backorder",
+                            "value": {
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(inventory_results: { name: string; inStock: boolean }[]) {\n  const outOfStock = inventory_results.filter(item => !item.inStock);\n  return {\n    backorder_id: `BO-${Date.now()}`,\n    status: 'backorder',\n    unavailable_items: outOfStock.map(item => item.name),\n    message: 'Some items are out of stock. We will notify you when available.'\n  };\n}",
+                                "input_transforms": {
+                                    "inventory_results": {
+                                        "type": "javascript",
+                                        "expr": "results.check_inventory"
+                                    }
+                                }
+                            }
+                        }
+                    ]
+                }
+            },
+            {
+                "id": "send_confirmation",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(customer_email: string, order_result: any) {\n  // Mock email sending\n  return {\n    email_sent: true,\n    to: customer_email,\n    subject: order_result.shipment_id ? 'Order Confirmed' : 'Order Update - Backorder',\n    sent_at: new Date().toISOString()\n  };\n}",
+                    "input_transforms": {
+                        "customer_email": {
+                            "type": "javascript",
+                            "expr": "flow_input.customer_email"
+                        },
+                        "order_result": {
+                            "type": "javascript",
+                            "expr": "results.process_inventory_result"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "return_summary",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(validation: any, totals: any, order_result: any, confirmation: any) {\n  return {\n    order_valid: validation.valid,\n    totals,\n    order_status: order_result.shipment_id ? 'shipped' : 'backorder',\n    order_id: order_result.shipment_id || order_result.backorder_id,\n    confirmation_sent: confirmation.email_sent\n  };\n}",
+                    "input_transforms": {
+                        "validation": {
+                            "type": "javascript",
+                            "expr": "results.validate_order"
+                        },
+                        "totals": {
+                            "type": "javascript",
+                            "expr": "results.calculate_total"
+                        },
+                        "order_result": {
+                            "type": "javascript",
+                            "expr": "results.process_inventory_result"
+                        },
+                        "confirmation": {
+                            "type": "javascript",
+                            "expr": "results.send_confirmation"
+                        }
+                    }
+                }
+            }
+        ]
+    },
+    "schema": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "type": "object",
+        "properties": {
+            "items": {
+                "type": "array",
+                "description": "Array of order items",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "name": { "type": "string" },
+                        "price": { "type": "number" },
+                        "quantity": { "type": "integer" }
+                    },
+                    "required": ["name", "price", "quantity"]
+                }
+            },
+            "customer_email": {
+                "type": "string",
+                "description": "Customer email address"
+            },
+            "shipping_address": {
+                "type": "string",
+                "description": "Shipping address"
+            }
+        },
+        "required": ["items", "customer_email", "shipping_address"]
+    }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test3.json b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test3.json
new file mode 100644
index 0000000000000..28e4b7981f378
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test3.json
@@ -0,0 +1,204 @@
+{
+    "summary": "Data Pipeline with Quality-Based Routing",
+    "value": {
+        "modules": [
+            {
+                "id": "fetch_data_sources",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main() {\n  return [\n    { id: 'source_1', url: 'https://api.example.com/data1' },\n    { id: 'source_2', url: 'https://api.example.com/data2' },\n    { id: 'source_3', url: 'https://api.example.com/data3' }\n  ];\n}",
+                    "input_transforms": {}
+                }
+            },
+            {
+                "id": "process_sources",
+                "value": {
+                    "type": "forloopflow",
+                    "iterator": {
+                        "type": "javascript",
+                        "expr": "results.fetch_data_sources"
+                    },
+                    "skip_failures": false,
+                    "parallel": true,
+                    "modules": [
+                        {
+                            "id": "fetch_raw_data",
+                            "value": {
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(source: { id: string; url: string }) {\n  // Mock fetch returning sample records\n  return {\n    source_id: source.id,\n    records: [\n      { id: 1, value: 'data_a', valid: true },\n      { id: 2, value: '', valid: false },\n      { id: 3, value: 'data_c', valid: true },\n      { id: 4, value: 'data_d', valid: true }\n    ]\n  };\n}",
+                                "input_transforms": {
+                                    "source": {
+                                        "type": "javascript",
+                                        "expr": "flow_input.iter.value"
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "id": "transform_data",
+                            "value": {
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(raw_data: { source_id: string; records: any[] }) {\n  // Filter out invalid entries\n  const cleaned = raw_data.records.filter(r => r.valid && r.value);\n  return {\n    source_id: raw_data.source_id,\n    original_count: raw_data.records.length,\n    cleaned_count: cleaned.length,\n    records: cleaned\n  };\n}",
+                                "input_transforms": {
+                                    "raw_data": {
+                                        "type": "javascript",
+                                        "expr": "results.fetch_raw_data"
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "id": "validate_data",
+                            "value": {
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(transformed: { source_id: string; original_count: number; cleaned_count: number; records: any[] }) {\n  // Calculate validation score based on data quality\n  const score = Math.round((transformed.cleaned_count / transformed.original_count) * 100);\n  return {\n    source_id: transformed.source_id,\n    records: transformed.records,\n    validation_score: score,\n    record_count: transformed.cleaned_count\n  };\n}",
+                                "input_transforms": {
+                                    "transformed": {
+                                        "type": "javascript",
+                                        "expr": "results.transform_data"
+                                    }
+                                }
+                            }
+                        }
+                    ]
+                }
+            },
+            {
+                "id": "aggregate_data",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(processed_sources: any[]) {\n  const all_records = processed_sources.flatMap(s => s.records);\n  const total_count = all_records.length;\n  return {\n    combined_records: all_records,\n    total_record_count: total_count,\n    sources_processed: processed_sources.length\n  };\n}",
+                    "input_transforms": {
+                        "processed_sources": {
+                            "type": "javascript",
+                            "expr": "results.process_sources"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "calculate_quality_score",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(processed_sources: any[]) {\n  const scores = processed_sources.map(s => s.validation_score);\n  const average = Math.round(scores.reduce((a, b) => a + b, 0) / scores.length);\n  return { quality_score: average, individual_scores: scores };\n}",
+                    "input_transforms": {
+                        "processed_sources": {
+                            "type": "javascript",
+                            "expr": "results.process_sources"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "route_by_quality",
+                "value": {
+                    "type": "branchone",
+                    "branches": [
+                        {
+                            "summary": "High quality - store in primary database",
+                            "expr": "results.calculate_quality_score.quality_score >= 90",
+                            "modules": [
+                                {
+                                    "id": "store_primary",
+                                    "value": {
+                                        "type": "rawscript",
+                                        "language": "bun",
+                                        "content": "export async function main(data: any, score: number) {\n  return {\n    destination: 'primary_database',\n    status: 'success',\n    records_stored: data.total_record_count,\n    quality_score: score\n  };\n}",
+                                        "input_transforms": {
+                                            "data": {
+                                                "type": "javascript",
+                                                "expr": "results.aggregate_data"
+                                            },
+                                            "score": {
+                                                "type": "javascript",
+                                                "expr": "results.calculate_quality_score.quality_score"
+                                            }
+                                        }
+                                    }
+                                }
+                            ]
+                        },
+                        {
+                            "summary": "Medium quality - store in secondary with warning",
+                            "expr": "results.calculate_quality_score.quality_score >= 70",
+                            "modules": [
+                                {
+                                    "id": "store_secondary",
+                                    "value": {
+                                        "type": "rawscript",
+                                        "language": "bun",
+                                        "content": "export async function main(data: any, score: number) {\n  return {\n    destination: 'secondary_database',\n    status: 'warning',\n    warning_message: 'Data quality below optimal threshold',\n    records_stored: data.total_record_count,\n    quality_score: score\n  };\n}",
+                                        "input_transforms": {
+                                            "data": {
+                                                "type": "javascript",
+                                                "expr": "results.aggregate_data"
+                                            },
+                                            "score": {
+                                                "type": "javascript",
+                                                "expr": "results.calculate_quality_score.quality_score"
+                                            }
+                                        }
+                                    }
+                                }
+                            ]
+                        }
+                    ],
+                    "default": [
+                        {
+                            "id": "store_quarantine",
+                            "value": {
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(data: any, score: number) {\n  return {\n    destination: 'quarantine',\n    status: 'alert',\n    alert_message: 'Data quality critically low - requires review',\n    records_quarantined: data.total_record_count,\n    quality_score: score\n  };\n}",
+                                "input_transforms": {
+                                    "data": {
+                                        "type": "javascript",
+                                        "expr": "results.aggregate_data"
+                                    },
+                                    "score": {
+                                        "type": "javascript",
+                                        "expr": "results.calculate_quality_score.quality_score"
+                                    }
+                                }
+                            }
+                        }
+                    ]
+                }
+            },
+            {
+                "id": "generate_report",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(aggregated: any, quality: any, storage_result: any) {\n  return {\n    report: {\n      total_records: aggregated.total_record_count,\n      sources_processed: aggregated.sources_processed,\n      quality_score: quality.quality_score,\n      individual_scores: quality.individual_scores,\n      destination: storage_result.destination,\n      status: storage_result.status,\n      processed_at: new Date().toISOString()\n    }\n  };\n}",
+                    "input_transforms": {
+                        "aggregated": {
+                            "type": "javascript",
+                            "expr": "results.aggregate_data"
+                        },
+                        "quality": {
+                            "type": "javascript",
+                            "expr": "results.calculate_quality_score"
+                        },
+                        "storage_result": {
+                            "type": "javascript",
+                            "expr": "results.route_by_quality"
+                        }
+                    }
+                }
+            }
+        ]
+    },
+    "schema": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "type": "object",
+        "properties": {},
+        "required": []
+    }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test4.json b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test4.json
new file mode 100644
index 0000000000000..e7ef1ff12749c
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test4.json
@@ -0,0 +1,175 @@
+{
+    "summary": "AI-Powered Customer Support with Tools",
+    "value": {
+        "modules": [
+            {
+                "id": "fetch_customer_profile",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(customer_id: string) {\n  // Mock customer profile and order history\n  return {\n    customer_id,\n    name: 'John Doe',\n    email: 'john.doe@example.com',\n    membership_tier: 'gold',\n    recent_orders: [\n      { order_id: 'ORD-001', date: '2024-01-15', total: 149.99, status: 'delivered' },\n      { order_id: 'ORD-002', date: '2024-02-20', total: 89.50, status: 'shipped' }\n    ],\n    support_history: [\n      { ticket_id: 'TKT-100', issue: 'Delivery delay', resolved: true }\n    ]\n  };\n}",
+                    "input_transforms": {
+                        "customer_id": {
+                            "type": "javascript",
+                            "expr": "flow_input.customer_id"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "support_agent",
+                "value": {
+                    "type": "aiagent",
+                    "input_transforms": {
+                        "provider": {
+                            "type": "static",
+                            "value": "$res:f/ai_providers/openai"
+                        },
+                        "output_type": {
+                            "type": "static",
+                            "value": "text"
+                        },
+                        "user_message": {
+                            "type": "javascript",
+                            "expr": "`Customer Profile:\nName: ${results.fetch_customer_profile.name}\nMembership: ${results.fetch_customer_profile.membership_tier}\nRecent Orders: ${JSON.stringify(results.fetch_customer_profile.recent_orders)}\n\nCustomer Query: ${flow_input.query_text}`"
+                        },
+                        "system_prompt": {
+                            "type": "static",
+                            "value": "You are a helpful customer support agent. Use the available tools to look up order information, check refund eligibility, create support tickets, or search FAQs to help resolve customer queries. Be professional and empathetic."
+                        }
+                    },
+                    "tools": [
+                        {
+                            "id": "lookup_order",
+                            "summary": "Look up order details by order ID. Returns order status, items, and shipping information.",
+                            "value": {
+                                "tool_type": "flowmodule",
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(order_id: string) {\n  // Mock order lookup\n  return {\n    order_id,\n    status: 'shipped',\n    items: [\n      { name: 'Wireless Headphones', quantity: 1, price: 79.99 },\n      { name: 'Phone Case', quantity: 2, price: 19.99 }\n    ],\n    shipping: {\n      carrier: 'FedEx',\n      tracking_number: 'FX123456789',\n      estimated_delivery: '2024-03-01'\n    },\n    total: 119.97\n  };\n}",
+                                "input_transforms": {
+                                    "order_id": {
+                                        "type": "static",
+                                        "value": null
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "id": "check_refund_eligibility",
+                            "summary": "Check if an order is eligible for refund. Returns eligibility status and reason.",
+                            "value": {
+                                "tool_type": "flowmodule",
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(order_id: string) {\n  // Mock refund eligibility check\n  return {\n    order_id,\n    eligible: true,\n    reason: 'Within 30-day return window',\n    refund_amount: 119.97,\n    refund_method: 'original_payment_method',\n    processing_time: '5-7 business days'\n  };\n}",
+                                "input_transforms": {
+                                    "order_id": {
+                                        "type": "static",
+                                        "value": null
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "id": "create_support_ticket",
+                            "summary": "Create a support ticket with specified description and priority (low, medium, high).",
+                            "value": {
+                                "tool_type": "flowmodule",
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(description: string, priority: 'low' | 'medium' | 'high') {\n  // Mock ticket creation\n  return {\n    ticket_id: `TKT-${Date.now()}`,\n    description,\n    priority,\n    status: 'open',\n    created_at: new Date().toISOString(),\n    estimated_response: priority === 'high' ? '2 hours' : priority === 'medium' ? '24 hours' : '48 hours'\n  };\n}",
+                                "input_transforms": {
+                                    "description": {
+                                        "type": "static",
+                                        "value": null
+                                    },
+                                    "priority": {
+                                        "type": "static",
+                                        "value": null
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "id": "search_faq",
+                            "summary": "Search the FAQ database for relevant answers to common questions.",
+                            "value": {
+                                "tool_type": "flowmodule",
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(search_query: string) {\n  // Mock FAQ search\n  return {\n    query: search_query,\n    results: [\n      {\n        question: 'How do I track my order?',\n        answer: 'You can track your order by logging into your account and clicking on \"Order History\". Each order has a tracking link.',\n        relevance: 0.95\n      },\n      {\n        question: 'What is the return policy?',\n        answer: 'We offer a 30-day return policy for all unused items in original packaging. Refunds are processed within 5-7 business days.',\n        relevance: 0.85\n      }\n    ]\n  };\n}",
+                                "input_transforms": {
+                                    "search_query": {
+                                        "type": "static",
+                                        "value": null
+                                    }
+                                }
+                            }
+                        }
+                    ],
+                    "parallel": true
+                }
+            },
+            {
+                "id": "log_interaction",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(customer_id: string, query: string, agent_response: string) {\n  // Mock audit logging\n  return {\n    logged: true,\n    log_id: `LOG-${Date.now()}`,\n    timestamp: new Date().toISOString(),\n    customer_id,\n    query_summary: query.substring(0, 100),\n    response_length: agent_response.length\n  };\n}",
+                    "input_transforms": {
+                        "customer_id": {
+                            "type": "javascript",
+                            "expr": "flow_input.customer_id"
+                        },
+                        "query": {
+                            "type": "javascript",
+                            "expr": "flow_input.query_text"
+                        },
+                        "agent_response": {
+                            "type": "javascript",
+                            "expr": "results.support_agent"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "return_response",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(agent_response: string, log_info: any, customer_profile: any) {\n  return {\n    response: agent_response,\n    customer_name: customer_profile.name,\n    interaction_logged: log_info.logged,\n    log_id: log_info.log_id\n  };\n}",
+                    "input_transforms": {
+                        "agent_response": {
+                            "type": "javascript",
+                            "expr": "results.support_agent"
+                        },
+                        "log_info": {
+                            "type": "javascript",
+                            "expr": "results.log_interaction"
+                        },
+                        "customer_profile": {
+                            "type": "javascript",
+                            "expr": "results.fetch_customer_profile"
+                        }
+                    }
+                }
+            }
+        ]
+    },
+    "schema": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "type": "object",
+        "properties": {
+            "customer_id": {
+                "type": "string",
+                "description": "The unique identifier of the customer"
+            },
+            "query_text": {
+                "type": "string",
+                "description": "The customer's support query text"
+            }
+        },
+        "required": ["customer_id", "query_text"]
+    }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test5_modify_simple.json b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test5_modify_simple.json
new file mode 100644
index 0000000000000..29e11b7c46735
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test5_modify_simple.json
@@ -0,0 +1,68 @@
+{
+    "summary": "Simple data pipeline with validation",
+    "value": {
+        "modules": [
+            {
+                "id": "fetch_data",
+                "summary": "Fetch data from API",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main() {\n  // Mock API call\n  return [\n    { id: 1, name: \"Item 1\", value: 100 },\n    { id: 2, name: \"Item 2\", value: 200 },\n    { id: 3, name: \"Item 3\", value: 300 }\n  ];\n}",
+                    "input_transforms": {}
+                }
+            },
+            {
+                "id": "process_data",
+                "summary": "Process the fetched data",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(data: any[]) {\n  // Apply some transformation\n  return data.map(item => ({\n    ...item,\n    value: item.value * 1.1,\n    processed: true\n  }));\n}",
+                    "input_transforms": {
+                        "data": {
+                            "type": "javascript",
+                            "expr": "results.fetch_data"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "validate_data",
+                "summary": "Validate processed data",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(data: any[]) {\n  if (!data || data.length === 0) {\n    return { error: true, message: \"No data to save\" };\n  }\n  return { error: false, data: data };\n}",
+                    "input_transforms": {
+                        "data": {
+                            "type": "javascript",
+                            "expr": "results.process_data"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "save_results",
+                "summary": "Save results to database",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(validated: any) {\n  if (validated.error) {\n    return { saved: 0, status: \"skipped\", reason: validated.message };\n  }\n  // Mock database save\n  console.log(`Saving ${validated.data.length} items to database`);\n  return { saved: validated.data.length, status: \"success\" };\n}",
+                    "input_transforms": {
+                        "validated": {
+                            "type": "javascript",
+                            "expr": "results.validate_data"
+                        }
+                    }
+                }
+            }
+        ]
+    },
+    "schema": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "properties": {},
+        "required": [],
+        "type": "object"
+    }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test6_modify_medium.json b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test6_modify_medium.json
new file mode 100644
index 0000000000000..a638c64c33f3a
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test6_modify_medium.json
@@ -0,0 +1,142 @@
+{
+    "summary": "Order processing flow with type-based branching",
+    "value": {
+        "modules": [
+            {
+                "id": "get_orders",
+                "summary": "Fetch list of orders",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main() {\n  // Mock orders from database\n  return [\n    { id: \"ORD-001\", type: \"express\", items: 3, total: 150 },\n    { id: \"ORD-002\", type: \"standard\", items: 5, total: 280 },\n    { id: \"ORD-003\", type: \"pickup\", items: 2, total: 75 },\n    { id: \"ORD-004\", type: \"express\", items: 1, total: 50 }\n  ];\n}",
+                    "input_transforms": {}
+                }
+            },
+            {
+                "id": "loop_orders",
+                "summary": "Process each order",
+                "value": {
+                    "type": "forloopflow",
+                    "iterator": {
+                        "type": "javascript",
+                        "expr": "results.get_orders"
+                    },
+                    "skip_failures": false,
+                    "parallel": false,
+                    "modules": [
+                        {
+                            "id": "branch_order_type",
+                            "summary": "Branch based on order type",
+                            "value": {
+                                "type": "branchone",
+                                "branches": [
+                                    {
+                                        "summary": "Express Order",
+                                        "expr": "flow_input.iter.value.type === 'express'",
+                                        "modules": [
+                                            {
+                                                "id": "handle_express",
+                                                "summary": "Handle express order",
+                                                "value": {
+                                                    "type": "rawscript",
+                                                    "language": "bun",
+                                                    "content": "export async function main(order: any) {\n  // Mark as priority and calculate express shipping\n  const expressShippingCost = 15.99;\n  return {\n    orderId: order.id,\n    priority: true,\n    shippingCost: expressShippingCost,\n    shippingType: \"express\",\n    estimatedDays: 1\n  };\n}",
+                                                    "input_transforms": {
+                                                        "order": {
+                                                            "type": "javascript",
+                                                            "expr": "flow_input.iter.value"
+                                                        }
+                                                    }
+                                                }
+                                            }
+                                        ]
+                                    },
+                                    {
+                                        "summary": "Standard Order",
+                                        "expr": "flow_input.iter.value.type === 'standard'",
+                                        "modules": [
+                                            {
+                                                "id": "handle_standard",
+                                                "summary": "Handle standard order",
+                                                "value": {
+                                                    "type": "rawscript",
+                                                    "language": "bun",
+                                                    "content": "export async function main(order: any) {\n  // Calculate standard shipping cost\n  const standardShippingCost = 5.99;\n  return {\n    orderId: order.id,\n    priority: false,\n    shippingCost: standardShippingCost,\n    shippingType: \"standard\",\n    estimatedDays: 5\n  };\n}",
+                                                    "input_transforms": {
+                                                        "order": {
+                                                            "type": "javascript",
+                                                            "expr": "flow_input.iter.value"
+                                                        }
+                                                    }
+                                                }
+                                            }
+                                        ]
+                                    },
+                                    {
+                                        "summary": "Pickup Order",
+                                        "expr": "flow_input.iter.value.type === 'pickup'",
+                                        "modules": [
+                                            {
+                                                "id": "handle_pickup",
+                                                "summary": "Handle pickup order",
+                                                "value": {
+                                                    "type": "rawscript",
+                                                    "language": "bun",
+                                                    "content": "export async function main(order: any) {\n  // Mark as no shipping required\n  return {\n    orderId: order.id,\n    priority: false,\n    shippingCost: 0,\n    shippingType: \"pickup\",\n    estimatedDays: 0,\n    pickupLocation: \"Store #1\"\n  };\n}",
+                                                    "input_transforms": {
+                                                        "order": {
+                                                            "type": "javascript",
+                                                            "expr": "flow_input.iter.value"
+                                                        }
+                                                    }
+                                                }
+                                            }
+                                        ]
+                                    }
+                                ],
+                                "default": [
+                                    {
+                                        "id": "process_order",
+                                        "summary": "Process unknown order type",
+                                        "value": {
+                                            "type": "rawscript",
+                                            "language": "bun",
+                                            "content": "export async function main(order: any) {\n  console.log(`Processing order ${order.id} with unknown type`);\n  return {\n    orderId: order.id,\n    processed: true,\n    shippingType: \"unknown\",\n    timestamp: new Date().toISOString()\n  };\n}",
+                                            "input_transforms": {
+                                                "order": {
+                                                    "type": "javascript",
+                                                    "expr": "flow_input.iter.value"
+                                                }
+                                            }
+                                        }
+                                    }
+                                ]
+                            }
+                        }
+                    ]
+                }
+            },
+            {
+                "id": "summarize",
+                "summary": "Return processing summary",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(results: any[]) {\n  return {\n    totalProcessed: results.length,\n    processedAt: new Date().toISOString()\n  };\n}",
+                    "input_transforms": {
+                        "results": {
+                            "type": "javascript",
+                            "expr": "results.loop_orders"
+                        }
+                    }
+                }
+            }
+        ]
+    },
+    "schema": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "properties": {},
+        "required": [],
+        "type": "object"
+    }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test7_modify_complex.json b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test7_modify_complex.json
new file mode 100644
index 0000000000000..eb3bf53cbecd5
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/expected/test7_modify_complex.json
@@ -0,0 +1,136 @@
+{
+    "summary": "Data enrichment flow with parallel processing",
+    "value": {
+        "modules": [
+            {
+                "id": "get_item",
+                "summary": "Get item from input",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(item_id: string) {\n  // Mock item lookup\n  return {\n    id: item_id,\n    name: \"Product \" + item_id,\n    sku: \"SKU-\" + item_id\n  };\n}",
+                    "input_transforms": {
+                        "item_id": {
+                            "type": "javascript",
+                            "expr": "flow_input.item_id"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "parallel_enrichment",
+                "summary": "Enrich data in parallel",
+                "value": {
+                    "type": "branchall",
+                    "branches": [
+                        {
+                            "summary": "Price enrichment",
+                            "modules": [
+                                {
+                                    "id": "enrich_price",
+                                    "summary": "Call pricing API",
+                                    "value": {
+                                        "type": "rawscript",
+                                        "language": "bun",
+                                        "content": "export async function main(item: any) {\n  // Mock pricing API call with timeout handling\n  try {\n    return {\n      itemId: item.id,\n      price: 99.99,\n      currency: \"USD\",\n      discount: 10\n    };\n  } catch (e) {\n    return { itemId: item.id, price: 0, currency: \"USD\", fallback: true };\n  }\n}",
+                                        "input_transforms": {
+                                            "item": {
+                                                "type": "javascript",
+                                                "expr": "results.get_item"
+                                            }
+                                        }
+                                    }
+                                }
+                            ]
+                        },
+                        {
+                            "summary": "Inventory enrichment",
+                            "modules": [
+                                {
+                                    "id": "enrich_inventory",
+                                    "summary": "Call inventory API",
+                                    "value": {
+                                        "type": "rawscript",
+                                        "language": "bun",
+                                        "content": "export async function main(item: any) {\n  // Mock inventory API call with timeout handling\n  try {\n    return {\n      itemId: item.id,\n      inStock: true,\n      quantity: 150,\n      warehouse: \"WH-001\"\n    };\n  } catch (e) {\n    return { itemId: item.id, inStock: false, quantity: 0, fallback: true };\n  }\n}",
+                                        "input_transforms": {
+                                            "item": {
+                                                "type": "javascript",
+                                                "expr": "results.get_item"
+                                            }
+                                        }
+                                    }
+                                }
+                            ]
+                        },
+                        {
+                            "summary": "Reviews enrichment",
+                            "modules": [
+                                {
+                                    "id": "enrich_reviews",
+                                    "summary": "Call reviews API",
+                                    "value": {
+                                        "type": "rawscript",
+                                        "language": "bun",
+                                        "content": "export async function main(item: any) {\n  // Mock reviews API call with timeout handling\n  try {\n    return {\n      itemId: item.id,\n      averageRating: 4.5,\n      reviewCount: 127,\n      topReview: \"Great product!\"\n    };\n  } catch (e) {\n    return { itemId: item.id, averageRating: 0, reviewCount: 0, fallback: true };\n  }\n}",
+                                        "input_transforms": {
+                                            "item": {
+                                                "type": "javascript",
+                                                "expr": "results.get_item"
+                                            }
+                                        }
+                                    }
+                                }
+                            ]
+                        }
+                    ]
+                }
+            },
+            {
+                "id": "combine_data",
+                "summary": "Combine all enrichment data",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(item: any, parallel_results: any) {\n  // Extract results from parallel branches\n  const [priceResult, inventoryResult, reviewsResult] = parallel_results;\n  return {\n    ...item,\n    pricing: priceResult,\n    inventory: inventoryResult,\n    reviews: reviewsResult,\n    hasFallbacks: priceResult?.fallback || inventoryResult?.fallback || reviewsResult?.fallback\n  };\n}",
+                    "input_transforms": {
+                        "item": {
+                            "type": "javascript",
+                            "expr": "results.get_item"
+                        },
+                        "parallel_results": {
+                            "type": "javascript",
+                            "expr": "results.parallel_enrichment"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "return_result",
+                "summary": "Return final result",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(enriched_item: any) {\n  return {\n    success: true,\n    data: enriched_item,\n    enrichedAt: new Date().toISOString()\n  };\n}",
+                    "input_transforms": {
+                        "enriched_item": {
+                            "type": "javascript",
+                            "expr": "results.combine_data"
+                        }
+                    }
+                }
+            }
+        ]
+    },
+    "schema": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "properties": {
+            "item_id": {
+                "type": "string",
+                "description": "The ID of the item to enrich"
+            }
+        },
+        "required": ["item_id"],
+        "type": "object"
+    }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/flowChat.eval.test.ts b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/flowChat.eval.test.ts
new file mode 100644
index 0000000000000..ca4cb327d3884
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/flowChat.eval.test.ts
@@ -0,0 +1,388 @@
+import { describe, it, expect } from 'vitest'
+import { runVariantComparison, type ExpectedFlow } from './evalRunner'
+import { writeComparisonResults } from './evalResultsWriter'
+import { BASELINE_VARIANT, MINIMAL_SINGLE_TOOL_VARIANT, NO_FULL_SCHEMA_VARIANT } from './variants'
+// @ts-ignore - JSON import
+import expectedTest1 from './expected/test1.json'
+// @ts-ignore - JSON import
+import expectedTest2 from './expected/test2.json'
+// @ts-ignore - JSON import
+import expectedTest3 from './expected/test3.json'
+// @ts-ignore - JSON import
+import expectedTest4 from './expected/test4.json'
+// @ts-ignore - JSON import
+import expectedTest5 from './expected/test5_modify_simple.json'
+// @ts-ignore - JSON import
+import expectedTest6 from './expected/test6_modify_medium.json'
+// @ts-ignore - JSON import
+import expectedTest7 from './expected/test7_modify_complex.json'
+// @ts-ignore - JSON import
+import initialTest5 from './initial/test5_initial.json'
+// @ts-ignore - JSON import
+import initialTest6 from './initial/test6_initial.json'
+// @ts-ignore - JSON import
+import initialTest7 from './initial/test7_initial.json'
+import type { FlowModule } from '$lib/gen'
+
+// Read the OpenRouter API key from the environment; the suite below is skipped when it is unset
+// @ts-ignore
+// const OPENAI_API_KEY = process.env.OPENAI_API_KEY
+const OPENROUTER_API_KEY = process.env.OPENROUTER_API_KEY
+
+// Use describe.skip instead of describe when no API key is provided, so the whole suite is skipped
+// const describeWithApiKey = OPENAI_API_KEY ? describe : describe.skip
+const describeWithApiKey = OPENROUTER_API_KEY ? describe : describe.skip
+// Model identifiers in 'provider/model' form; each one is combined with every variant below
+const MODELS = ['google/gemini-2.5-flash', 'anthropic/claude-haiku-4.5', 'openai/gpt-4o']
+// One entry per (variant, model) pair, grouped by variant; only the first '/' of the model id becomes '-'
+const VARIANTS = [
+	...MODELS.map((model) => ({
+		...BASELINE_VARIANT,
+		model,
+		name: `baseline-${model.replace('/', '-')}`
+	})),
+	...MODELS.map((model) => ({
+		...NO_FULL_SCHEMA_VARIANT,
+		model,
+		name: `no-full-schema-${model.replace('/', '-')}`
+	})),
+	...MODELS.map((model) => ({
+		...MINIMAL_SINGLE_TOOL_VARIANT,
+		model,
+		name: `minimal-single-tool-${model.replace('/', '-')}`
+	}))
+]
+
+describeWithApiKey('Flow Chat LLM Evaluation', () => {
+	const TEST_TIMEOUT = 120_000
+	if (!OPENROUTER_API_KEY) {
+		console.warn('OPENROUTER_API_KEY is not set, skipping tests')
+	}
+
+	it(
+		'test1: user role-based actions with loop and branches',
+		async () => { // creation case: only an expected reference flow is passed, no initial flow
+			const USER_PROMPT = `
+THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
+
+STEP 1: Fetch mock users from api
+STEP 2: Filter only active users:
+STEP 3: Loop on all users
+STEP 4: Do branches based on user's role, do different action based on that. Roles are admin, user, moderator
+STEP 5: Return action taken for each user
+`
+			const results = await runVariantComparison(USER_PROMPT, VARIANTS, OPENROUTER_API_KEY!, {
+				expectedFlow: expectedTest1 as ExpectedFlow
+			})
+
+			// Write the comparison output; the helper returns the summary path and flow file paths logged below
+			const { summaryPath, flowPaths } = await writeComparisonResults(USER_PROMPT, results)
+			console.log(`\nResults written to: ${summaryPath}`)
+			console.log(`Flow files: ${flowPaths.join(', ')}`)
+
+			// No variant outcome is asserted here: the expect below is a placeholder that always passes
+			for (const result of results) {
+				expect(true).toBe(true)
+
+				// Log the evaluator's score, statement, and any missing requirements (only when present)
+				if (result.evaluationResult) {
+					console.log(
+						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
+					)
+					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
+					if (
+						result.evaluationResult.missingRequirements &&
+						result.evaluationResult.missingRequirements.length > 0
+					) {
+						console.log(
+							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
+						)
+					}
+				}
+			}
+		},
+		TEST_TIMEOUT // per-test timeout passed as the third argument to it()
+	)
+
+	it(
+		'test2: e-commerce order processing with inventory check and branching',
+		async () => { // creation case: only an expected reference flow is passed, no initial flow
+			const USER_PROMPT = `
+THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
+
+STEP 1: Receive order data from input (order has items array with name/price/quantity, customer_email, shipping_address)
+STEP 2: Validate order - check all items have valid price > 0 and quantity > 0, return validation result
+STEP 3: Calculate order total with 8% tax rate
+STEP 4: Check inventory for each item (loop through items, return mock availability)
+STEP 5: Branch based on inventory - if all items available, create shipment record; otherwise create backorder record
+STEP 6: Send confirmation (mock email to customer_email)
+STEP 7: Return final order summary with status
+`
+			const results = await runVariantComparison(USER_PROMPT, VARIANTS, OPENROUTER_API_KEY!, {
+				expectedFlow: expectedTest2 as ExpectedFlow
+			})
+
+			const { summaryPath, flowPaths } = await writeComparisonResults(USER_PROMPT, results)
+			console.log(`\nResults written to: ${summaryPath}`)
+			console.log(`Flow files: ${flowPaths.join(', ')}`)
+
+			for (const result of results) {
+				expect(true).toBe(true) // placeholder: always true, so nothing about the result is actually asserted
+
+				if (result.evaluationResult) { // evaluationResult may be absent; log only when present
+					console.log(
+						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
+					)
+					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
+					if (
+						result.evaluationResult.missingRequirements &&
+						result.evaluationResult.missingRequirements.length > 0
+					) {
+						console.log(
+							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
+						)
+					}
+				}
+			}
+		},
+		TEST_TIMEOUT // per-test timeout passed as the third argument to it()
+	)
+
+	it(
+		'test3: data pipeline with parallel processing and quality-based routing',
+		async () => { // creation case: only an expected reference flow is passed, no initial flow
+			const USER_PROMPT = `
+THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
+
+STEP 1: Fetch list of data sources from configuration (return mock array of 3 source objects with id and url)
+STEP 2: For each data source in parallel:
+  - Fetch raw data from the source (mock fetch returning sample records)
+  - Transform/clean the data (filter out invalid entries)
+  - Validate the transformed data (return validation score 0-100)
+STEP 3: Aggregate all validated data into single dataset with combined records
+STEP 4: Calculate overall data quality score (average of all validation scores)
+STEP 5: Branch based on quality score:
+  - If score >= 90: Store in primary database and return success
+  - If score >= 70 and < 90: Store in secondary database with warning flag
+  - If score < 70: Store in quarantine and send alert
+STEP 6: Return processing report with statistics (total records, quality score, destination)
+`
+			const results = await runVariantComparison(USER_PROMPT, VARIANTS, OPENROUTER_API_KEY!, {
+				expectedFlow: expectedTest3 as ExpectedFlow
+			})
+
+			const { summaryPath, flowPaths } = await writeComparisonResults(USER_PROMPT, results)
+			console.log(`\nResults written to: ${summaryPath}`)
+			console.log(`Flow files: ${flowPaths.join(', ')}`)
+
+			for (const result of results) {
+				expect(true).toBe(true) // placeholder: always true, so nothing about the result is actually asserted
+
+				if (result.evaluationResult) { // evaluationResult may be absent; log only when present
+					console.log(
+						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
+					)
+					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
+					if (
+						result.evaluationResult.missingRequirements &&
+						result.evaluationResult.missingRequirements.length > 0
+					) {
+						console.log(
+							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
+						)
+					}
+				}
+			}
+		},
+		TEST_TIMEOUT // per-test timeout passed as the third argument to it()
+	)
+
+	it(
+		'test4: AI agent with tools for customer support',
+		async () => { // creation case: only an expected reference flow is passed, no initial flow
+			const USER_PROMPT = `
+THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
+
+Create a customer support flow with an AI agent:
+
+STEP 1: Receive customer query from input (customer_id string, query_text string)
+STEP 2: Fetch customer profile and order history (mock data based on customer_id)
+STEP 3: Use an AI agent to handle the customer query. The agent should have access to these tools:
+  - lookup_order: Takes order_id, returns order details (mock data)
+  - check_refund_eligibility: Takes order_id, returns eligibility status and reason
+  - create_support_ticket: Takes description and priority (low/medium/high), returns ticket_id
+  - search_faq: Takes search_query, returns relevant FAQ answers
+  The agent should use the customer profile context and respond helpfully.
+STEP 4: Log the interaction to audit trail (customer_id, query, response summary)
+STEP 5: Return the agent's response and any actions taken
+`
+			const results = await runVariantComparison(USER_PROMPT, VARIANTS, OPENROUTER_API_KEY!, {
+				expectedFlow: expectedTest4 as ExpectedFlow
+			})
+
+			const { summaryPath, flowPaths } = await writeComparisonResults(USER_PROMPT, results)
+			console.log(`\nResults written to: ${summaryPath}`)
+			console.log(`Flow files: ${flowPaths.join(', ')}`)
+
+			for (const result of results) {
+				expect(true).toBe(true) // placeholder: always true, so nothing about the result is actually asserted
+
+				if (result.evaluationResult) { // evaluationResult may be absent; log only when present
+					console.log(
+						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
+					)
+					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
+					if (
+						result.evaluationResult.missingRequirements &&
+						result.evaluationResult.missingRequirements.length > 0
+					) {
+						console.log(
+							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
+						)
+					}
+				}
+			}
+		},
+		TEST_TIMEOUT // per-test timeout passed as the third argument to it()
+	)
+
+	// ==================== MODIFICATION TESTS ====================
+	// These tests evaluate the LLM's ability to modify existing flows
+
+	it(
+		'test5: simple modification - add validation step to existing flow',
+		async () => { // modification case: initialTest5's modules and schema are passed alongside the expected flow
+			const USER_PROMPT = `
+THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
+
+Modify this existing flow to add error handling:
+- Add a new step after process_data called "validate_data" to validate the processed data
+- The validation step should check if the data array is not empty
+- If validation fails (empty array), it should return an error object with message "No data to save"
+- If validation passes, return the data for the next step
+- Update save_results to handle the validation result appropriately
+`
+			const results = await runVariantComparison(USER_PROMPT, VARIANTS, OPENROUTER_API_KEY!, {
+				initialModules: initialTest5.value.modules as FlowModule[],
+				initialSchema: initialTest5.schema,
+				expectedFlow: expectedTest5 as ExpectedFlow
+			})
+
+			const { summaryPath, flowPaths } = await writeComparisonResults(USER_PROMPT, results)
+			console.log(`\nResults written to: ${summaryPath}`)
+			console.log(`Flow files: ${flowPaths.join(', ')}`)
+
+			for (const result of results) {
+				expect(true).toBe(true) // placeholder: always true, so nothing about the result is actually asserted
+
+				if (result.evaluationResult) { // evaluationResult may be absent; log only when present
+					console.log(
+						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
+					)
+					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
+					if (
+						result.evaluationResult.missingRequirements &&
+						result.evaluationResult.missingRequirements.length > 0
+					) {
+						console.log(
+							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
+						)
+					}
+				}
+			}
+		},
+		TEST_TIMEOUT // per-test timeout passed as the third argument to it()
+	)
+
+	it(
+		'test6: medium modification - add branching inside existing loop',
+		async () => { // modification case: initialTest6's modules and schema are passed alongside the expected flow
+			const USER_PROMPT = `
+THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
+
+Modify the order processing loop to handle different order types:
+- Inside the loop_orders, replace the simple process_order step with branching based on order.type
+- For type "express": add a step called handle_express that marks as priority and calculates express shipping cost ($15.99)
+- For type "standard": add a step called handle_standard that calculates standard shipping cost ($5.99)
+- For type "pickup": add a step called handle_pickup that marks as no shipping required (cost $0)
+- Move the original process_order step to the default branch for unknown order types
+- Each branch step should return the orderId, shipping cost, and shipping type
+`
+			const results = await runVariantComparison(USER_PROMPT, VARIANTS, OPENROUTER_API_KEY!, {
+				initialModules: initialTest6.value.modules as FlowModule[],
+				initialSchema: initialTest6.schema,
+				expectedFlow: expectedTest6 as ExpectedFlow
+			})
+
+			const { summaryPath, flowPaths } = await writeComparisonResults(USER_PROMPT, results)
+			console.log(`\nResults written to: ${summaryPath}`)
+			console.log(`Flow files: ${flowPaths.join(', ')}`)
+
+			for (const result of results) {
+				expect(true).toBe(true) // placeholder: always true, so nothing about the result is actually asserted
+
+				if (result.evaluationResult) { // evaluationResult may be absent; log only when present
+					console.log(
+						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
+					)
+					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
+					if (
+						result.evaluationResult.missingRequirements &&
+						result.evaluationResult.missingRequirements.length > 0
+					) {
+						console.log(
+							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
+						)
+					}
+				}
+			}
+		},
+		TEST_TIMEOUT // per-test timeout passed as the third argument to it()
+	)
+
+	it(
+		'test7: complex modification - refactor sequential to parallel execution',
+		async () => {
+			const USER_PROMPT = `
+THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
+
+Refactor this flow for better performance by parallelizing the enrichment steps:
+- The three enrichment steps (enrich_price, enrich_inventory, enrich_reviews) currently run sequentially
+- Wrap them in a parallel branch (branchall) called "parallel_enrichment" so they run concurrently
+- Each enrichment step should include basic error handling with try/catch that returns a fallback value if it fails
+- Update the combine_data step to receive results from the parallel branch (results.parallel_enrichment returns an array of branch results)
+- The combine_data step should check if any enrichment used a fallback value and set a hasFallbacks flag
+- Keep get_item as the first step and return_result as the last step unchanged
+`
+			// Run the prompt through every variant, starting from the test7 fixture flow.
+			const variantResults = await runVariantComparison(USER_PROMPT, VARIANTS, OPENROUTER_API_KEY!, {
+				initialModules: initialTest7.value.modules as FlowModule[],
+				initialSchema: initialTest7.schema,
+				expectedFlow: expectedTest7 as ExpectedFlow
+			})
+
+			// Persist the comparison and print where the summary and flow files were written.
+			const written = await writeComparisonResults(USER_PROMPT, variantResults)
+			console.log(`\nResults written to: ${written.summaryPath}`)
+			console.log(`Flow files: ${written.flowPaths.join(', ')}`)
+
+			for (const { variantName, evaluationResult } of variantResults) {
+				// Placeholder assertion: this run only reports scores, it does not enforce a threshold,
+				// so the test passes whenever the comparison itself completes without throwing.
+				expect(true).toBe(true)
+
+				if (!evaluationResult) {
+					continue
+				}
+
+				const { resemblanceScore, statement, missingRequirements } = evaluationResult
+				console.log(`[${variantName}] Resemblance Score: ${resemblanceScore}/100`)
+				console.log(`[${variantName}] Statement: ${statement}`)
+				if (missingRequirements && missingRequirements.length > 0) {
+					console.log(`[${variantName}] Missing: ${missingRequirements.join(', ')}`)
+				}
+			}
+		},
+		TEST_TIMEOUT
+	)
+})
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/initial/test5_initial.json b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/initial/test5_initial.json
new file mode 100644
index 0000000000000..8f421b7e3964a
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/initial/test5_initial.json
@@ -0,0 +1,53 @@
+{
+    "summary": "Simple data pipeline",
+    "value": {
+        "modules": [
+            {
+                "id": "fetch_data",
+                "summary": "Fetch data from API",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main() {\n  // Mock API call\n  return [\n    { id: 1, name: \"Item 1\", value: 100 },\n    { id: 2, name: \"Item 2\", value: 200 },\n    { id: 3, name: \"Item 3\", value: 300 }\n  ];\n}",
+                    "input_transforms": {}
+                }
+            },
+            {
+                "id": "process_data",
+                "summary": "Process the fetched data",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(data: any[]) {\n  // Apply some transformation\n  return data.map(item => ({\n    ...item,\n    value: item.value * 1.1,\n    processed: true\n  }));\n}",
+                    "input_transforms": {
+                        "data": {
+                            "type": "javascript",
+                            "expr": "results.fetch_data"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "save_results",
+                "summary": "Save results to database",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(data: any[]) {\n  // Mock database save\n  console.log(`Saving ${data.length} items to database`);\n  return { saved: data.length, status: \"success\" };\n}",
+                    "input_transforms": {
+                        "data": {
+                            "type": "javascript",
+                            "expr": "results.process_data"
+                        }
+                    }
+                }
+            }
+        ]
+    },
+    "schema": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "properties": {},
+        "required": [],
+        "type": "object"
+    }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/initial/test6_initial.json b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/initial/test6_initial.json
new file mode 100644
index 0000000000000..c95cf3a45c9e8
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/initial/test6_initial.json
@@ -0,0 +1,68 @@
+{
+    "summary": "Order processing flow",
+    "value": {
+        "modules": [
+            {
+                "id": "get_orders",
+                "summary": "Fetch list of orders",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main() {\n  // Mock orders from database\n  return [\n    { id: \"ORD-001\", type: \"express\", items: 3, total: 150 },\n    { id: \"ORD-002\", type: \"standard\", items: 5, total: 280 },\n    { id: \"ORD-003\", type: \"pickup\", items: 2, total: 75 },\n    { id: \"ORD-004\", type: \"express\", items: 1, total: 50 }\n  ];\n}",
+                    "input_transforms": {}
+                }
+            },
+            {
+                "id": "loop_orders",
+                "summary": "Process each order",
+                "value": {
+                    "type": "forloopflow",
+                    "iterator": {
+                        "type": "javascript",
+                        "expr": "results.get_orders"
+                    },
+                    "skip_failures": false,
+                    "parallel": false,
+                    "modules": [
+                        {
+                            "id": "process_order",
+                            "summary": "Process individual order",
+                            "value": {
+                                "type": "rawscript",
+                                "language": "bun",
+                                "content": "export async function main(order: any) {\n  console.log(`Processing order ${order.id}`);\n  return {\n    orderId: order.id,\n    processed: true,\n    timestamp: new Date().toISOString()\n  };\n}",
+                                "input_transforms": {
+                                    "order": {
+                                        "type": "javascript",
+                                        "expr": "flow_input.iter.value"
+                                    }
+                                }
+                            }
+                        }
+                    ]
+                }
+            },
+            {
+                "id": "summarize",
+                "summary": "Return processing summary",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(results: any[]) {\n  return {\n    totalProcessed: results.length,\n    processedAt: new Date().toISOString()\n  };\n}",
+                    "input_transforms": {
+                        "results": {
+                            "type": "javascript",
+                            "expr": "results.loop_orders"
+                        }
+                    }
+                }
+            }
+        ]
+    },
+    "schema": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "properties": {},
+        "required": [],
+        "type": "object"
+    }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/initial/test7_initial.json b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/initial/test7_initial.json
new file mode 100644
index 0000000000000..5202cd3587e15
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/initial/test7_initial.json
@@ -0,0 +1,120 @@
+{
+    "summary": "Data enrichment flow",
+    "value": {
+        "modules": [
+            {
+                "id": "get_item",
+                "summary": "Get item from input",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(item_id: string) {\n  // Mock item lookup\n  return {\n    id: item_id,\n    name: \"Product \" + item_id,\n    sku: \"SKU-\" + item_id\n  };\n}",
+                    "input_transforms": {
+                        "item_id": {
+                            "type": "javascript",
+                            "expr": "flow_input.item_id"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "enrich_price",
+                "summary": "Call pricing API",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(item: any) {\n  // Mock pricing API call\n  return {\n    itemId: item.id,\n    price: 99.99,\n    currency: \"USD\",\n    discount: 10\n  };\n}",
+                    "input_transforms": {
+                        "item": {
+                            "type": "javascript",
+                            "expr": "results.get_item"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "enrich_inventory",
+                "summary": "Call inventory API",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(item: any) {\n  // Mock inventory API call\n  return {\n    itemId: item.id,\n    inStock: true,\n    quantity: 150,\n    warehouse: \"WH-001\"\n  };\n}",
+                    "input_transforms": {
+                        "item": {
+                            "type": "javascript",
+                            "expr": "results.get_item"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "enrich_reviews",
+                "summary": "Call reviews API",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(item: any) {\n  // Mock reviews API call\n  return {\n    itemId: item.id,\n    averageRating: 4.5,\n    reviewCount: 127,\n    topReview: \"Great product!\"\n  };\n}",
+                    "input_transforms": {
+                        "item": {
+                            "type": "javascript",
+                            "expr": "results.get_item"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "combine_data",
+                "summary": "Combine all enrichment data",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(item: any, price: any, inventory: any, reviews: any) {\n  return {\n    ...item,\n    pricing: price,\n    inventory: inventory,\n    reviews: reviews\n  };\n}",
+                    "input_transforms": {
+                        "item": {
+                            "type": "javascript",
+                            "expr": "results.get_item"
+                        },
+                        "price": {
+                            "type": "javascript",
+                            "expr": "results.enrich_price"
+                        },
+                        "inventory": {
+                            "type": "javascript",
+                            "expr": "results.enrich_inventory"
+                        },
+                        "reviews": {
+                            "type": "javascript",
+                            "expr": "results.enrich_reviews"
+                        }
+                    }
+                }
+            },
+            {
+                "id": "return_result",
+                "summary": "Return final result",
+                "value": {
+                    "type": "rawscript",
+                    "language": "bun",
+                    "content": "export async function main(enriched_item: any) {\n  return {\n    success: true,\n    data: enriched_item,\n    enrichedAt: new Date().toISOString()\n  };\n}",
+                    "input_transforms": {
+                        "enriched_item": {
+                            "type": "javascript",
+                            "expr": "results.combine_data"
+                        }
+                    }
+                }
+            }
+        ]
+    },
+    "schema": {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "properties": {
+            "item_id": {
+                "type": "string",
+                "description": "The ID of the item to enrich"
+            }
+        },
+        "required": ["item_id"],
+        "type": "object"
+    }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/baseline.ts b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/baseline.ts
new file mode 100644
index 0000000000000..7f6b743cb8dce
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/baseline.ts
@@ -0,0 +1,12 @@
+import type { VariantConfig } from '../evalVariants'
+
+/**
+ * Baseline variant: selects the 'default' system prompt and the 'default' tool set, so nothing
+ * is overridden. Per its description, this is meant to mirror the production configuration.
+ */
+export const BASELINE_VARIANT: VariantConfig = {
+	name: 'baseline',
+	description: 'Production configuration with default system prompt and all tools',
+	systemPrompt: { type: 'default' },
+	tools: { type: 'default' }
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/index.ts b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/index.ts
new file mode 100644
index 0000000000000..ffaec19031f5b
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/index.ts
@@ -0,0 +1,7 @@
+// Re-export all variant configurations
+export { BASELINE_VARIANT } from './baseline'
+export { MINIMAL_SINGLE_TOOL_VARIANT, setFlowJsonTool } from './minimal-single-tool'
+export { NO_FULL_SCHEMA_VARIANT } from './no-full-schema'
+
+// Re-export types for convenience
+export type { VariantConfig } from '../evalVariants'
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/minimal-single-tool.ts b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/minimal-single-tool.ts
new file mode 100644
index 0000000000000..7201169257e02
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/minimal-single-tool.ts
@@ -0,0 +1,403 @@
+import type { VariantConfig } from '../evalVariants'
+import type { Tool } from '../../../../shared'
+import type { FlowAIChatHelpers } from '../../../core'
+import { flowTools, formatOpenFlowSchemaForPrompt } from '../../../core'
+
+/**
+ * Names (def.function.name) of the granular flow editing tools that set_flow_json replaces.
+ */
+const FLOW_EDITING_TOOL_NAMES = [
+	'add_module',
+	'remove_module',
+	'remove_branch',
+	'modify_module',
+	'set_flow_schema'
+]
+
+/**
+ * Tool that replaces the whole flow in one call (stands in for add_module, remove_module, etc.).
+ * Serializes { modules, schema? } for helpers.setFlowJson; throws if `modules` is not an array.
+ */
+export const setFlowJsonTool: Tool<FlowAIChatHelpers> = {
+	def: {
+		type: 'function',
+		function: {
+			name: 'set_flow_json',
+			description:
+				'Set the entire flow by providing the complete flow object. This replaces all existing modules and schema.',
+			strict: false,
+			parameters: {
+				type: 'object',
+				properties: {
+					modules: {
+						type: 'array',
+						description: 'Array of flow modules',
+						items: { type: 'object' }
+					},
+					schema: {
+						type: 'object',
+						description:
+							'Flow input schema (JSON Schema format) defining parameters the flow accepts'
+					}
+				},
+				required: ['modules']
+			}
+		}
+	},
+	fn: async ({ args, helpers }) => {
+		const { modules, schema } = args
+		// strict is false, so the arguments are not guaranteed to match the schema: reject a
+		// non-array `modules` before calling setFlowJson rather than failing on .map() afterwards.
+		if (!Array.isArray(modules)) {
+			throw new Error('set_flow_json: "modules" must be an array of flow modules')
+		}
+		const flowValue: Record<string, unknown> = schema ? { modules, schema } : { modules }
+		await helpers.setFlowJson(JSON.stringify(flowValue))
+		return `Flow updated with ${modules.length} module(s): [${modules.map((m: any) => m.id).join(', ')}]`
+	}
+}
+
+/**
+ * Assemble the tool list used by the minimal-single-tool variant.
+ * Every tool in flowTools whose name is not listed in FLOW_EDITING_TOOL_NAMES is carried over
+ * in its original order, and set_flow_json is appended after them as the only editing tool.
+ */
+function buildMinimalSingleToolTools(): Tool<FlowAIChatHelpers>[] {
+	const kept: Tool<FlowAIChatHelpers>[] = []
+	for (const tool of flowTools) {
+		if (FLOW_EDITING_TOOL_NAMES.includes(tool.def.function.name)) continue
+		kept.push(tool)
+	}
+	return [...kept, setFlowJsonTool]
+}
+
+const MINIMAL_SINGLE_TOOL_SYSTEM_PROMPT = `You are a helpful assistant that creates and edits workflows on the Windmill platform.
+
+## IMPORTANT RULES
+
+**Reserved IDs - Do NOT use these module IDs:**
+- \`failure\` - Reserved for failure handler module
+- \`preprocessor\` - Reserved for preprocessor module
+- \`Input\` - Reserved for flow input reference
+
+## Tool Selection Guide
+
+**Flow Modification:**
+- **Create or modify the entire flow** → \`set_flow_json\` (provide complete modules array and optional schema)
+
+**Code & Scripts:**
+- **Get language-specific coding instructions** → \`get_instructions_for_code_generation\` (call BEFORE writing code)
+- **Find workspace scripts** → \`search_scripts\`
+- **Find Windmill Hub scripts** → \`search_hub_scripts\`
+
+**Testing:**
+- **Test entire flow** → \`test_run_flow\`
+- **Test single step** → \`test_run_step\`
+
+**Resources & Schema:**
+- **Search resource types** → \`resource_type\`
+- **Get database schema** → \`get_db_schema\`
+
+## Common Mistakes to Avoid
+
+- **Don't forget \`input_transforms\`** - Rawscript parameters won't receive values without them
+- **Don't use spaces in module IDs** - Use underscores (e.g., \`fetch_data\` not \`fetch data\`)
+- **Don't reference future steps** - \`results.step_id\` only works for steps that execute before the current one
+- **Don't create duplicate IDs** - Each module ID must be unique in the flow
+
+## Flow Modification with set_flow_json
+
+Use the \`set_flow_json\` tool to set the entire flow structure at once. Provide the complete modules array and optionally the flow input schema.
+
+**Parameters:**
+- \`modules\`: Array of flow modules (required)
+- \`schema\`: Flow input schema in JSON Schema format (optional)
+
+**Example - Simple flow:**
+\`\`\`javascript
+set_flow_json({
+  modules: [
+    {
+      id: "fetch_data",
+      summary: "Fetch user data from API",
+      value: {
+        type: "rawscript",
+        language: "bun",
+        content: "export async function main(userId: string) { return { id: userId, name: 'John' }; }",
+        input_transforms: {
+          userId: { type: "javascript", expr: "flow_input.user_id" }
+        }
+      }
+    },
+    {
+      id: "process_data",
+      summary: "Process the fetched data",
+      value: {
+        type: "rawscript",
+        language: "bun",
+        content: "export async function main(data: any) { return { processed: true, ...data }; }",
+        input_transforms: {
+          data: { type: "javascript", expr: "results.fetch_data" }
+        }
+      }
+    }
+  ],
+  schema: {
+    type: "object",
+    properties: {
+      user_id: { type: "string", description: "User ID to fetch" }
+    },
+    required: ["user_id"]
+  }
+})
+\`\`\`
+
+**Example - Flow with for loop:**
+\`\`\`javascript
+set_flow_json({
+  modules: [
+    {
+      id: "get_items",
+      summary: "Get list of items",
+      value: {
+        type: "rawscript",
+        language: "bun",
+        content: "export async function main() { return [1, 2, 3]; }",
+        input_transforms: {}
+      }
+    },
+    {
+      id: "loop_items",
+      summary: "Process each item",
+      value: {
+        type: "forloopflow",
+        iterator: { type: "javascript", expr: "results.get_items" },
+        skip_failures: false,
+        parallel: true,
+        modules: [
+          {
+            id: "process_item",
+            summary: "Process single item",
+            value: {
+              type: "rawscript",
+              language: "bun",
+              content: "export async function main(item: number) { return item * 2; }",
+              input_transforms: {
+                item: { type: "javascript", expr: "flow_input.iter.value" }
+              }
+            }
+          }
+        ]
+      }
+    }
+  ]
+})
+\`\`\`
+
+**Example - Flow with branches (branchone):**
+\`\`\`javascript
+set_flow_json({
+  modules: [
+    {
+      id: "get_value",
+      summary: "Get a value to branch on",
+      value: {
+        type: "rawscript",
+        language: "bun",
+        content: "export async function main() { return 50; }",
+        input_transforms: {}
+      }
+    },
+    {
+      id: "branch_on_value",
+      summary: "Branch based on value",
+      value: {
+        type: "branchone",
+        branches: [
+          {
+            summary: "High value",
+            expr: "results.get_value > 75",
+            modules: [
+              {
+                id: "high_handler",
+                value: {
+                  type: "rawscript",
+                  language: "bun",
+                  content: "export async function main() { return 'high'; }",
+                  input_transforms: {}
+                }
+              }
+            ]
+          },
+          {
+            summary: "Medium value",
+            expr: "results.get_value > 25",
+            modules: [
+              {
+                id: "medium_handler",
+                value: {
+                  type: "rawscript",
+                  language: "bun",
+                  content: "export async function main() { return 'medium'; }",
+                  input_transforms: {}
+                }
+              }
+            ]
+          }
+        ],
+        default: [
+          {
+            id: "low_handler",
+            value: {
+              type: "rawscript",
+              language: "bun",
+              content: "export async function main() { return 'low'; }",
+              input_transforms: {}
+            }
+          }
+        ]
+      }
+    }
+  ]
+})
+\`\`\`
+
+Follow the user instructions carefully.
+At the end of your changes, explain precisely what you did and what the flow does now.
+ALWAYS test your modifications using the \`test_run_flow\` tool. If the user cancels the test run, do not try again and wait for the next user instruction.
+When testing steps that are sql scripts, the arguments to be passed are { database: $res:<db_resource> }.
+
+### Input Transforms for Rawscripts
+
+Rawscript modules use \`input_transforms\` to map function parameters to values. Each key in \`input_transforms\` corresponds to a parameter name in your script's \`main\` function.
+
+**Transform Types:**
+- \`static\`: Fixed value passed directly
+- \`javascript\`: Dynamic expression evaluated at runtime
+
+**Available Variables in JavaScript Expressions:**
+- \`flow_input.{property}\` - Access flow input parameters
+- \`results.{step_id}\` - Access output from a previous step
+- \`flow_input.iter.value\` - Current item when inside a for-loop
+- \`flow_input.iter.index\` - Current index when inside a for-loop
+
+**Example - Rawscript using flow input and previous step result:**
+\`\`\`json
+{
+  "id": "step_b",
+  "value": {
+    "type": "rawscript",
+    "language": "bun",
+    "content": "export async function main(userId: string, data: any[]) { return 'Hello, world!'; }",
+    "input_transforms": {
+      "userId": { "type": "javascript", "expr": "flow_input.user_id" },
+      "data": { "type": "javascript", "expr": "results.step_a" }
+    }
+  }
+}
+\`\`\`
+
+**Important:** The parameter names in \`input_transforms\` must match the function parameter names in your script.
+
+### Other Key Concepts
+- **Resources**: For flow inputs, use type "object" with format "resource-<type>". For step inputs, use "$res:path/to/resource"
+- **Module IDs**: Must be unique and valid identifiers. Used to reference results via \`results.step_id\`
+- **Module types**: Use 'bun' as default language for rawscript if unspecified
+
+### Writing Code for Modules
+
+**IMPORTANT: Before writing any code for a rawscript module, you MUST call the \`get_instructions_for_code_generation\` tool with the target language.** This tool provides essential language-specific instructions.
+
+Example: Before writing TypeScript/Bun code, call \`get_instructions_for_code_generation({ language: "bun" })\`
+
+### Creating Flows
+
+1. **Search for existing scripts first** (unless user explicitly asks to write from scratch):
+   - First: \`search_scripts\` to find workspace scripts
+   - Then: \`search_hub_scripts\` (only consider highly relevant results)
+   - Only create raw scripts if no suitable script is found
+
+2. **Build the complete flow using \`set_flow_json\`:**
+   - If using existing script: use \`type: "script"\` with \`path\`
+   - If creating rawscript: use \`type: "rawscript"\` with \`language\` and \`content\`
+   - **First call \`get_instructions_for_code_generation\` to get the correct code format**
+   - Always define \`input_transforms\` to connect parameters to flow inputs or previous step results
+
+### AI Agent Modules
+
+AI agents can use tools to accomplish tasks. When creating an AI agent module:
+
+\`\`\`javascript
+{
+  id: "support_agent",
+  summary: "AI agent for customer support",
+  value: {
+    type: "aiagent",
+    input_transforms: {
+      provider: { type: "static", value: "$res:f/ai_providers/openai" },
+      output_type: { type: "static", value: "text" },
+      user_message: { type: "javascript", expr: "flow_input.query" },
+      system_prompt: { type: "static", value: "You are a helpful assistant." }
+    },
+    tools: [
+      {
+        id: "search_docs",
+        summary: "Search_documentation",
+        value: {
+          tool_type: "flowmodule",
+          type: "rawscript",
+          language: "bun",
+          content: "export async function main(query: string) { return ['doc1', 'doc2']; }",
+          input_transforms: { query: { type: "static", value: "" } }
+        }
+      }
+    ]
+  }
+}
+\`\`\`
+
+- **Tool IDs**: Cannot contain spaces - use underscores
+- **Tool summaries**: Cannot contain spaces - use underscores
+- **Tool types**: \`flowmodule\` for scripts/flows, \`mcp\` for MCP server tools
+
+## Resource Types
+On Windmill, credentials and configuration are stored in resources. Resource types define the format of the resource.
+- Use the \`resource_type\` tool to search for available resource types (e.g. stripe, google, postgresql, etc.)
+- If the user needs a resource as flow input, set the property type in the schema to "object" and add a key called "format" set to "resource-nameofresourcetype" (e.g. "resource-stripe")
+- If the user wants a specific resource as step input, set the step value to a static string in the format: "$res:path/to/resource"
+
+### OpenFlow Schema Reference
+Below is the complete OpenAPI schema for OpenFlow. All field descriptions and behaviors are defined here. Refer to this as the authoritative reference when generating flow JSON:
+
+\`\`\`json
+${formatOpenFlowSchemaForPrompt()}
+\`\`\`
+
+The schema includes detailed descriptions for:
+- **FlowModuleValue types**: rawscript, script, flow, forloopflow, whileloopflow, branchone, branchall, identity, aiagent
+- **Module configuration**: stop_after_if, skip_if, suspend, sleep, cache_ttl, retry, mock, timeout
+- **InputTransform**: static vs javascript, available variables (results, flow_input, flow_input.iter)
+- **Special modules**: preprocessor_module, failure_module
+- **Loop options**: iterator, parallel, parallelism, skip_failures
+- **Branch types**: BranchOne (first match), BranchAll (all execute)
+`
+
+/**
+ * Minimal single-tool variant.
+ * Replaces granular flow editing tools (add_module, remove_module, modify_module, etc.)
+ * with a single set_flow_json tool, while keeping all other utility tools.
+ * Uses a custom system prompt (MINIMAL_SINGLE_TOOL_SYSTEM_PROMPT) that documents set_flow_json.
+ */
+export const MINIMAL_SINGLE_TOOL_VARIANT: VariantConfig = {
+	name: 'minimal-single-tool',
+	description:
+		'Default prompt with set_flow_json instead of granular flow editing tools, keeps all utility tools',
+	systemPrompt: {
+		type: 'custom',
+		content: MINIMAL_SINGLE_TOOL_SYSTEM_PROMPT
+	},
+	tools: {
+		type: 'custom',
+		tools: buildMinimalSingleToolTools()
+	}
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/no-full-schema.ts b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/no-full-schema.ts
new file mode 100644
index 0000000000000..ee269fb6b377f
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/flow/__tests__/eval/variants/no-full-schema.ts
@@ -0,0 +1,914 @@
+import type { ChatCompletionFunctionTool } from 'openai/resources/chat/completions.mjs'
+import type { VariantConfig } from '../evalVariants'
+import { flowTools } from '../../../core'
+import type { Tool } from '../../../../shared'
+import type { FlowAIChatHelpers } from '../../../core'
+import { findModuleInFlow } from '../../../utils'
+import type { FlowModule } from '$lib/gen'
+
+/**
+ * JSON Schema for an InputTransform, inlined without $ref: a oneOf of a 'static' value (only `type` required) and a 'javascript' expression (`expr` and `type` required)
+ */
+const inputTransformSchema = {
+	oneOf: [
+		{
+			type: 'object',
+			description:
+				"Static value passed directly. For resources, use format '$res:path/to/resource'",
+			properties: {
+				value: { description: 'The static value' },
+				type: { type: 'string', enum: ['static'] }
+			},
+			required: ['type']
+		},
+		{
+			type: 'object',
+			description:
+				"JavaScript expression evaluated at runtime. Use 'results.step_id' or 'flow_input.property'. Inside loops, use 'flow_input.iter.value'",
+			properties: {
+				expr: { type: 'string', description: 'JavaScript expression returning the value' },
+				type: { type: 'string', enum: ['javascript'] }
+			},
+			required: ['expr', 'type']
+		}
+	]
+}
+
+/**
+ * Simplified FlowModuleValue schema without circular references.
+ * Container types (forloopflow, whileloopflow, branchone, branchall, aiagent) describe their nested
+ * modules/branches/tools arrays as must-be-empty; this is stated in descriptions only (`items: {}` accepts anything).
+ * Branch creation is directed to add_branch, in line with the system prompt's "add_branch (NOT add_module)" rule.
+ */
+const simplifiedFlowModuleValueSchema = {
+	description:
+		'The module implementation. For containers (loops, branches), modules array must be empty - use add_module with insideId to add steps inside.',
+	oneOf: [
+		// RawScript - no nested modules, keep full schema
+		{
+			type: 'object',
+			description:
+				"Inline script with code. Use 'bun' as default language. Script receives arguments from input_transforms",
+			properties: {
+				type: { type: 'string', enum: ['rawscript'] },
+				content: {
+					type: 'string',
+					description: "Script source code. Should export a 'main' function"
+				},
+				language: {
+					type: 'string',
+					enum: [
+						'deno',
+						'bun',
+						'python3',
+						'go',
+						'bash',
+						'powershell',
+						'postgresql',
+						'mysql',
+						'bigquery',
+						'snowflake',
+						'mssql',
+						'oracledb',
+						'graphql',
+						'nativets',
+						'php'
+					],
+					description: 'Programming language'
+				},
+				input_transforms: {
+					type: 'object',
+					description: 'Map of parameter names to values (static or JavaScript expressions)',
+					additionalProperties: inputTransformSchema
+				},
+				path: { type: 'string', description: 'Optional path for saving this script' },
+				lock: { type: 'string', description: 'Lock file content for dependencies' },
+				tag: { type: 'string', description: 'Worker group tag for execution routing' },
+				concurrent_limit: { type: 'number' },
+				concurrency_time_window_s: { type: 'number' },
+				custom_concurrency_key: { type: 'string' },
+				is_trigger: { type: 'boolean' }
+			},
+			required: ['type', 'content', 'language', 'input_transforms']
+		},
+		// PathScript - reference to existing script
+		{
+			type: 'object',
+			description: 'Reference to an existing script by path',
+			properties: {
+				type: { type: 'string', enum: ['script'] },
+				path: { type: 'string', description: "Path to script (e.g., 'f/scripts/send_email')" },
+				hash: { type: 'string', description: 'Optional specific version hash' },
+				input_transforms: {
+					type: 'object',
+					description: 'Map of parameter names to values',
+					additionalProperties: inputTransformSchema
+				},
+				tag_override: { type: 'string' },
+				is_trigger: { type: 'boolean' }
+			},
+			required: ['type', 'path', 'input_transforms']
+		},
+		// PathFlow - reference to existing flow
+		{
+			type: 'object',
+			description: 'Reference to an existing flow as a subflow',
+			properties: {
+				type: { type: 'string', enum: ['flow'] },
+				path: { type: 'string', description: "Path to flow (e.g., 'f/flows/process_user')" },
+				input_transforms: {
+					type: 'object',
+					description: 'Map of parameter names to values',
+					additionalProperties: inputTransformSchema
+				}
+			},
+			required: ['type', 'path', 'input_transforms']
+		},
+		// ForloopFlow - modules MUST be empty
+		{
+			type: 'object',
+			description:
+				"For loop over an iterator. IMPORTANT: 'modules' must be an empty array []. Use add_module with insideId and branchPath='modules' to add steps inside.",
+			properties: {
+				type: { type: 'string', enum: ['forloopflow'] },
+				modules: {
+					type: 'array',
+					items: {},
+					description:
+						"MUST be empty []. Use add_module({ insideId: 'loop_id', branchPath: 'modules', value: {...} }) to add steps"
+				},
+				iterator: {
+					...inputTransformSchema,
+					description:
+						"JavaScript expression returning array to iterate. Use 'flow_input.iter.value' inside loop to access current item"
+				},
+				skip_failures: {
+					type: 'boolean',
+					description: "If true, iteration failures don't stop the loop"
+				},
+				parallel: {
+					type: 'boolean',
+					description: 'If true, iterations run concurrently'
+				},
+				parallelism: {
+					...inputTransformSchema,
+					description: 'Max concurrent iterations when parallel=true'
+				}
+			},
+			required: ['type', 'modules', 'iterator', 'skip_failures']
+		},
+		// WhileloopFlow - modules MUST be empty
+		{
+			type: 'object',
+			description:
+				"While loop that repeats until stop_after_if triggers. IMPORTANT: 'modules' must be an empty array []. Use add_module to add steps inside.",
+			properties: {
+				type: { type: 'string', enum: ['whileloopflow'] },
+				modules: {
+					type: 'array',
+					items: {},
+					description:
+						"MUST be empty []. Use add_module({ insideId: 'loop_id', branchPath: 'modules', value: {...} }) to add steps"
+				},
+				skip_failures: {
+					type: 'boolean',
+					description: "If true, iteration failures don't stop the loop"
+				},
+				parallel: { type: 'boolean' },
+				parallelism: inputTransformSchema
+			},
+			required: ['type', 'modules', 'skip_failures']
+		},
+		// BranchOne - branches and default MUST be empty
+		{
+			type: 'object',
+			description:
+				"Conditional branching (first match wins). IMPORTANT: Create with empty 'branches' array [], then use add_branch to add branches, and add_module with branchPath='branches.N' to add modules inside branches.",
+			properties: {
+				type: { type: 'string', enum: ['branchone'] },
+				branches: {
+					type: 'array',
+					items: {},
+					description:
+						"MUST be empty []. Use add_branch({ containerId: 'branch_id', summary, expr }) to add branches"
+				},
+				default: {
+					type: 'array',
+					items: {},
+					description:
+						"MUST be empty []. Use add_module({ insideId: 'branch_id', branchPath: 'default', value: {...} }) to add steps to default branch"
+				}
+			},
+			required: ['type', 'branches', 'default']
+		},
+		// BranchAll - branches MUST be empty
+		{
+			type: 'object',
+			description:
+				"Parallel branching (all branches execute). IMPORTANT: Create with empty 'branches' array [], then use add_branch to add branches.",
+			properties: {
+				type: { type: 'string', enum: ['branchall'] },
+				branches: {
+					type: 'array',
+					items: {},
+					description:
+						"MUST be empty []. Use add_branch({ containerId: 'branch_id', summary, skip_failure }) to add branches"
+				},
+				parallel: {
+					type: 'boolean',
+					description: 'If true, all branches execute concurrently'
+				}
+			},
+			required: ['type', 'branches']
+		},
+		// Identity - pass-through
+		{
+			type: 'object',
+			description: 'Pass-through module that returns input unchanged',
+			properties: {
+				type: { type: 'string', enum: ['identity'] },
+				flow: { type: 'boolean' }
+			},
+			required: ['type']
+		},
+		// AiAgent - tools MUST be empty
+		{
+			type: 'object',
+			description:
+				"AI agent step. IMPORTANT: 'tools' must be an empty array []. Use add_module with branchPath='tools' to add tools.",
+			properties: {
+				type: { type: 'string', enum: ['aiagent'] },
+				input_transforms: {
+					type: 'object',
+					description: 'Agent input parameters',
+					properties: {
+						provider: inputTransformSchema,
+						output_type: inputTransformSchema,
+						user_message: inputTransformSchema,
+						system_prompt: inputTransformSchema,
+						streaming: inputTransformSchema,
+						messages_context_length: inputTransformSchema,
+						output_schema: inputTransformSchema,
+						user_images: inputTransformSchema,
+						max_completion_tokens: inputTransformSchema,
+						temperature: inputTransformSchema
+					},
+					required: ['provider', 'user_message', 'output_type']
+				},
+				tools: {
+					type: 'array',
+					items: {},
+					description:
+						"MUST be empty []. Use add_module({ insideId: 'agent_id', branchPath: 'tools', value: {...} }) to add tools"
+				},
+				parallel: { type: 'boolean' }
+			},
+			required: ['type', 'tools', 'input_transforms']
+		}
+	]
+}
+
+/**
+ * JSON Schema for a single flow step: `id` and `value` are required, the remaining properties are optional per-step settings. `value` uses simplifiedFlowModuleValueSchema, so the schema has no circular references.
+ */
+const simplifiedFlowModuleSchema = {
+	type: 'object',
+	description: 'A single step in a flow',
+	properties: {
+		id: {
+			type: 'string',
+			description:
+				"Unique identifier. Used to reference results via 'results.step_id'. Must be alphanumeric with underscores/hyphens"
+		},
+		value: simplifiedFlowModuleValueSchema,
+		summary: { type: 'string', description: 'Short description of what this step does' },
+		stop_after_if: {
+			type: 'object',
+			description: 'Early termination condition evaluated after step completes',
+			properties: {
+				skip_if_stopped: { type: 'boolean' },
+				expr: { type: 'string', description: "Expression using 'result' or 'flow_input'" },
+				error_message: { type: 'string' }
+			},
+			required: ['expr']
+		},
+		stop_after_all_iters_if: {
+			type: 'object',
+			description: 'For loops - condition evaluated after all iterations',
+			properties: {
+				skip_if_stopped: { type: 'boolean' },
+				expr: { type: 'string' },
+				error_message: { type: 'string' }
+			},
+			required: ['expr']
+		},
+		skip_if: {
+			type: 'object',
+			properties: {
+				expr: { type: 'string', description: 'Expression returning true to skip this step' }
+			},
+			required: ['expr']
+		},
+		sleep: {
+			...inputTransformSchema,
+			description: 'Delay before executing (seconds)'
+		},
+		cache_ttl: { type: 'number', description: 'Cache duration in seconds' },
+		timeout: {
+			...inputTransformSchema,
+			description: 'Max execution time in seconds'
+		},
+		delete_after_use: { type: 'boolean' },
+		mock: {
+			type: 'object',
+			properties: {
+				enabled: { type: 'boolean' },
+				return_value: {}
+			}
+		},
+		suspend: {
+			type: 'object',
+			description: 'Approval/resume configuration',
+			properties: {
+				required_events: { type: 'integer' },
+				timeout: { type: 'integer' },
+				resume_form: {
+					type: 'object',
+					properties: { schema: { type: 'object' } }
+				},
+				user_auth_required: { type: 'boolean' },
+				user_groups_required: inputTransformSchema,
+				self_approval_disabled: { type: 'boolean' },
+				hide_cancel: { type: 'boolean' },
+				continue_on_disapprove_timeout: { type: 'boolean' }
+			}
+		},
+		priority: { type: 'number' },
+		continue_on_error: { type: 'boolean' },
+		retry: {
+			type: 'object',
+			properties: {
+				constant: {
+					type: 'object',
+					properties: {
+						attempts: { type: 'integer' },
+						seconds: { type: 'integer' }
+					}
+				},
+				exponential: {
+					type: 'object',
+					properties: {
+						attempts: { type: 'integer' },
+						multiplier: { type: 'integer' },
+						seconds: { type: 'integer', minimum: 1 },
+						random_factor: { type: 'integer', minimum: 0, maximum: 100 }
+					}
+				},
+				retry_if: {
+					type: 'object',
+					properties: {
+						expr: { type: 'string' }
+					},
+					required: ['expr']
+				}
+			}
+		}
+	},
+	required: ['value', 'id']
+}
+
+/**
+ * add_module tool definition whose `value` parameter uses simplifiedFlowModuleSchema (no circular refs). Only `value` is required; the branchPath description sends new-branch creation to the add_branch tool.
+ */
+const noSchemaAddModuleToolDef: ChatCompletionFunctionTool = {
+	type: 'function',
+	function: {
+		strict: false,
+		name: 'add_module',
+		description:
+			"Add a new module to the flow. For containers (loops, branches, agents), add with EMPTY modules array, then use additional add_module calls to add steps inside. Reserved IDs: 'failure', 'preprocessor', 'Input'.",
+		parameters: {
+			type: 'object',
+			properties: {
+				afterId: {
+					type: ['string', 'null'],
+					description: 'ID of module to insert after. Use null to insert at beginning.'
+				},
+				insideId: {
+					type: ['string', 'null'],
+					description:
+						'ID of container module (branch/loop/agent) to insert into. Use with branchPath.'
+				},
+				branchPath: {
+					type: ['string', 'null'],
+					description:
+						"Path inside container: 'modules' (loops), 'branches.0'/'branches.1'/etc (specific branch), 'default' (branchone default), 'tools' (aiagent). To add a NEW branch to branchall/branchone, use the add_branch tool instead."
+				},
+				value: simplifiedFlowModuleSchema
+			},
+			required: ['value']
+		}
+	}
+}
+
+/**
+ * modify_module tool definition: takes the target module `id` and a replacement `value` described by simplifiedFlowModuleSchema (no circular refs); both are required.
+ */
+const noSchemaModifyModuleToolDef: ChatCompletionFunctionTool = {
+	type: 'function',
+	function: {
+		strict: false,
+		name: 'modify_module',
+		description:
+			"Modify an existing module (full replacement). Use for changing configuration, transforms, or conditions. NOT for adding/removing nested modules - use add_module/remove_module. Reserved IDs: 'failure', 'preprocessor', 'Input'.",
+		parameters: {
+			type: 'object',
+			properties: {
+				id: {
+					type: 'string',
+					description: 'ID of the module to modify'
+				},
+				value: simplifiedFlowModuleSchema
+			},
+			required: ['id', 'value']
+		}
+	}
+}
+
+/**
+ * Definition of the add_branch tool for branchone/branchall containers. Only `containerId` is required;
+ * `expr` is documented as branchone-only and `skip_failure` as branchall-only. New branches start with no modules.
+ */
+const addBranchToolDef: ChatCompletionFunctionTool = {
+	type: 'function',
+	function: {
+		strict: false,
+		name: 'add_branch',
+		description:
+			'Add a new branch to a branchone or branchall container. The branch will have an empty modules array - use add_module with insideId and branchPath to add steps inside the branch.',
+		parameters: {
+			type: 'object',
+			properties: {
+				containerId: {
+					type: 'string',
+					description: 'ID of the branchone or branchall container to add a branch to'
+				},
+				summary: {
+					type: 'string',
+					description:
+						'Short description of the branch (e.g., "Handle admin users", "Process errors")'
+				},
+				expr: {
+					type: 'string',
+					description:
+						"JavaScript expression for branchone only. Return true to execute this branch. Can use 'results.step_id' or 'flow_input'. Example: \"results.check_role === 'admin'\""
+				},
+				skip_failure: {
+					type: 'boolean',
+					description:
+						'For branchall only. If true, failure in this branch does not fail the entire flow. Default: false'
+				}
+			},
+			required: ['containerId']
+		}
+	}
+}
+
+/**
+ * Handler for the add_branch tool: appends one branch with no modules to a branchone or
+ * branchall container, saves the flow through helpers.setFlowJson, and returns a status message.
+ */
+async function addBranchImpl({
+	helpers,
+	args
+}: {
+	args: { containerId: string; summary?: string; expr?: string; skip_failure?: boolean }
+	helpers: FlowAIChatHelpers
+}): Promise<string> {
+	const { containerId, summary: label = '', expr: condition, skip_failure: tolerant = false } = args
+	const { flow } = helpers.getFlowAndSelectedId()
+	const target = findModuleInFlow(flow.value.modules, containerId)
+
+	// Unknown ID: report it as a message instead of throwing
+	if (!target) {
+		return `Error: Container with ID '${containerId}' not found in flow`
+	}
+
+	const container = target.value
+
+	switch (container.type) {
+		case 'branchone': {
+			// A missing or empty condition is stored as 'false'
+			const branch = { summary: label, expr: condition || 'false', modules: [] as FlowModule[] }
+			const branches = [...(container.branches || []), branch]
+			container.branches = branches
+			const position = branches.length - 1
+			helpers.setFlowJson(JSON.stringify(flow))
+			return `Added branch ${position} to branchone '${containerId}' with expr: "${branch.expr}". Use add_module({ insideId: "${containerId}", branchPath: "branches.${position}", value: {...} }) to add modules inside this branch.`
+		}
+
+		case 'branchall': {
+			// expr is not used for branchall; skip_failure is
+			const branch = { summary: label, skip_failure: tolerant, modules: [] as FlowModule[] }
+			const branches = [...(container.branches || []), branch]
+			container.branches = branches
+			const position = branches.length - 1
+			helpers.setFlowJson(JSON.stringify(flow))
+			return `Added branch ${position} to branchall '${containerId}'. Use add_module({ insideId: "${containerId}", branchPath: "branches.${position}", value: {...} }) to add modules inside this branch.`
+		}
+
+		default:
+			// Any other module type is rejected with an error message
+			return `Error: Module '${containerId}' is not a branchone or branchall container (type: ${container.type})`
+	}
+}
+
+/**
+ * Prompt section interpolated into NO_FULL_SCHEMA_SYSTEM_PROMPT: worked add_module/add_branch examples for creating empty containers (loops, branches, AI agents) and then filling them
+ */
+const CONTAINER_MODULE_INSTRUCTIONS = `
+
+## Creating Container Modules (Loops, Branches, AI Agents)
+
+IMPORTANT: When creating container modules, you MUST use a multi-step process:
+
+1. **First**: Add the container module with an EMPTY modules/branches/tools array
+2. **For branches**: Use the \`add_branch\` tool to add branches to branchone/branchall
+3. **Then**: Use separate \`add_module\` calls to add modules inside the container
+
+### For Loops (forloopflow)
+\`\`\`javascript
+// Step 1: Create the loop container (modules MUST be empty [])
+add_module({ afterId: "previous_step", value: {
+  id: "my_loop",
+  value: { type: "forloopflow", modules: [], iterator: { type: "javascript", expr: "results.step_a" }, skip_failures: false }
+}})
+
+// Step 2: Add modules inside the loop
+add_module({ insideId: "my_loop", branchPath: "modules", value: { id: "step_in_loop", value: { type: "rawscript", ... } }})
+\`\`\`
+
+### While Loops (whileloopflow)
+\`\`\`javascript
+// Step 1: Create with empty modules
+add_module({ afterId: "previous_step", value: {
+  id: "my_while",
+  value: { type: "whileloopflow", modules: [], skip_failures: false }
+}})
+
+// Step 2: Add modules inside (use stop_after_if to control loop termination)
+add_module({ insideId: "my_while", branchPath: "modules", value: {
+  id: "check_condition",
+  stop_after_if: { expr: "result.done === true" },
+  value: { type: "rawscript", ... }
+}})
+\`\`\`
+
+### Conditional Branches (branchone)
+\`\`\`javascript
+// Step 1: Create branch container with empty arrays
+add_module({ afterId: "previous_step", value: {
+  id: "my_branch",
+  value: { type: "branchone", branches: [], default: [] }
+}})
+
+// Step 2: Use add_branch to add conditional branches
+add_branch({ containerId: "my_branch", summary: "Condition 1", expr: "results.step_a > 10" })
+add_branch({ containerId: "my_branch", summary: "Condition 2", expr: "results.step_a < 0" })
+
+// Step 3: Add modules inside each branch (branches.0, branches.1, etc.)
+add_module({ insideId: "my_branch", branchPath: "branches.0", value: { id: "step_if_positive", value: {...} }})
+add_module({ insideId: "my_branch", branchPath: "branches.1", value: { id: "step_if_negative", value: {...} }})
+
+// Step 4: Add modules to default branch (executed if no conditions match)
+add_module({ insideId: "my_branch", branchPath: "default", value: { id: "step_default", value: {...} }})
+\`\`\`
+
+### Parallel Branches (branchall)
+\`\`\`javascript
+// Step 1: Create with empty branches
+add_module({ afterId: "previous_step", value: {
+  id: "parallel_tasks",
+  value: { type: "branchall", branches: [], parallel: true }
+}})
+
+// Step 2: Use add_branch to add parallel branches
+add_branch({ containerId: "parallel_tasks", summary: "Branch A" })
+add_branch({ containerId: "parallel_tasks", summary: "Branch B", skip_failure: true })
+
+// Step 3: Add modules to each branch (branches.0, branches.1, etc.)
+add_module({ insideId: "parallel_tasks", branchPath: "branches.0", value: { id: "task_a", value: {...} }})
+add_module({ insideId: "parallel_tasks", branchPath: "branches.1", value: { id: "task_b", value: {...} }})
+\`\`\`
+
+### AI Agents (aiagent)
+\`\`\`javascript
+// Step 1: Create agent with empty tools
+add_module({ afterId: "previous_step", value: {
+  id: "my_agent",
+  value: {
+    type: "aiagent",
+    tools: [],
+    input_transforms: {
+      provider: { type: "static", value: "openai" },
+      user_message: { type: "javascript", expr: "flow_input.question" },
+      output_type: { type: "static", value: "text" }
+    }
+  }
+}})
+
+// Step 2: Add tools to the agent
+add_module({ insideId: "my_agent", branchPath: "tools", value: {
+  id: "search_tool",
+  summary: "search_database",
+  value: { tool_type: "flowmodule", type: "rawscript", language: "bun", content: "...", input_transforms: {} }
+}})
+\`\`\`
+`
+
+/**
+ * Full system prompt for the no-full-schema variant (embeds CONTAINER_MODULE_INSTRUCTIONS). Example tool summaries use underscores, following the prompt's own rule that tool summaries cannot contain spaces.
+ */
+const NO_FULL_SCHEMA_SYSTEM_PROMPT = `You are a helpful assistant that creates and edits workflows on the Windmill platform.
+
+## IMPORTANT RULES
+
+**Reserved IDs - Do NOT use these in add_module, modify_module, or remove_module:**
+- \`failure\` - Reserved for failure handler module
+- \`preprocessor\` - Reserved for preprocessor module
+- \`Input\` - Reserved for flow input reference
+
+## Tool Selection Guide
+
+**Flow Modification:**
+- **Add a new module** → \`add_module\`
+- **Remove a module** → \`remove_module\`
+- **Add a new branch to branchall/branchone** → \`add_branch\` (NOT add_module)
+- **Remove a branch from branchall/branchone** → \`remove_branch\`
+- **Change module code only** → \`set_module_code\`
+- **Change module config/transforms/conditions** → \`modify_module\`
+- **Update flow input parameters** → \`set_flow_schema\`
+
+**Code & Scripts:**
+- **View existing inline script code** → \`inspect_inline_script\`
+- **Get language-specific coding instructions** → \`get_instructions_for_code_generation\` (call BEFORE writing code)
+- **Find workspace scripts** → \`search_scripts\`
+- **Find Windmill Hub scripts** → \`search_hub_scripts\`
+
+**Testing:**
+- **Test entire flow** → \`test_run_flow\`
+- **Test single step** → \`test_run_step\`
+
+**Resources & Schema:**
+- **Search resource types** → \`resource_type\`
+- **Get database schema** → \`get_db_schema\`
+
+## Common Mistakes to Avoid
+
+- **Don't use \`modify_module\` to add/remove nested modules** - Use \`add_module\`/\`remove_module\` instead
+- **Don't forget \`input_transforms\`** - Rawscript parameters won't receive values without them
+- **Don't use spaces in module IDs** - Use underscores (e.g., \`fetch_data\` not \`fetch data\`)
+- **Don't reference future steps** - \`results.step_id\` only works for steps that execute before the current one
+- **Don't create duplicate IDs** - Each module ID must be unique in the flow. Always generate fresh, unique IDs for new modules. Never reuse IDs from existing or previously removed modules
+- **Don't provide nested modules directly** - Always create containers with empty arrays, then add modules via separate add_module calls
+${CONTAINER_MODULE_INSTRUCTIONS}
+## User Instructions
+
+Follow the user instructions carefully.
+At the end of your changes, explain precisely what you did and what the flow does now.
+ALWAYS test your modifications. You have access to the \`test_run_flow\` and \`test_run_step\` tools to test the flow and steps. If you only modified a single step, use the \`test_run_step\` tool to test it. If you modified the flow, use the \`test_run_flow\` tool to test it. If the user cancels the test run, do not try again and wait for the next user instruction.
+When testing steps that are sql scripts, the arguments to be passed are { database: $res:<db_resource> }.
+
+### Inline Script References (Token Optimization)
+
+To reduce token usage, rawscript content in the flow you receive is replaced with references in the format \`inline_script.{module_id}\`. For example:
+
+\`\`\`json
+{
+  "id": "step_a",
+  "value": {
+    "type": "rawscript",
+    "content": "inline_script.step_a",
+    "language": "bun"
+  }
+}
+\`\`\`
+
+**To modify existing script code:**
+- Use \`set_module_code\` tool for code-only changes: \`set_module_code({ moduleId: "step_a", code: "..." })\`
+
+**To add a new inline script module:**
+- Use \`add_module\` with the full code content directly (not a reference)
+- Avoid coding in single lines, always use multi-line code blocks.
+- The system will automatically store and optimize it
+
+**To inspect existing code:**
+- Use \`inspect_inline_script\` tool to view the current code: \`inspect_inline_script({ moduleId: "step_a" })\`
+
+### Input Transforms for Rawscripts
+
+Rawscript modules use \`input_transforms\` to map function parameters to values. Each key in \`input_transforms\` corresponds to a parameter name in your script's \`main\` function.
+
+**Transform Types:**
+- \`static\`: Fixed value passed directly
+- \`javascript\`: Dynamic expression evaluated at runtime
+
+**Available Variables in JavaScript Expressions:**
+- \`flow_input.{property}\` - Access flow input parameters
+- \`results.{step_id}\` - Access output from a previous step
+- \`flow_input.iter.value\` - Current item when inside a for-loop
+- \`flow_input.iter.index\` - Current index when inside a for-loop
+
+**Example - Rawscript using flow input and previous step result:**
+\`\`\`json
+{
+  "id": "step_b",
+  "value": {
+    "type": "rawscript",
+    "language": "bun",
+    "content": "export async function main(userId: string, data: any[]) {
+		return "Hello, world!";
+	}",
+    "input_transforms": {
+      "userId": {
+        "type": "javascript",
+        "expr": "flow_input.user_id"
+      },
+      "data": {
+        "type": "javascript",
+        "expr": "results.step_a"
+      }
+    }
+  }
+}
+\`\`\`
+
+**Example - Static value:**
+\`\`\`json
+{
+  "input_transforms": {
+    "limit": {
+      "type": "static",
+      "value": 100
+    }
+  }
+}
+\`\`\`
+
+**Important:** The parameter names in \`input_transforms\` must match the function parameter names in your script. When you create or modify a rawscript, always define \`input_transforms\` to connect it to flow inputs or results from other steps.
+
+### Other Key Concepts
+- **Resources**: For flow inputs, use type "object" with format "resource-<type>". For step inputs, use "$res:path/to/resource"
+- **Module IDs**: Must be unique and valid identifiers. Used to reference results via \`results.step_id\`
+- **Module types**: Use 'bun' as default language for rawscript if unspecified
+
+### Writing Code for Modules
+
+**IMPORTANT: Before writing any code for a rawscript module, you MUST call the \`get_instructions_for_code_generation\` tool with the target language.** This tool provides essential language-specific instructions.
+
+Always call this tool first when:
+- Creating a new rawscript module
+- Modifying existing code in a module
+- Setting code via \`set_module_code\`
+
+Example: Before writing TypeScript/Bun code, call \`get_instructions_for_code_generation({ language: "bun" })\`
+
+### Creating New Steps
+
+1. **Search for existing scripts first** (unless user explicitly asks to write from scratch):
+   - First: \`search_scripts\` to find workspace scripts
+   - Then: \`search_hub_scripts\` (only consider highly relevant results)
+   - Only create a raw script if no suitable script is found
+
+2. **Add the module using \`add_module\`:**
+   - If using existing script: \`add_module({ afterId: "previous_step", value: { id: "new_step", value: { type: "script", path: "f/folder/script" } } })\`
+   - If creating rawscript:
+     - Default language is 'bun' if not specified
+     - **First call \`get_instructions_for_code_generation\` to get the correct code format**
+     - Include full code in the content field
+     - Always define \`input_transforms\` to connect parameters to flow inputs or previous step results
+
+3. **Update flow schema if needed:**
+   - If your module references flow_input properties that don't exist yet, add them using \`set_flow_schema\`
+
+### AI Agent Tools
+
+AI agents can use tools to accomplish tasks. To manage tools for an AI agent:
+
+- **Adding a tool to an AI agent**: Use \`add_module\` with \`insideId\` set to the agent's ID and \`branchPath: "tools"\`
+  - Tool order doesn't affect execution, so you can omit \`afterId\` (defaults to inserting at beginning)
+  - Example: \`add_module({ insideId: "ai_agent_step", branchPath: "tools", value: { id: "search_docs", summary: "search_documentation", value: { tool_type: "flowmodule", type: "rawscript", language: "bun", content: "...", input_transforms: {} } } })\`
+
+- **Removing a tool from an AI agent**: Use \`remove_module\` with the tool's ID
+  - The tool will be found and removed from the agent's tools array
+
+- **Modifying a tool**: Use \`modify_module\` with the tool's ID
+  - Example: \`modify_module({ id: "search_docs", value: { ... } })\`
+
+- **Tool IDs**: Cannot contain spaces - use underscores (e.g., \`get_user_data\` not \`get user data\`)
+- **Tool summaries**: Unlike other module summaries, tool summaries cannot contain spaces, use underscores instead.
+
+- **Tool types**:
+  - \`flowmodule\`: A script/flow that the agent can call (same as regular flow modules but with \`tool_type: "flowmodule"\`)
+  - \`mcp\`: Reference to an MCP server tool
+
+**Example - Adding a rawscript tool to an agent:**
+\`\`\`json
+add_module({
+  insideId: "my_agent",
+  branchPath: "tools",
+  value: {
+    id: "fetch_weather",
+    summary: "get_current_weather",
+    value: {
+      tool_type: "flowmodule",
+      type: "rawscript",
+      language: "bun",
+      content: "export async function main(location: string) { ... }",
+      input_transforms: {
+        location: { type: "static", value: "" }
+      }
+    }
+  }
+})
+\`\`\`
+
+## Resource Types
+On Windmill, credentials and configuration are stored in resources. Resource types define the format of the resource.
+- Use the \`resource_type\` tool to search for available resource types (e.g. stripe, google, postgresql, etc.)
+- If the user needs a resource as flow input, set the property type in the schema to "object" and add a key called "format" set to "resource-nameofresourcetype" (e.g. "resource-stripe")
+- If the user wants a specific resource as step input, set the step value to a static string in the format: "$res:path/to/resource"
+
+### Contexts
+
+You have access to the following contexts:
+- Database schemas: Schema of databases the user is using
+- Flow diffs: Diff between current flow and last deployed flow
+- Focused flow modules: IDs of modules the user is focused on. Your response should focus on these modules
+`
+
+/**
+ * Look up an entry of the production flowTools list by its function name (undefined when absent)
+ */
+function getProductionTool(name: string): Tool<FlowAIChatHelpers> | undefined {
+	return flowTools.find(({ def }) => def.function.name === name)
+}
+
+/**
+ * Assemble the tool list for the no-full-schema variant.
+ * Production tools are passed through unchanged, except add_module and modify_module:
+ * those keep their production implementation but expose the simplified definitions.
+ * The variant-only add_branch tool is appended last.
+ * Throws if either of those two tools is missing from flowTools.
+ */
+function buildNoSchemaTools(): Tool<FlowAIChatHelpers>[] {
+	const baseAddModule = getProductionTool('add_module')
+	const baseModifyModule = getProductionTool('modify_module')
+
+	// Both originals are required: their implementations are reused below
+	if (!(baseAddModule && baseModifyModule)) {
+		throw new Error('Could not find add_module or modify_module in production tools')
+	}
+
+	// Names whose production definitions get swapped out
+	const replacedNames = ['add_module', 'modify_module']
+	const passThrough = flowTools.filter(
+		(tool) => !replacedNames.includes(tool.def.function.name)
+	)
+
+	// Resulting order: pass-through tools, add_module, modify_module, add_branch
+	const tools: Tool<FlowAIChatHelpers>[] = [...passThrough]
+
+	tools.push(
+		// Simplified definition, production implementation
+		{ ...baseAddModule, def: noSchemaAddModuleToolDef },
+		{ ...baseModifyModule, def: noSchemaModifyModuleToolDef }
+	)
+
+	tools.push(
+		// Variant-only tool backed by addBranchImpl
+		{ def: addBranchToolDef, fn: addBranchImpl }
+	)
+
+	return tools
+}
+
+/**
+ * No-Full-Schema Variant
+ *
+ * Pairs NO_FULL_SCHEMA_SYSTEM_PROMPT with the tool list from buildNoSchemaTools(), whose add_module and
+ * modify_module definitions use simplified schemas without circular references. buildNoSchemaTools() runs
+ * when this module loads and throws if add_module or modify_module is missing from the production tools.
+ */
+export const NO_FULL_SCHEMA_VARIANT: VariantConfig = {
+	name: 'no-full-schema',
+	description:
+		'Simplified tool schemas without circular refs. Containers require nested add_module calls.',
+	systemPrompt: {
+		type: 'custom',
+		content: NO_FULL_SCHEMA_SYSTEM_PROMPT
+	},
+	tools: {
+		type: 'custom',
+		tools: buildNoSchemaTools()
+	}
+}
diff --git a/frontend/src/lib/components/copilot/chat/flow/core.ts b/frontend/src/lib/components/copilot/chat/flow/core.ts
index 55203709171b8..4f9b6c5ac9a50 100644
--- a/frontend/src/lib/components/copilot/chat/flow/core.ts
+++ b/frontend/src/lib/components/copilot/chat/flow/core.ts
@@ -276,8 +276,6 @@ class WorkspaceScriptsSearch {
 const testRunFlowSchema = z.object({
 	args: z
 		.object({})
-		.nullable()
-		.optional()
 		.describe('Arguments to pass to the flow (optional, uses default flow inputs if not provided)')
 })
 
@@ -894,7 +892,7 @@ export const flowTools: Tool<FlowAIChatHelpers>[] = [
  * Formats the OpenFlow schema for inclusion in the AI system prompt.
  * Extracts only the component schemas and formats them as JSON for the AI to reference.
  */
-function formatOpenFlowSchemaForPrompt(): string {
+export function formatOpenFlowSchemaForPrompt(): string {
 	const schemas = openFlowSchema.components?.schemas
 	if (!schemas) {
 		return 'Schema not available'
diff --git a/frontend/src/lib/test-setup.ts b/frontend/src/lib/test-setup.ts
new file mode 100644
index 0000000000000..d34a9369afa69
--- /dev/null
+++ b/frontend/src/lib/test-setup.ts
@@ -0,0 +1,60 @@
+/**
+ * Vitest setup file to mock browser globals for testing
+ */
+
+// Mock localStorage: in-memory stand-in; prototype-less store so "toString" etc. read as null
+const localStorageMock = {
+	store: Object.create(null) as Record<string, string>,
+	getItem(key: string) {
+		return this.store[key] ?? null // null for missing keys, like the Web Storage API
+	},
+	setItem(key: string, value: string) {
+		this.store[key] = String(value) // Web Storage always stores strings
+	},
+	removeItem(key: string) {
+		delete this.store[key]
+	},
+	clear() {
+		this.store = Object.create(null) // fresh prototype-less map, not {}
+	},
+	get length() {
+		return Object.keys(this.store).length
+	},
+	key(index: number) {
+		return Object.keys(this.store)[index] ?? null // null when index is out of range
+	}
+}
+
+// Mock sessionStorage: in-memory stand-in; prototype-less store so "toString" etc. read as null
+const sessionStorageMock = {
+	store: Object.create(null) as Record<string, string>,
+	getItem(key: string) {
+		return this.store[key] ?? null // null for missing keys, like the Web Storage API
+	},
+	setItem(key: string, value: string) {
+		this.store[key] = String(value) // Web Storage always stores strings
+	},
+	removeItem(key: string) {
+		delete this.store[key]
+	},
+	clear() {
+		this.store = Object.create(null) // fresh prototype-less map, not {}
+	},
+	get length() {
+		return Object.keys(this.store).length
+	},
+	key(index: number) {
+		return Object.keys(this.store)[index] ?? null // null when index is out of range
+	}
+}
+
+// Assign to globalThis. configurable: true (defineProperty defaults it to false) keeps the
+// globals re-definable, so a test can later replace or delete them without a TypeError.
+const storageGlobals = { localStorage: localStorageMock, sessionStorage: sessionStorageMock }
+for (const [name, value] of Object.entries(storageGlobals)) {
+	Object.defineProperty(globalThis, name, {
+		value,
+		writable: true,
+		configurable: true
+	})
+}
\ No newline at end of file
diff --git a/frontend/vite.config.js b/frontend/vite.config.js
index 7a8323ae7ad76..142c75f50045d 100644
--- a/frontend/vite.config.js
+++ b/frontend/vite.config.js
@@ -117,7 +117,8 @@ const config = {
 					name: 'server',
 					environment: 'node',
 					include: ['src/**/*.{test,spec}.{js,ts}'],
-					exclude: ['src/**/*.svelte.{test,spec}.{js,ts}']
+					exclude: ['src/**/*.svelte.{test,spec}.{js,ts}'],
+					setupFiles: ['src/lib/test-setup.ts']
 				}
 			}
 		]