diff --git a/src/ai-sdk/middleware/index.ts b/src/ai-sdk/middleware/index.ts index 37a8a9b5..fc0c7b42 100644 --- a/src/ai-sdk/middleware/index.ts +++ b/src/ai-sdk/middleware/index.ts @@ -3,6 +3,10 @@ export { type PlaceholderOptions, type PlaceholderResult, } from "./placeholder"; +export { + type PrerenderFallbackOptions, + prerenderFallbackMiddleware, +} from "./prerender"; export { type ImagePlaceholderFallbackOptions, imagePlaceholderFallbackMiddleware, diff --git a/src/ai-sdk/middleware/prerender.ts b/src/ai-sdk/middleware/prerender.ts new file mode 100644 index 00000000..c0bd2782 --- /dev/null +++ b/src/ai-sdk/middleware/prerender.ts @@ -0,0 +1,205 @@ +import { unlink } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import type { ImageModelV3 } from "@ai-sdk/provider"; +import type { generateImage } from "ai"; +import type { VideoModelV3CallOptions } from "../video-model"; +import type { VideoModelMiddleware } from "./wrap-video-model"; + +export interface PrerenderFallbackOptions { + /** + * Image model to use for generating still frames from text-to-video prompts. + * When a Video element has only a text prompt (no input images), this model + * generates the placeholder image that becomes the still frame. + */ + imageModel: ImageModelV3; + + /** + * The generateImage function to use (should be the cached version). + */ + generateImageFn: typeof generateImage; + + /** + * Callback when a video is replaced with a still frame. + */ + onPrerender?: (prompt: string, hasInputImage: boolean) => void; +} + +/** + * Creates a still-frame video from an image using ffmpeg. + * The video holds the image for the specified duration. 
+ */ +async function imageToStillVideo( + imageData: Uint8Array, + duration: number, + resolution?: string, +): Promise<Uint8Array> { + const ts = Date.now(); + const rand = Math.random().toString(36).slice(2); + const imgPath = join(tmpdir(), `prerender_img_${ts}_${rand}.png`); + const outPath = join(tmpdir(), `prerender_vid_${ts}_${rand}.mp4`); + + try { + await Bun.write(imgPath, imageData); + + // Parse resolution for scaling, default to 1080x1920 + let scaleFilter = ""; + if (resolution) { + const [w, h] = resolution.split("x").map(Number); + if (w && h) { + scaleFilter = `-vf scale=${w}:${h}:force_original_aspect_ratio=decrease,pad=${w}:${h}:(ow-iw)/2:(oh-ih)/2`; + } + } + + const { $ } = await import("bun"); + + const args = [ + "ffmpeg", + "-y", + "-loop", + "1", + "-i", + imgPath, + "-t", + String(duration), + "-r", + "30", + ...(scaleFilter ? scaleFilter.split(" ") : []), + "-c:v", + "libx264", + "-preset", + "ultrafast", + "-pix_fmt", + "yuv420p", + "-tune", + "stillimage", + outPath, + ]; + + const result = await $`${args}`.quiet().nothrow(); + + if (result.exitCode !== 0) { + const stderr = result.stderr.toString().trim(); + throw new Error( + `ffmpeg still-frame failed (exit ${result.exitCode}): ${stderr || "unknown error"}`, + ); + } + + const data = await Bun.file(outPath).bytes(); + return new Uint8Array(data); + } finally { + await unlink(imgPath).catch(() => {}); + await unlink(outPath).catch(() => {}); + } +} + +/** + * Extracts the first image file from VideoModelV3CallOptions.files. + * Returns the image data if found, undefined otherwise. 
+ */ +async function extractFirstImage( + params: VideoModelV3CallOptions, +): Promise<Uint8Array | undefined> { + if (!params.files) return undefined; + + for (const file of params.files) { + if (file.type === "file" && file.mediaType?.startsWith("image/")) { + if (file.data instanceof Uint8Array) { + return file.data; + } + if (typeof file.data === "string") { + // base64 + return Uint8Array.from(atob(file.data), (c) => c.charCodeAt(0)); + } + } + if (file.type === "url") { + // Fetch the URL to get binary data + try { + const response = await fetch(file.url); + const contentType = response.headers.get("content-type") ?? ""; + if (contentType.startsWith("image/")) { + return new Uint8Array(await response.arrayBuffer()); + } + } catch { + // Skip URLs that can't be fetched + } + } + } + + return undefined; +} + +/** + * Middleware that replaces video generation with still-frame images. + * + * - For i2v (image-to-video): uses the input image as the still frame + * - For t2v (text-to-video): generates an image using the configured image + * model and uses it as the still frame + * + * The resulting video has the exact duration specified in the clip, + * making it suitable for verifying visual-audio sync before expensive + * video generation. + */ +export function prerenderFallbackMiddleware( + options: PrerenderFallbackOptions, +): VideoModelMiddleware { + const { imageModel, generateImageFn, onPrerender } = options; + + return { + wrapGenerate: async ({ doGenerate, params, model }) => { + const duration = params.duration ?? 
3; + + // Try to extract an existing image from the input files (i2v case) + const inputImage = await extractFirstImage(params); + + let frameImage: Uint8Array; + + if (inputImage) { + // i2v: use the input image directly as the still frame + frameImage = inputImage; + onPrerender?.(params.prompt, true); + } else { + // t2v: generate an image from the text prompt + const prompt = params.prompt || "placeholder"; + onPrerender?.(prompt, false); + + const { images } = await generateImageFn({ + model: imageModel, + prompt, + n: 1, + aspectRatio: params.aspectRatio, + } as Parameters<typeof generateImageFn>[0]); + + const firstImage = images[0]; + if (!firstImage?.uint8Array) { + throw new Error( + `prerender: image generation returned no data for prompt: ${prompt.slice(0, 80)}`, + ); + } + frameImage = firstImage.uint8Array; + } + + // Create still-frame video with exact duration + const videoData = await imageToStillVideo( + frameImage, + duration, + params.resolution, + ); + + return { + videos: [videoData], + warnings: [ + { + type: "other" as const, + message: `prerender: still frame (${inputImage ? 
"i2v input" : "t2v generated"}, ${duration}s)`, + }, + ], + response: { + timestamp: new Date(), + modelId: `prerender:${model.modelId}`, + headers: undefined, + }, + }; + }, + }; +} diff --git a/src/ai-sdk/middleware/wrap-video-model.ts b/src/ai-sdk/middleware/wrap-video-model.ts index fa5fb486..4805e9ab 100644 --- a/src/ai-sdk/middleware/wrap-video-model.ts +++ b/src/ai-sdk/middleware/wrap-video-model.ts @@ -1,7 +1,7 @@ import type { VideoModelV3, VideoModelV3CallOptions } from "../video-model"; import { generatePlaceholder } from "./placeholder"; -export type RenderMode = "strict" | "preview"; +export type RenderMode = "strict" | "preview" | "prerender"; export interface VideoModelMiddleware { transformParams?: (options: { diff --git a/src/cli/commands/index.ts b/src/cli/commands/index.ts index 0ffae02e..c4248574 100644 --- a/src/cli/commands/index.ts +++ b/src/cli/commands/index.ts @@ -7,6 +7,7 @@ export { initCmd, showInitHelp } from "./init.tsx"; export { listCmd, showListHelp } from "./list.tsx"; export { loginCmd } from "./login.tsx"; export { logoutCmd } from "./logout.ts"; +export { prerenderCmd, showPrerenderHelp } from "./prerender.tsx"; export { previewCmd, renderCmd, diff --git a/src/cli/commands/prerender.tsx b/src/cli/commands/prerender.tsx new file mode 100644 index 00000000..a5037b92 --- /dev/null +++ b/src/cli/commands/prerender.tsx @@ -0,0 +1,228 @@ +/** @jsxImportSource react */ + +import { mkdirSync } from "node:fs"; +import { dirname } from "node:path"; +import { defineCommand } from "citty"; +import { Box, Text } from "ink"; +import { render } from "../../react/render"; +import type { DefaultModels } from "../../react/types"; +import { Header, HelpBlock, VargBox, VargText } from "../ui/index.ts"; +import { renderStatic } from "../ui/render.ts"; +import { detectDefaultModels, loadComponent, sharedArgs } from "./render.tsx"; + +const DEFAULT_PRERENDER_IMAGE_MODEL = "nano-banana-2"; + +export const prerenderCmd = defineCommand({ + meta: { + 
name: "prerender", + description: + "render with real images + speech but still-frame video (no video generation)", + }, + args: { + ...sharedArgs, + "image-model": { + type: "string" as const, + description: `image model for t2v replacement (default: ${DEFAULT_PRERENDER_IMAGE_MODEL})`, + }, + }, + async run({ args }) { + const file = args.file as string; + + if (!file) { + console.error( + "usage: varg prerender [-o output.mp4] [--image-model nano-banana-2]", + ); + process.exit(1); + } + + const component = await loadComponent(file); + + if (!component || component.type !== "render") { + console.error("error: default export must be a element"); + process.exit(1); + } + + const basename = file + .replace(/\.tsx?$/, "") + .split("/") + .pop(); + const outputPath = + (args.output as string) ?? `output/${basename}-prerender.mp4`; + + mkdirSync(dirname(outputPath), { recursive: true }); + + if (!args.quiet) { + console.log(`prerendering ${file} → ${outputPath}`); + } + + const useCache = !args["no-cache"]; + + const defaults = await detectDefaultModels(); + + // Resolve the prerender image model + const imageModelId = + (args["image-model"] as string) ?? DEFAULT_PRERENDER_IMAGE_MODEL; + const prerenderImageModel = await resolvePrerenderImageModel( + imageModelId, + defaults, + ); + + const result = await render(component, { + output: outputPath, + cache: useCache ? (args.cache as string) : undefined, + mode: "prerender", + defaults: { + ...defaults, + prerenderImage: prerenderImageModel, + }, + verbose: args.verbose as boolean, + }); + + if (!args.quiet) { + console.log(`done! ${result.video.byteLength} bytes → ${outputPath}`); + } + + if (args.open) { + const { $ } = await import("bun"); + await $`open ${outputPath}`.quiet(); + } + }, +}); + +/** + * Resolve the prerender image model from a model ID string. + * Uses the same provider detection logic as detectDefaultModels. 
+ */ +async function resolvePrerenderImageModel( + modelId: string, + defaults?: DefaultModels, +) { + // Try varg gateway first + let hasVargKey = !!process.env.VARG_API_KEY; + if (!hasVargKey) { + try { + const { getGlobalApiKey } = await import("../credentials"); + hasVargKey = !!getGlobalApiKey(); + } catch { + // credentials module may not be available + } + } + + if (hasVargKey) { + const { varg } = await import("../../ai-sdk/providers/varg"); + return varg.imageModel(modelId); + } + + // Fall back to fal + const falKey = process.env.FAL_API_KEY ?? process.env.FAL_KEY; + if (falKey) { + const { fal } = await import("../../ai-sdk/providers/fal"); + return fal.imageModel(modelId); + } + + // Fall back to default image model + if (defaults?.image) { + return defaults.image; + } + + throw new Error( + `Cannot resolve prerender image model '${modelId}'. Set VARG_API_KEY or FAL_API_KEY.`, + ); +} + +function PrerenderHelpView() { + const examples = [ + { + command: "varg prerender video.tsx", + description: "prerender to output/video-prerender.mp4", + }, + { + command: "varg prerender video.tsx -o preview.mp4", + description: "custom output path", + }, + { + command: "varg prerender video.tsx --image-model flux-schnell", + description: "use flux-schnell for t2v replacement", + }, + { + command: "varg prerender video.tsx --open", + description: "prerender and open in player", + }, + ]; + + return ( + + + + render with real images and speech but replace video generation with + still frames. generates images for text-to-video clips using a fast + image model (default: {DEFAULT_PRERENDER_IMAGE_MODEL}) and uses input + images directly for image-to-video clips. produces a slideshow video + with exact clip durations for visual-audio sync review. + + + +
USAGE
+ + + varg prerender {""} [options] + + + +
OPTIONS
+ + + -o, --output output path + (default: output/{""}-prerender.mp4) + + + --image-model image model for + t2v replacement (default: {DEFAULT_PRERENDER_IMAGE_MODEL}) + + + -c, --cache cache directory + (default: .cache/ai) + + + --no-cache disable cache + + + -v, --verbose show ffmpeg + commands + + + --open open video after + generation + + + -q, --quiet minimal output + + + +
COST COMPARISON
+ + + prerender generates real speech + images but skips expensive video + + + generation (Kling, Wan, etc.). typical savings: $1-4 per render. + + + preview → free (placeholders only, no AI generation) + + prerender → ~$0.50 (real images + speech, still-frame video) + + render → ~$3-5 (full AI video generation) + + +
EXAMPLES
+ + + +
+ ); +} + +export function showPrerenderHelp() { + renderStatic(); +} diff --git a/src/cli/commands/render.tsx b/src/cli/commands/render.tsx index a93cedfc..46b5bf15 100644 --- a/src/cli/commands/render.tsx +++ b/src/cli/commands/render.tsx @@ -14,7 +14,9 @@ import { Captions, Clip, Image, Music, Overlay, Packshot, Render, Slider, Speech import { fal, elevenlabs, replicate, varg } from "vargai/ai"; `; -async function detectDefaultModels(): Promise { +export async function detectDefaultModels(): Promise< + DefaultModels | undefined +> { const defaults: DefaultModels = {}; // Gateway provider — single key for all models (recommended) @@ -68,7 +70,7 @@ async function resolveDefaultExport(mod: { return component as VargElement; } -async function loadComponent(filePath: string): Promise { +export async function loadComponent(filePath: string): Promise { const resolvedPath = resolve(filePath); const source = await Bun.file(resolvedPath).text(); @@ -141,7 +143,7 @@ async function loadComponent(filePath: string): Promise { } } -const sharedArgs = { +export const sharedArgs = { file: { type: "positional" as const, description: "component file (.tsx)", diff --git a/src/cli/index.ts b/src/cli/index.ts index 26b905af..2837d879 100755 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -21,6 +21,7 @@ import { listCmd, loginCmd, logoutCmd, + prerenderCmd, previewCmd, renderCmd, runCmd, @@ -29,6 +30,7 @@ import { showHelp, showInitHelp, showListHelp, + showPrerenderHelp, showPreviewHelp, showRenderHelp, showRunHelp, @@ -63,6 +65,7 @@ const subcommandHelp: Record void> = { run: showRunHelp, render: showRenderHelp, preview: showPreviewHelp, + prerender: showPrerenderHelp, frame: showFrameHelp, storyboard: showStoryboardHelp, init: showInitHelp, @@ -128,6 +131,7 @@ const main = defineCommand({ init: initCmd, render: renderCmd, preview: previewCmd, + prerender: prerenderCmd, frame: frameCmd, storyboard: storyboardCmd, studio: studioCmd, diff --git a/src/react/renderers/render.ts 
b/src/react/renderers/render.ts index 029bf9b3..aae80a31 100644 --- a/src/react/renderers/render.ts +++ b/src/react/renderers/render.ts @@ -8,6 +8,7 @@ import { generateVideo } from "../../ai-sdk/generate-video"; import { imagePlaceholderFallbackMiddleware, placeholderFallbackMiddleware, + prerenderFallbackMiddleware, wrapVideoModel, } from "../../ai-sdk/middleware"; import { editly, localBackend } from "../../ai-sdk/providers/editly"; @@ -129,6 +130,28 @@ export async function renderRoot( }; const wrapGenerateVideo: typeof generateVideo = async (opts) => { + if (mode === "prerender") { + const prerenderImageModel = options.defaults?.prerenderImage; + if (!prerenderImageModel) { + throw new Error( + "prerender mode requires defaults.prerenderImage model (e.g., varg.imageModel('nano-banana-2'))", + ); + } + // Use uncached generateVideo so we bypass any cached Kling/Wan results + // and always run the prerender middleware (which generates still frames). + // Image generation inside the middleware still uses cachedGenerateImage. + return generateVideo({ + ...opts, + model: wrapVideoModel({ + model: opts.model, + middleware: prerenderFallbackMiddleware({ + imageModel: prerenderImageModel, + generateImageFn: cachedGenerateImage, + }), + }), + } as Parameters<typeof generateVideo>[0]); + } + if (mode === "preview") { trackPlaceholder("video"); return cachedGenerateVideo({ @@ -630,6 +653,13 @@ export async function renderRoot( ); } + if (!options.quiet && mode === "prerender") { + const modelId = options.defaults?.prerenderImage?.modelId ?? 
"unknown"; + console.log( + `\x1b[36mℹ prerender mode: videos replaced with still frames (image model: ${modelId})\x1b[0m`, + ); + } + return { video: new Uint8Array(finalBuffer), files: generatedFiles, diff --git a/src/react/types.ts b/src/react/types.ts index 012c11c2..0d7e7036 100644 --- a/src/react/types.ts +++ b/src/react/types.ts @@ -335,7 +335,7 @@ export interface PackshotProps extends BaseProps { duration?: number; } -export type RenderMode = "strict" | "preview"; +export type RenderMode = "strict" | "preview" | "prerender"; export interface DefaultModels { image?: ImageModelV3; @@ -343,6 +343,8 @@ export interface DefaultModels { speech?: SpeechModelV3; music?: MusicModelV3; transcription?: TranscriptionModelV3; + /** Image model used in prerender mode to replace t2v generation (default: nano-banana-2) */ + prerenderImage?: ImageModelV3; } export interface RenderOptions {