Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 14 additions & 8 deletions src/ai-sdk/generate-video.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ export type GenerateVideoPrompt =
| string
| {
text?: string;
images?: Array<DataContent>;
audio?: DataContent;
video?: DataContent;
images?: DataContent | Array<DataContent>;
audio?: DataContent | Array<DataContent>;
video?: DataContent | Array<DataContent>;
};

export interface GenerateVideoOptions {
Expand Down Expand Up @@ -76,6 +76,12 @@ function toUint8Array(data: DataContent): Uint8Array {
return data;
}

/**
 * Normalize a value that may be singular, an array, or absent into an array.
 *
 * The body already guarded with `== null` (which matches both `null` and
 * `undefined`), so the signature now admits `null` as well — this widens the
 * accepted input without changing behavior for any existing caller.
 *
 * @param value - A single item, an array of items, or `null`/`undefined`.
 * @returns `value` unchanged if it is already an array, `[value]` for a
 *   single item, or `[]` when the value is absent.
 */
function toArray<T>(value: T | T[] | null | undefined): T[] {
  if (value == null) return [];
  return Array.isArray(value) ? value : [value];
}

function normalizePrompt(prompt: GenerateVideoPrompt): {
prompt: string | undefined;
files: ImageModelV3File[] | undefined;
Expand All @@ -86,27 +92,27 @@ function normalizePrompt(prompt: GenerateVideoPrompt): {

const files: ImageModelV3File[] = [];

for (const img of prompt.images ?? []) {
for (const img of toArray(prompt.images)) {
files.push({
type: "file",
mediaType: "image/png",
data: toUint8Array(img),
});
}

if (prompt.audio) {
for (const aud of toArray(prompt.audio)) {
files.push({
type: "file",
mediaType: "audio/mpeg",
data: toUint8Array(prompt.audio),
data: toUint8Array(aud),
});
}

if (prompt.video) {
for (const vid of toArray(prompt.video)) {
files.push({
type: "file",
mediaType: "video/mp4",
data: toUint8Array(prompt.video),
data: toUint8Array(vid),
});
}

Expand Down
126 changes: 124 additions & 2 deletions src/ai-sdk/providers/fal.ts
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ const VIDEO_MODELS: Record<string, { t2v: string; i2v: string }> = {
t2v: "fal-ai/kling-video/o3/standard/text-to-video",
i2v: "fal-ai/kling-video/o3/standard/image-to-video",
},
// Kling O3 4K - native 4K output (i2v only, t2v falls back to pro)
"kling-v3-4k": {
t2v: "fal-ai/kling-video/o3/pro/text-to-video",
i2v: "fal-ai/kling-video/o3/4k/image-to-video",
},
// Kling v2.6 - with native audio generation
"kling-v2.6": {
t2v: "fal-ai/kling-video/v2.6/pro/text-to-video",
Expand Down Expand Up @@ -163,8 +168,21 @@ const VIDEO_EDIT_MODELS: Record<string, string> = {
"sora-2-remix": "fal-ai/sora-2/video-to-video/remix",
};

// Reference-to-video models - images/elements + prompt → video with character consistency
const REFERENCE_VIDEO_MODELS: Record<string, string> = {
"kling-v3-ref": "fal-ai/kling-video/o3/pro/reference-to-video",
"kling-v3-4k-ref": "fal-ai/kling-video/o3/4k/reference-to-video",
};

// Video-to-video reference models - reference video + prompt → new video preserving motion/camera
const V2V_REFERENCE_MODELS: Record<string, string> = {
"kling-v3-v2v-ref": "fal-ai/kling-video/o3/standard/video-to-video/reference",
};

// Motion control models - video-to-video with motion transfer
const MOTION_CONTROL_MODELS: Record<string, string> = {
"kling-v3-motion": "fal-ai/kling-video/v3/pro/motion-control",
"kling-v3-motion-standard": "fal-ai/kling-video/v3/standard/motion-control",
"kling-v2.6-motion": "fal-ai/kling-video/v2.6/pro/motion-control",
"kling-v2.6-motion-standard":
"fal-ai/kling-video/v2.6/standard/motion-control",
Expand Down Expand Up @@ -520,8 +538,12 @@ class FalVideoModel implements VideoModelV3 {
const isMotionControl = MOTION_CONTROL_MODELS[this.modelId] !== undefined;
const isVideoEdit = VIDEO_EDIT_MODELS[this.modelId] !== undefined;
const isVideoUpscale = VIDEO_UPSCALE_MODELS[this.modelId] !== undefined;
const isReferenceVideo = REFERENCE_VIDEO_MODELS[this.modelId] !== undefined;
const isV2VReference = V2V_REFERENCE_MODELS[this.modelId] !== undefined;
const isKlingV3 =
this.modelId === "kling-v3" || this.modelId === "kling-v3-standard";
this.modelId === "kling-v3" ||
this.modelId === "kling-v3-standard" ||
this.modelId === "kling-v3-4k";
const isKlingV26 = this.modelId === "kling-v2.6";
const isLtx2 = this.modelId === "ltx-2-19b-distilled";
const isGrokImagine = this.modelId === "grok-imagine";
Expand All @@ -537,7 +559,11 @@ class FalVideoModel implements VideoModelV3 {
? this.resolveVideoEditEndpoint()
: isVideoUpscale
? this.resolveVideoUpscaleEndpoint()
: this.resolveEndpoint(hasImageInput ?? false);
: isReferenceVideo
? this.resolveReferenceVideoEndpoint()
: isV2VReference
? this.resolveV2VReferenceEndpoint()
: this.resolveEndpoint(hasImageInput ?? false);

const input: Record<string, unknown> = {
...(providerOptions?.fal ?? {}),
Expand Down Expand Up @@ -625,6 +651,86 @@ class FalVideoModel implements VideoModelV3 {
if (videoFile) {
input.video_url = await fileToUrl(videoFile);
}
} else if (isReferenceVideo) {
// Reference-to-video: prompt + optional start/end images + reference images
// Elements and multi_prompt are passed via providerOptions.fal
if (prompt) {
input.prompt = prompt;
}

if (files) {
const imageFiles = files.filter((f) =>
getMediaType(f)?.startsWith("image/"),
);
// First image → start_image_url, second → end_image_url
if (imageFiles[0]) {
input.start_image_url = await fileToUrl(imageFiles[0]);
}
if (imageFiles[1]) {
input.end_image_url = await fileToUrl(imageFiles[1]);
}
// Additional images (3+) → image_urls for style/appearance reference
if (imageFiles.length > 2) {
const additionalUrls: string[] = [];
for (let i = 2; i < imageFiles.length; i++) {
additionalUrls.push(await fileToUrl(imageFiles[i]!));
}
input.image_urls = additionalUrls;
}
}

// Duration as string integer for Kling O3
const normalized = normalizeProviderInput(this.modelId, { duration });
input.duration = normalized.duration;

if (!input.aspect_ratio) {
input.aspect_ratio = aspectRatio ?? "16:9";
}

// Default to generating audio
if (input.generate_audio === undefined) {
input.generate_audio = true;
}
} else if (isV2VReference) {
// Video-to-video reference: reference video + prompt → new video preserving motion/camera
// Elements and image_urls are passed via providerOptions.fal
if (prompt) {
input.prompt = prompt;
}

const videoFile = files?.find((f) =>
getMediaType(f)?.startsWith("video/"),
);
if (videoFile) {
input.video_url = await fileToUrl(videoFile);
}

// Reference images from file inputs (for style/appearance)
if (files) {
const imageFiles = files.filter((f) =>
getMediaType(f)?.startsWith("image/"),
);
if (imageFiles.length > 0) {
const imageUrls: string[] = [];
for (const imgFile of imageFiles) {
imageUrls.push(await fileToUrl(imgFile));
}
input.image_urls = imageUrls;
}
}

// Duration as string integer for Kling O3
const normalized = normalizeProviderInput(this.modelId, { duration });
input.duration = normalized.duration;

if (!input.aspect_ratio) {
input.aspect_ratio = aspectRatio ?? "auto";
}

// Default to keeping original audio from reference video
if (input.keep_audio === undefined) {
input.keep_audio = true;
}
} else {
// Standard video generation
input.prompt = prompt;
Expand Down Expand Up @@ -825,6 +931,22 @@ class FalVideoModel implements VideoModelV3 {

return VIDEO_UPSCALE_MODELS[this.modelId] ?? this.modelId;
}

private resolveReferenceVideoEndpoint(): string {
if (this.modelId.startsWith("raw:")) {
return this.modelId.slice(4);
}

return REFERENCE_VIDEO_MODELS[this.modelId] ?? this.modelId;
}

private resolveV2VReferenceEndpoint(): string {
if (this.modelId.startsWith("raw:")) {
return this.modelId.slice(4);
}

return V2V_REFERENCE_MODELS[this.modelId] ?? this.modelId;
}
}

class FalImageModel implements ImageModelV3 {
Expand Down
10 changes: 10 additions & 0 deletions src/ai-sdk/providers/model-rules.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ const ModelDurationRules: Record<string, z.ZodType> = {
"kling-v3": z.object({ duration: stringIntDuration(3, 15, 5) }),
"kling-v3-standard": z.object({ duration: stringIntDuration(3, 15, 5) }),

// Kling O3 4K: same rules as v3
"kling-v3-4k": z.object({ duration: stringIntDuration(3, 15, 5) }),

// Kling O3 reference-to-video: same duration range
"kling-v3-ref": z.object({ duration: stringIntDuration(3, 15, 5) }),
"kling-v3-4k-ref": z.object({ duration: stringIntDuration(3, 15, 5) }),

// Kling O3 video-to-video reference: same duration range
"kling-v3-v2v-ref": z.object({ duration: stringIntDuration(3, 15, 5) }),

// Kling v2.6: same rules as v3
"kling-v2.6": z.object({ duration: stringIntDuration(3, 15, 5) }),

Expand Down
4 changes: 4 additions & 0 deletions src/core/registry/resolver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ const ALIASES: Record<string, string> = {
stt: "speech-to-text",
voice: "text-to-speech",

// Video-to-video
v2v: "video-to-video",
vid2vid: "video-to-video",

// Video editing
concat: "merge",
join: "merge",
Expand Down
28 changes: 24 additions & 4 deletions src/definitions/actions/video.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ const videoInputSchema = z.object({
image: filePathSchema
.optional()
.describe("Input image (enables image-to-video)"),
video: filePathSchema
.optional()
.describe("Input video for video-to-video (preserves motion/camera style)"),
duration: videoDurationSchema
.default(5)
.describe("Video duration in seconds"),
Expand All @@ -42,7 +45,7 @@ const schema: ZodSchema<typeof videoInputSchema, typeof videoOutputSchema> = {
export const definition: ActionDefinition<typeof schema> = {
type: "action",
name: "video",
description: "Generate video from text or image",
description: "Generate video from text, image, or video",
schema,
routes: [
{
Expand All @@ -51,12 +54,29 @@ export const definition: ActionDefinition<typeof schema> = {
},
],
execute: async (inputs) => {
// inputs is now fully typed as VideoInput - no more `as` cast!
const { prompt, image, duration, aspectRatio } = inputs;
const { prompt, image, video, duration, aspectRatio } = inputs;

let result: { data?: { video?: { url?: string }; duration?: number } };

if (image) {
if (video && image) {
// Video + image → motion control (transfer motion to character)
console.log(
"[action/video] generating motion control video (image + video)",
);
result = await falProvider.motionControl({
prompt,
imageUrl: image,
videoUrl: video,
});
} else if (video) {
// Video only → video-to-video reference (preserve motion/camera)
console.log("[action/video] generating video-to-video reference");
result = await falProvider.videoToVideoReference({
prompt,
videoUrl: video,
duration,
});
} else if (image) {
console.log("[action/video] generating video from image");
result = await falProvider.imageToVideo({
prompt,
Expand Down
26 changes: 24 additions & 2 deletions src/definitions/models/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,15 @@ import type {
export { definition as elevenlabsTts } from "./elevenlabs";
export { definition as flux } from "./flux";
export { definition as heygenAvatar } from "./heygen";
export { definition as kling } from "./kling";
export {
definition as kling,
kling4kDefinition as kling4k,
kling4kRefDefinition as kling4kRef,
klingRefDefinition as klingRef,
klingV2VRefDefinition as klingV2VRef,
klingV3MotionDefinition as klingV3Motion,
klingV3MotionStdDefinition as klingV3MotionStd,
} from "./kling";
export { definition as llama } from "./llama";
export { definition as ltxA2v } from "./ltx-a2v";
export { definition as nanoBanana2 } from "./nano-banana-2";
Expand Down Expand Up @@ -39,7 +47,15 @@ export { definition as whisper } from "./whisper";
import { definition as elevenlabsDefinition } from "./elevenlabs";
import { definition as fluxDefinition } from "./flux";
import { definition as heygenAvatarDefinition } from "./heygen";
import { definition as klingDefinition } from "./kling";
import {
kling4kDefinition,
kling4kRefDefinition,
definition as klingDefinition,
klingRefDefinition,
klingV2VRefDefinition,
klingV3MotionDefinition,
klingV3MotionStdDefinition,
} from "./kling";
import { definition as llamaDefinition } from "./llama";
import { definition as ltxA2vDefinition } from "./ltx-a2v";
import { definition as nanoBanana2Definition } from "./nano-banana-2";
Expand All @@ -65,6 +81,12 @@ import { definition as whisperDefinition } from "./whisper";

export const allModels = [
klingDefinition,
kling4kDefinition,
klingRefDefinition,
kling4kRefDefinition,
klingV2VRefDefinition,
klingV3MotionDefinition,
klingV3MotionStdDefinition,
fluxDefinition,
nanoBananaProDefinition,
nanoBanana2Definition,
Expand Down
Loading
Loading