From d827ef75e4c9d0685920c1e491ed0b3a0491efa1 Mon Sep 17 00:00:00 2001
From: SecurityQQ <alexmsecurity@gmail.com>
Date: Thu, 23 Apr 2026 20:44:28 -0700
Subject: [PATCH] feat: add video-to-video models (Kling O3 reference, 4K, v3
 motion control)

Add 6 new Kling video models, including ones that take a video as input:

- kling-v3-4k: native 4K image-to-video ($0.42/sec)
- kling-v3-ref: O3 Pro reference-to-video with character consistency ($0.112/sec audio off, $0.14/sec audio on)
- kling-v3-4k-ref: O3 4K reference-to-video ($0.42/sec)
- kling-v3-v2v-ref: O3 Standard video-to-video reference preserving motion/camera ($0.126/sec)
- kling-v3-motion: V3 Pro motion control - transfer motion to character (up to 30s, $0.168/sec)
- kling-v3-motion-standard: V3 Standard motion control ($0.126/sec)

Also:
- GenerateVideoPrompt now accepts singular or array for images/audio/video fields
- Video action routes video+image to motion control, video-only to v2v reference
- Added v2v/vid2vid resolver aliases
- Added motionControl() and videoToVideoReference() convenience methods to fal provider
---
 src/ai-sdk/generate-video.ts        |  22 +-
 src/ai-sdk/providers/fal.ts         | 126 ++++++++++-
 src/ai-sdk/providers/model-rules.ts |  10 +
 src/core/registry/resolver.ts       |   4 +
 src/definitions/actions/video.ts    |  28 ++-
 src/definitions/models/index.ts     |  26 ++-
 src/definitions/models/kling.ts     | 310 ++++++++++++++++++++++++++++
 src/providers/fal.ts                |  97 +++++++++
 src/react/renderers/progress.ts     |   6 +
 9 files changed, 613 insertions(+), 16 deletions(-)
diff --git a/src/ai-sdk/generate-video.ts b/src/ai-sdk/generate-video.ts
index f26efdb4..0d07bb79 100644
--- a/src/ai-sdk/generate-video.ts
+++ b/src/ai-sdk/generate-video.ts
@@ -10,9 +10,9 @@ export type GenerateVideoPrompt =
   | string
   | {
       text?: string;
-      images?: Array<DataContent>;
-      audio?: DataContent;
-      video?: DataContent;
+      images?: DataContent | Array<DataContent>;
+      audio?: DataContent | Array<DataContent>;
+      video?: DataContent | Array<DataContent>;
     };
 
 export interface GenerateVideoOptions {
@@ -76,6 +76,12 @@ function toUint8Array(data: DataContent): Uint8Array {
   return data;
 }
 
+/** Coerce an optional single value or list into a list; nullish becomes []. */
+function toArray<T>(value: T | T[] | undefined): T[] {
+  if (Array.isArray(value)) return value;
+  return value == null ? [] : [value];
+}
+
 function normalizePrompt(prompt: GenerateVideoPrompt): {
   prompt: string | undefined;
   files: ImageModelV3File[] | undefined;
@@ -86,7 +92,7 @@ function normalizePrompt(prompt: GenerateVideoPrompt): {
 
   const files: ImageModelV3File[] = [];
 
-  for (const img of prompt.images ?? []) {
+  for (const img of toArray(prompt.images)) {
     files.push({
       type: "file",
       mediaType: "image/png",
@@ -94,19 +100,19 @@ function normalizePrompt(prompt: GenerateVideoPrompt): {
     });
   }
 
-  if (prompt.audio) {
+  for (const aud of toArray(prompt.audio)) {
     files.push({
       type: "file",
       mediaType: "audio/mpeg",
-      data: toUint8Array(prompt.audio),
+      data: toUint8Array(aud),
     });
   }
 
-  if (prompt.video) {
+  for (const vid of toArray(prompt.video)) {
     files.push({
       type: "file",
       mediaType: "video/mp4",
-      data: toUint8Array(prompt.video),
+      data: toUint8Array(vid),
     });
   }
 
diff --git a/src/ai-sdk/providers/fal.ts b/src/ai-sdk/providers/fal.ts
index d58d6811..2206edef 100644
--- a/src/ai-sdk/providers/fal.ts
+++ b/src/ai-sdk/providers/fal.ts
@@ -107,6 +107,11 @@ const VIDEO_MODELS: Record<string, { t2v: string; i2v: string }> = {
     t2v: "fal-ai/kling-video/o3/standard/text-to-video",
     i2v: "fal-ai/kling-video/o3/standard/image-to-video",
   },
+  // Kling O3 4K - native 4K output (i2v only, t2v falls back to pro)
+  "kling-v3-4k": {
+    t2v: "fal-ai/kling-video/o3/pro/text-to-video",
+    i2v: "fal-ai/kling-video/o3/4k/image-to-video",
+  },
   // Kling v2.6 - with native audio generation
   "kling-v2.6": {
     t2v: "fal-ai/kling-video/v2.6/pro/text-to-video",
@@ -163,8 +168,21 @@ const VIDEO_EDIT_MODELS: Record<string, string> = {
   "sora-2-remix": "fal-ai/sora-2/video-to-video/remix",
 };
 
+// Reference-to-video models - images/elements + prompt → video with character consistency
+const REFERENCE_VIDEO_MODELS: Record<string, string> = {
+  "kling-v3-ref": "fal-ai/kling-video/o3/pro/reference-to-video",
+  "kling-v3-4k-ref": "fal-ai/kling-video/o3/4k/reference-to-video",
+};
+
+// Video-to-video reference models - reference video + prompt → new video preserving motion/camera
+const V2V_REFERENCE_MODELS: Record<string, string> = {
+  "kling-v3-v2v-ref": "fal-ai/kling-video/o3/standard/video-to-video/reference",
+};
+
 // Motion control models - video-to-video with motion transfer
 const MOTION_CONTROL_MODELS: Record<string, string> = {
+  "kling-v3-motion": "fal-ai/kling-video/v3/pro/motion-control",
+  "kling-v3-motion-standard": "fal-ai/kling-video/v3/standard/motion-control",
   "kling-v2.6-motion": "fal-ai/kling-video/v2.6/pro/motion-control",
   "kling-v2.6-motion-standard":
     "fal-ai/kling-video/v2.6/standard/motion-control",
@@ -520,8 +538,12 @@ class FalVideoModel implements VideoModelV3 {
     const isMotionControl = MOTION_CONTROL_MODELS[this.modelId] !== undefined;
     const isVideoEdit = VIDEO_EDIT_MODELS[this.modelId] !== undefined;
     const isVideoUpscale = VIDEO_UPSCALE_MODELS[this.modelId] !== undefined;
+    const isReferenceVideo = REFERENCE_VIDEO_MODELS[this.modelId] !== undefined;
+    const isV2VReference = V2V_REFERENCE_MODELS[this.modelId] !== undefined;
     const isKlingV3 =
-      this.modelId === "kling-v3" || this.modelId === "kling-v3-standard";
+      this.modelId === "kling-v3" ||
+      this.modelId === "kling-v3-standard" ||
+      this.modelId === "kling-v3-4k";
     const isKlingV26 = this.modelId === "kling-v2.6";
     const isLtx2 = this.modelId === "ltx-2-19b-distilled";
     const isGrokImagine = this.modelId === "grok-imagine";
@@ -537,7 +559,11 @@ class FalVideoModel implements VideoModelV3 {
           ? this.resolveVideoEditEndpoint()
           : isVideoUpscale
             ? this.resolveVideoUpscaleEndpoint()
-            : this.resolveEndpoint(hasImageInput ?? false);
+            : isReferenceVideo
+              ? this.resolveReferenceVideoEndpoint()
+              : isV2VReference
+                ? this.resolveV2VReferenceEndpoint()
+                : this.resolveEndpoint(hasImageInput ?? false);
 
     const input: Record<string, unknown> = {
       ...(providerOptions?.fal ?? {}),
@@ -625,6 +651,86 @@ class FalVideoModel implements VideoModelV3 {
       if (videoFile) {
         input.video_url = await fileToUrl(videoFile);
       }
+    } else if (isReferenceVideo) {
+      // Reference-to-video: prompt + optional start/end images + reference images
+      // Elements and multi_prompt are passed via providerOptions.fal
+      if (prompt) {
+        input.prompt = prompt;
+      }
+
+      if (files) {
+        const imageFiles = files.filter((f) =>
+          getMediaType(f)?.startsWith("image/"),
+        );
+        // First image → start_image_url, second → end_image_url
+        if (imageFiles[0]) {
+          input.start_image_url = await fileToUrl(imageFiles[0]);
+        }
+        if (imageFiles[1]) {
+          input.end_image_url = await fileToUrl(imageFiles[1]);
+        }
+        // Additional images (3+) → image_urls for style/appearance reference
+        if (imageFiles.length > 2) {
+          const additionalUrls: string[] = [];
+          for (let i = 2; i < imageFiles.length; i++) {
+            additionalUrls.push(await fileToUrl(imageFiles[i]!));
+          }
+          input.image_urls = additionalUrls;
+        }
+      }
+
+      // Duration as string integer for Kling O3
+      const normalized = normalizeProviderInput(this.modelId, { duration });
+      input.duration = normalized.duration;
+
+      if (!input.aspect_ratio) {
+        input.aspect_ratio = aspectRatio ?? "16:9";
+      }
+
+      // Default to generating audio
+      if (input.generate_audio === undefined) {
+        input.generate_audio = true;
+      }
+    } else if (isV2VReference) {
+      // Video-to-video reference: reference video + prompt → new video preserving motion/camera
+      // Elements and image_urls are passed via providerOptions.fal
+      if (prompt) {
+        input.prompt = prompt;
+      }
+
+      const videoFile = files?.find((f) =>
+        getMediaType(f)?.startsWith("video/"),
+      );
+      if (videoFile) {
+        input.video_url = await fileToUrl(videoFile);
+      }
+
+      // Reference images from file inputs (for style/appearance)
+      if (files) {
+        const imageFiles = files.filter((f) =>
+          getMediaType(f)?.startsWith("image/"),
+        );
+        if (imageFiles.length > 0) {
+          const imageUrls: string[] = [];
+          for (const imgFile of imageFiles) {
+            imageUrls.push(await fileToUrl(imgFile));
+          }
+          input.image_urls = imageUrls;
+        }
+      }
+
+      // Duration as string integer for Kling O3
+      const normalized = normalizeProviderInput(this.modelId, { duration });
+      input.duration = normalized.duration;
+
+      if (!input.aspect_ratio) {
+        input.aspect_ratio = aspectRatio ?? "auto";
+      }
+
+      // Default to keeping original audio from reference video
+      if (input.keep_audio === undefined) {
+        input.keep_audio = true;
+      }
     } else {
       // Standard video generation
       input.prompt = prompt;
@@ -825,6 +931,22 @@ class FalVideoModel implements VideoModelV3 {
 
     return VIDEO_UPSCALE_MODELS[this.modelId] ?? this.modelId;
   }
+
+  private resolveReferenceVideoEndpoint(): string {
+    // "raw:<endpoint>" ids skip the alias table; the prefix is stripped.
+    const id = this.modelId;
+    const rawPrefix = "raw:";
+    if (id.startsWith(rawPrefix)) return id.slice(rawPrefix.length);
+    return REFERENCE_VIDEO_MODELS[id] ?? id;
+  }
+
+  private resolveV2VReferenceEndpoint(): string {
+    // "raw:<endpoint>" ids skip the alias table; the prefix is stripped.
+    const id = this.modelId;
+    const rawPrefix = "raw:";
+    if (id.startsWith(rawPrefix)) return id.slice(rawPrefix.length);
+    return V2V_REFERENCE_MODELS[id] ?? id;
+  }
 }
 
 class FalImageModel implements ImageModelV3 {
diff --git a/src/ai-sdk/providers/model-rules.ts b/src/ai-sdk/providers/model-rules.ts
index 21e3f5ff..7f049cc5 100644
--- a/src/ai-sdk/providers/model-rules.ts
+++ b/src/ai-sdk/providers/model-rules.ts
@@ -70,6 +70,16 @@ const ModelDurationRules: Record<string, z.ZodType> = {
   "kling-v3": z.object({ duration: stringIntDuration(3, 15, 5) }),
   "kling-v3-standard": z.object({ duration: stringIntDuration(3, 15, 5) }),
 
+  // Kling O3 4K: same rules as v3
+  "kling-v3-4k": z.object({ duration: stringIntDuration(3, 15, 5) }),
+
+  // Kling O3 reference-to-video: same duration range
+  "kling-v3-ref": z.object({ duration: stringIntDuration(3, 15, 5) }),
+  "kling-v3-4k-ref": z.object({ duration: stringIntDuration(3, 15, 5) }),
+
+  // Kling O3 video-to-video reference: same duration range
+  "kling-v3-v2v-ref": z.object({ duration: stringIntDuration(3, 15, 5) }),
+
   // Kling v2.6: same rules as v3
   "kling-v2.6": z.object({ duration: stringIntDuration(3, 15, 5) }),
 
diff --git a/src/core/registry/resolver.ts b/src/core/registry/resolver.ts
index d57db3fa..cc94ea07 100644
--- a/src/core/registry/resolver.ts
+++ b/src/core/registry/resolver.ts
@@ -40,6 +40,10 @@ const ALIASES: Record<string, string> = {
   stt: "speech-to-text",
   voice: "text-to-speech",
 
+  // Video-to-video
+  v2v: "video-to-video",
+  vid2vid: "video-to-video",
+
   // Video editing
   concat: "merge",
   join: "merge",
diff --git a/src/definitions/actions/video.ts b/src/definitions/actions/video.ts
index d34e0db0..0b73bbb0 100644
--- a/src/definitions/actions/video.ts
+++ b/src/definitions/actions/video.ts
@@ -19,6 +19,9 @@ const videoInputSchema = z.object({
   image: filePathSchema
     .optional()
     .describe("Input image (enables image-to-video)"),
+  video: filePathSchema
+    .optional()
+    .describe("Input video for video-to-video (preserves motion/camera style)"),
   duration: videoDurationSchema
     .default(5)
     .describe("Video duration in seconds"),
@@ -42,7 +45,7 @@ const schema: ZodSchema<typeof videoInputSchema, typeof videoOutputSchema> = {
 export const definition: ActionDefinition<typeof schema> = {
   type: "action",
   name: "video",
-  description: "Generate video from text or image",
+  description: "Generate video from text, image, or video",
   schema,
   routes: [
     {
@@ -51,12 +54,29 @@ export const definition: ActionDefinition<typeof schema> = {
     },
   ],
   execute: async (inputs) => {
-    // inputs is now fully typed as VideoInput - no more `as` cast!
-    const { prompt, image, duration, aspectRatio } = inputs;
+    const { prompt, image, video, duration, aspectRatio } = inputs;
 
     let result: { data?: { video?: { url?: string }; duration?: number } };
 
-    if (image) {
+    if (video && image) {
+      // Video + image → motion control (transfer motion to character)
+      console.log(
+        "[action/video] generating motion control video (image + video)",
+      );
+      result = await falProvider.motionControl({
+        prompt,
+        imageUrl: image,
+        videoUrl: video,
+      });
+    } else if (video) {
+      // Video only → video-to-video reference (preserve motion/camera)
+      console.log("[action/video] generating video-to-video reference");
+      result = await falProvider.videoToVideoReference({
+        prompt,
+        videoUrl: video,
+        duration,
+      });
+    } else if (image) {
       console.log("[action/video] generating video from image");
       result = await falProvider.imageToVideo({
         prompt,
diff --git a/src/definitions/models/index.ts b/src/definitions/models/index.ts
index 182fd6ad..dd9c870a 100644
--- a/src/definitions/models/index.ts
+++ b/src/definitions/models/index.ts
@@ -11,7 +11,15 @@ import type {
 export { definition as elevenlabsTts } from "./elevenlabs";
 export { definition as flux } from "./flux";
 export { definition as heygenAvatar } from "./heygen";
-export { definition as kling } from "./kling";
+export {
+  definition as kling,
+  kling4kDefinition as kling4k,
+  kling4kRefDefinition as kling4kRef,
+  klingRefDefinition as klingRef,
+  klingV2VRefDefinition as klingV2VRef,
+  klingV3MotionDefinition as klingV3Motion,
+  klingV3MotionStdDefinition as klingV3MotionStd,
+} from "./kling";
 export { definition as llama } from "./llama";
 export { definition as ltxA2v } from "./ltx-a2v";
 export { definition as nanoBanana2 } from "./nano-banana-2";
@@ -39,7 +47,15 @@ export { definition as whisper } from "./whisper";
 import { definition as elevenlabsDefinition } from "./elevenlabs";
 import { definition as fluxDefinition } from "./flux";
 import { definition as heygenAvatarDefinition } from "./heygen";
-import { definition as klingDefinition } from "./kling";
+import {
+  kling4kDefinition,
+  kling4kRefDefinition,
+  definition as klingDefinition,
+  klingRefDefinition,
+  klingV2VRefDefinition,
+  klingV3MotionDefinition,
+  klingV3MotionStdDefinition,
+} from "./kling";
 import { definition as llamaDefinition } from "./llama";
 import { definition as ltxA2vDefinition } from "./ltx-a2v";
 import { definition as nanoBanana2Definition } from "./nano-banana-2";
@@ -65,6 +81,12 @@ import { definition as whisperDefinition } from "./whisper";
 
 export const allModels = [
   klingDefinition,
+  kling4kDefinition,
+  klingRefDefinition,
+  kling4kRefDefinition,
+  klingV2VRefDefinition,
+  klingV3MotionDefinition,
+  klingV3MotionStdDefinition,
   fluxDefinition,
   nanoBananaProDefinition,
   nanoBanana2Definition,
diff --git a/src/definitions/models/kling.ts b/src/definitions/models/kling.ts
index c8154821..d2facd47 100644
--- a/src/definitions/models/kling.ts
+++ b/src/definitions/models/kling.ts
@@ -68,4 +68,314 @@ export const definition: ModelDefinition<typeof schema> = {
   },
 };
 
+// ---------------------------------------------------------------------------
+// Kling O3 4K — native 4K output (image-to-video)
+// ---------------------------------------------------------------------------
+
+const kling4kInputSchema = z.object({
+  prompt: z.string().describe("Text description guiding the video generation"),
+  image_url: z.string().url().describe("URL of the start frame image"),
+  end_image_url: z
+    .string()
+    .url()
+    .optional()
+    .describe("URL of the end frame image (optional)"),
+  duration: z
+    .number()
+    .int()
+    .min(3)
+    .max(15)
+    .default(5)
+    .describe("Video duration in seconds (3–15)"),
+  generate_audio: z
+    .boolean()
+    .default(false)
+    .describe("Whether to generate native audio"),
+});
+
+const kling4kSchema: ZodSchema<
+  typeof kling4kInputSchema,
+  typeof klingOutputSchema
+> = {
+  input: kling4kInputSchema,
+  output: klingOutputSchema,
+};
+
+export const kling4kDefinition: ModelDefinition<typeof kling4kSchema> = {
+  type: "model",
+  name: "kling-v3-4k",
+  description:
+    "Kling O3 4K — native 4K video output from image, no upscaling needed",
+  providers: ["fal"],
+  defaultProvider: "fal",
+  providerModels: {
+    fal: "fal-ai/kling-video/o3/4k/image-to-video",
+  },
+  schema: kling4kSchema,
+  pricing: {
+    fal: {
+      description: "Kling O3 4K: $0.42/sec regardless of audio",
+      calculate: ({ duration = 5 }) => 0.42 * duration,
+      minUsd: 1.26, // 3s × $0.42/sec (schema minimum duration)
+      maxUsd: 6.3, // 15s × $0.42/sec (schema maximum duration)
+    },
+  },
+};
+
+// ---------------------------------------------------------------------------
+// Kling O3 Reference-to-Video (Pro) — images/elements → video with consistency
+// ---------------------------------------------------------------------------
+
+const klingRefInputSchema = z.object({
+  prompt: z
+    .string()
+    .optional()
+    .describe(
+      "Text prompt for video generation. Reference elements as @Element1, @Element2",
+    ),
+  start_image_url: z
+    .string()
+    .url()
+    .optional()
+    .describe("Image to use as the first frame"),
+  end_image_url: z
+    .string()
+    .url()
+    .optional()
+    .describe("Image to use as the last frame"),
+  image_urls: z
+    .array(z.string().url())
+    .max(7)
+    .optional()
+    .describe(
+      "Reference images for style/appearance. Reference in prompt as @Image1, @Image2. Max 7 total (elements + images)",
+    ),
+  duration: z
+    .number()
+    .int()
+    .min(3)
+    .max(15)
+    .default(5)
+    .describe("Video duration in seconds (3–15)"),
+  aspect_ratio: aspectRatioSchema
+    .default("16:9")
+    .describe("Output aspect ratio"),
+  generate_audio: z
+    .boolean()
+    .default(false)
+    .describe("Whether to generate native audio"),
+});
+
+const klingRefSchema: ZodSchema<
+  typeof klingRefInputSchema,
+  typeof klingOutputSchema
+> = {
+  input: klingRefInputSchema,
+  output: klingOutputSchema,
+};
+
+export const klingRefDefinition: ModelDefinition<typeof klingRefSchema> = {
+  type: "model",
+  name: "kling-v3-ref",
+  description:
+    "Kling O3 Pro reference-to-video — generate video with character/object consistency from reference images and elements",
+  providers: ["fal"],
+  defaultProvider: "fal",
+  providerModels: {
+    fal: "fal-ai/kling-video/o3/pro/reference-to-video",
+  },
+  schema: klingRefSchema,
+  pricing: {
+    fal: {
+      description:
+        "Kling O3 Pro ref: $0.112/sec (audio off), $0.14/sec (audio on)",
+      calculate: ({ duration = 5, generateAudio = false }) => {
+        const rate = generateAudio ? 0.14 : 0.112;
+        return rate * duration;
+      },
+      minUsd: 0.336, // 3s × $0.112/sec (shortest clip, audio off)
+      maxUsd: 2.1, // 15s × $0.14/sec (longest clip, audio on)
+    },
+  },
+};
+
+// ---------------------------------------------------------------------------
+// Kling O3 4K Reference-to-Video — 4K reference-to-video
+// ---------------------------------------------------------------------------
+
+export const kling4kRefDefinition: ModelDefinition<typeof klingRefSchema> = {
+  type: "model",
+  name: "kling-v3-4k-ref",
+  description:
+    "Kling O3 4K reference-to-video — native 4K video with character/object consistency from reference images and elements",
+  providers: ["fal"],
+  defaultProvider: "fal",
+  providerModels: {
+    fal: "fal-ai/kling-video/o3/4k/reference-to-video",
+  },
+  schema: klingRefSchema,
+  pricing: {
+    fal: {
+      description: "Kling O3 4K ref: $0.42/sec regardless of audio",
+      calculate: ({ duration = 5 }) => 0.42 * duration,
+      minUsd: 1.26, // 3s × $0.42/sec (schema minimum duration)
+      maxUsd: 6.3, // 15s × $0.42/sec (schema maximum duration)
+    },
+  },
+};
+
+// ---------------------------------------------------------------------------
+// Kling O3 Video-to-Video Reference (Standard) — v2v preserving motion/camera
+// ---------------------------------------------------------------------------
+
+const klingV2VRefInputSchema = z.object({
+  prompt: z
+    .string()
+    .describe(
+      "Text prompt for video generation. Reference video as @Video1, elements as @Element1",
+    ),
+  video_url: z
+    .string()
+    .url()
+    .describe(
+      "Reference video URL. Only .mp4/.mov, 3–10s duration, 720–2160px resolution, max 200MB",
+    ),
+  image_urls: z
+    .array(z.string().url())
+    .max(4)
+    .optional()
+    .describe(
+      "Reference images for style/appearance. Max 4 total (elements + images) when using video",
+    ),
+  keep_audio: z
+    .boolean()
+    .default(true)
+    .describe("Whether to keep the original audio from the reference video"),
+  duration: z
+    .number()
+    .int()
+    .min(3)
+    .max(15)
+    .optional()
+    .describe("Video duration in seconds (3–15)"),
+  aspect_ratio: z
+    .enum(["auto", "16:9", "9:16", "1:1"])
+    .default("auto")
+    .describe("Output aspect ratio"),
+});
+
+const klingV2VRefSchema: ZodSchema<
+  typeof klingV2VRefInputSchema,
+  typeof klingOutputSchema
+> = {
+  input: klingV2VRefInputSchema,
+  output: klingOutputSchema,
+};
+
+export const klingV2VRefDefinition: ModelDefinition<typeof klingV2VRefSchema> =
+  {
+    type: "model",
+    name: "kling-v3-v2v-ref",
+    description:
+      "Kling O3 video-to-video reference — generate new video guided by reference video, preserving motion and camera style",
+    providers: ["fal"],
+    defaultProvider: "fal",
+    providerModels: {
+      fal: "fal-ai/kling-video/o3/standard/video-to-video/reference",
+    },
+    schema: klingV2VRefSchema,
+    pricing: {
+      fal: {
+        description: "Kling O3 Standard v2v ref: $0.126/sec",
+        calculate: ({ duration = 5 }) => 0.126 * duration,
+        minUsd: 0.378, // 3s × $0.126/sec (schema minimum duration)
+        maxUsd: 1.89, // 15s × $0.126/sec (schema maximum duration)
+      },
+    },
+  };
+
+// ---------------------------------------------------------------------------
+// Kling V3 Motion Control — transfer motion from reference video to character image
+// ---------------------------------------------------------------------------
+
+const klingV3MotionInputSchema = z.object({
+  prompt: z.string().optional().describe("Text prompt for video generation"),
+  image_url: z
+    .string()
+    .url()
+    .describe(
+      "Reference image URL. Characters should have clear body proportions, avoid occlusion, and occupy >5% of image area",
+    ),
+  video_url: z
+    .string()
+    .url()
+    .describe(
+      "Reference video URL for motion transfer. Max 10s with character_orientation 'image', max 30s with 'video'",
+    ),
+  character_orientation: z
+    .enum(["image", "video"])
+    .default("video")
+    .describe(
+      "'video': orientation matches reference video, better for complex motions (max 30s). 'image': orientation matches reference image, better for camera movements (max 10s)",
+    ),
+  keep_original_sound: z
+    .boolean()
+    .default(true)
+    .describe("Whether to keep the original sound from the reference video"),
+});
+
+const klingV3MotionSchema: ZodSchema<
+  typeof klingV3MotionInputSchema,
+  typeof klingOutputSchema
+> = {
+  input: klingV3MotionInputSchema,
+  output: klingOutputSchema,
+};
+
+export const klingV3MotionDefinition: ModelDefinition<
+  typeof klingV3MotionSchema
+> = {
+  type: "model",
+  name: "kling-v3-motion",
+  description:
+    "Kling V3 Pro motion control — transfer movements from reference video to any character image. Supports up to 30s with video orientation",
+  providers: ["fal"],
+  defaultProvider: "fal",
+  providerModels: {
+    fal: "fal-ai/kling-video/v3/pro/motion-control",
+  },
+  schema: klingV3MotionSchema,
+  pricing: {
+    fal: {
+      description: "Kling V3 Pro motion control: $0.168/sec",
+      calculate: ({ inputDuration = 5 }) => 0.168 * inputDuration, // NOTE(review): priced on inputDuration, presumably the reference video's length — confirm
+      minUsd: 0.504, // 3s × $0.168/sec
+      maxUsd: 5.04, // 30s × $0.168/sec
+    },
+  },
+};
+
+export const klingV3MotionStdDefinition: ModelDefinition<
+  typeof klingV3MotionSchema
+> = {
+  type: "model",
+  name: "kling-v3-motion-standard",
+  description:
+    "Kling V3 Standard motion control — cost-effective motion transfer from reference video to character image. Supports up to 30s with video orientation",
+  providers: ["fal"],
+  defaultProvider: "fal",
+  providerModels: {
+    fal: "fal-ai/kling-video/v3/standard/motion-control",
+  },
+  schema: klingV3MotionSchema,
+  pricing: {
+    fal: {
+      description: "Kling V3 Standard motion control: $0.126/sec",
+      calculate: ({ inputDuration = 5 }) => 0.126 * inputDuration, // NOTE(review): priced on inputDuration, presumably the reference video's length — confirm
+      minUsd: 0.378, // 3s × $0.126/sec
+      maxUsd: 3.78, // 30s × $0.126/sec
+    },
+  },
+};
+
 export default definition;
diff --git a/src/providers/fal.ts b/src/providers/fal.ts
index a1a99286..79c824e0 100644
--- a/src/providers/fal.ts
+++ b/src/providers/fal.ts
@@ -632,6 +632,97 @@ export class FalProvider extends BaseProvider {
     return result;
   }
 
+  // ============================================================================
+  // Video-to-Video
+  // ============================================================================
+
+  /**
+   * Motion control: drives the character in an image with the motion of a
+   * reference video, using the Kling V3 Pro motion-control endpoint on fal.
+   */
+  async motionControl(args: {
+    prompt: string;
+    imageUrl: string;
+    videoUrl: string;
+    characterOrientation?: "image" | "video";
+    keepOriginalSound?: boolean;
+  }) {
+    const endpoint = "fal-ai/kling-video/v3/pro/motion-control";
+    const { prompt, characterOrientation, keepOriginalSound } = args;
+
+    console.log(`[fal] starting motion control: ${endpoint}`);
+    console.log(`[fal] prompt: ${prompt}`);
+
+    // ensureUrl runs for the image first, then the video (sequential awaits).
+    const characterImage = await ensureUrl(args.imageUrl, (data) =>
+      this.uploadFile(data),
+    );
+    const motionSource = await ensureUrl(args.videoUrl, (data) =>
+      this.uploadFile(data),
+    );
+
+    const output = await fal.subscribe(endpoint, {
+      input: {
+        prompt,
+        image_url: characterImage,
+        video_url: motionSource,
+        character_orientation: characterOrientation ?? "video",
+        keep_original_sound: keepOriginalSound ?? true,
+      },
+      logs: true,
+      onQueueUpdate: (update) => {
+        if (update.status !== "IN_PROGRESS") return;
+        const text = update.logs?.map((entry) => entry.message).join(" ");
+        console.log(`[fal] ${text || "processing..."}`);
+      },
+    });
+
+    console.log("[fal] completed!");
+    return output;
+  }
+
+  /**
+   * Video-to-video reference (Kling O3 Standard on fal): generates a new video
+   * from a prompt plus a reference video whose motion/camera style it follows.
+   */
+  async videoToVideoReference(args: {
+    prompt: string;
+    videoUrl: string;
+    duration?: number;
+    aspectRatio?: "auto" | "16:9" | "9:16" | "1:1";
+    keepAudio?: boolean;
+  }) {
+    const endpoint = "fal-ai/kling-video/o3/standard/video-to-video/reference";
+    const { prompt, duration, aspectRatio, keepAudio } = args;
+    console.log(`[fal] starting v2v reference: ${endpoint}`);
+    console.log(`[fal] prompt: ${prompt}`);
+
+    const referenceVideo = await ensureUrl(args.videoUrl, (data) =>
+      this.uploadFile(data),
+    );
+    // A falsy duration (e.g. undefined or 0) becomes undefined, not a string.
+    const durationParam = duration ? String(duration) : undefined;
+
+    const output = await fal.subscribe(endpoint, {
+      input: {
+        prompt,
+        video_url: referenceVideo,
+        duration: durationParam,
+        aspect_ratio: aspectRatio ?? "auto",
+        keep_audio: keepAudio ?? true,
+      },
+      logs: true,
+      onQueueUpdate: (update) => {
+        if (update.status !== "IN_PROGRESS") return;
+        const text = update.logs?.map((entry) => entry.message).join(" ");
+        console.log(`[fal] ${text || "processing..."}`);
+      },
+    });
+
+    console.log("[fal] completed!");
+    return output;
+  }
+
   /**
    * Edit video using Grok Imagine Video
    * Video will be resized to max 854x480 and truncated to 8 seconds
@@ -761,3 +852,9 @@ export const ltx2AudioToVideo = (
 ) => falProvider.ltx2AudioToVideo(args);
 export const textToMusic = (args: Parameters<FalProvider["textToMusic"]>[0]) =>
   falProvider.textToMusic(args);
+export const motionControl = (
+  args: Parameters<FalProvider["motionControl"]>[0],
+) => falProvider.motionControl(args);
+export const videoToVideoReference = (
+  args: Parameters<FalProvider["videoToVideoReference"]>[0],
+) => falProvider.videoToVideoReference(args);
diff --git a/src/react/renderers/progress.ts b/src/react/renderers/progress.ts
index be4a0ac4..60af7e4d 100644
--- a/src/react/renderers/progress.ts
+++ b/src/react/renderers/progress.ts
@@ -31,6 +31,12 @@ export const MODEL_TIME_ESTIMATES: Record<string, number> = {
   "kling-v2": 180,
   "kling-v2.5": 180,
   "kling-v2.6": 180,
+  "kling-v3-4k": 240,
+  "kling-v3-ref": 180,
+  "kling-v3-4k-ref": 240,
+  "kling-v3-v2v-ref": 180,
+  "kling-v3-motion": 300,
+  "kling-v3-motion-standard": 240,
   "kling-v2.6-motion": 240,
   "kling-v2.6-motion-standard": 180,
   minimax: 90,