Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 14 additions & 8 deletions src/ai-sdk/generate-video.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ export type GenerateVideoPrompt =
| string
| {
text?: string;
images?: Array<DataContent>;
audio?: DataContent;
video?: DataContent;
images?: DataContent | Array<DataContent>;
audio?: DataContent | Array<DataContent>;
video?: DataContent | Array<DataContent>;
};

export interface GenerateVideoOptions {
Expand Down Expand Up @@ -76,6 +76,12 @@ function toUint8Array(data: DataContent): Uint8Array {
return data;
}

/**
 * Normalize a value that may be singular, an array, or absent into an array.
 *
 * The body already guarded with `== null` (which matches both `null` and
 * `undefined`), so the signature now admits `null` as well — this widens the
 * accepted input without changing behavior for any existing caller.
 *
 * @param value - A single item, an array of items, or `null`/`undefined`.
 * @returns `value` unchanged if it is already an array, `[value]` for a
 *   single item, or `[]` when the value is absent.
 */
function toArray<T>(value: T | T[] | null | undefined): T[] {
  if (value == null) return [];
  return Array.isArray(value) ? value : [value];
}

function normalizePrompt(prompt: GenerateVideoPrompt): {
prompt: string | undefined;
files: ImageModelV3File[] | undefined;
Expand All @@ -86,27 +92,27 @@ function normalizePrompt(prompt: GenerateVideoPrompt): {

const files: ImageModelV3File[] = [];

for (const img of prompt.images ?? []) {
for (const img of toArray(prompt.images)) {
files.push({
type: "file",
mediaType: "image/png",
data: toUint8Array(img),
});
}

if (prompt.audio) {
for (const aud of toArray(prompt.audio)) {
files.push({
type: "file",
mediaType: "audio/mpeg",
data: toUint8Array(prompt.audio),
data: toUint8Array(aud),
});
}

if (prompt.video) {
for (const vid of toArray(prompt.video)) {
files.push({
type: "file",
mediaType: "video/mp4",
data: toUint8Array(prompt.video),
data: toUint8Array(vid),
});
}

Expand Down
126 changes: 124 additions & 2 deletions src/ai-sdk/providers/fal.ts
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ const VIDEO_MODELS: Record<string, { t2v: string; i2v: string }> = {
t2v: "fal-ai/kling-video/o3/standard/text-to-video",
i2v: "fal-ai/kling-video/o3/standard/image-to-video",
},
// Kling O3 4K - native 4K output (i2v only, t2v falls back to pro)
"kling-v3-4k": {
t2v: "fal-ai/kling-video/o3/pro/text-to-video",
i2v: "fal-ai/kling-video/o3/4k/image-to-video",
},
// Kling v2.6 - with native audio generation
"kling-v2.6": {
t2v: "fal-ai/kling-video/v2.6/pro/text-to-video",
Expand Down Expand Up @@ -163,8 +168,21 @@ const VIDEO_EDIT_MODELS: Record<string, string> = {
"sora-2-remix": "fal-ai/sora-2/video-to-video/remix",
};

// Reference-to-video models - images/elements + prompt → video with character consistency
const REFERENCE_VIDEO_MODELS: Record<string, string> = {
"kling-v3-ref": "fal-ai/kling-video/o3/pro/reference-to-video",
"kling-v3-4k-ref": "fal-ai/kling-video/o3/4k/reference-to-video",
};

// Video-to-video reference models - reference video + prompt → new video preserving motion/camera
const V2V_REFERENCE_MODELS: Record<string, string> = {
"kling-v3-v2v-ref": "fal-ai/kling-video/o3/standard/video-to-video/reference",
};

// Motion control models - video-to-video with motion transfer
const MOTION_CONTROL_MODELS: Record<string, string> = {
"kling-v3-motion": "fal-ai/kling-video/v3/pro/motion-control",
"kling-v3-motion-standard": "fal-ai/kling-video/v3/standard/motion-control",
"kling-v2.6-motion": "fal-ai/kling-video/v2.6/pro/motion-control",
"kling-v2.6-motion-standard":
"fal-ai/kling-video/v2.6/standard/motion-control",
Expand Down Expand Up @@ -520,8 +538,12 @@ class FalVideoModel implements VideoModelV3 {
const isMotionControl = MOTION_CONTROL_MODELS[this.modelId] !== undefined;
const isVideoEdit = VIDEO_EDIT_MODELS[this.modelId] !== undefined;
const isVideoUpscale = VIDEO_UPSCALE_MODELS[this.modelId] !== undefined;
const isReferenceVideo = REFERENCE_VIDEO_MODELS[this.modelId] !== undefined;
const isV2VReference = V2V_REFERENCE_MODELS[this.modelId] !== undefined;
const isKlingV3 =
this.modelId === "kling-v3" || this.modelId === "kling-v3-standard";
this.modelId === "kling-v3" ||
this.modelId === "kling-v3-standard" ||
this.modelId === "kling-v3-4k";
const isKlingV26 = this.modelId === "kling-v2.6";
const isLtx2 = this.modelId === "ltx-2-19b-distilled";
const isGrokImagine = this.modelId === "grok-imagine";
Expand All @@ -537,7 +559,11 @@ class FalVideoModel implements VideoModelV3 {
? this.resolveVideoEditEndpoint()
: isVideoUpscale
? this.resolveVideoUpscaleEndpoint()
: this.resolveEndpoint(hasImageInput ?? false);
: isReferenceVideo
? this.resolveReferenceVideoEndpoint()
: isV2VReference
? this.resolveV2VReferenceEndpoint()
: this.resolveEndpoint(hasImageInput ?? false);

const input: Record<string, unknown> = {
...(providerOptions?.fal ?? {}),
Expand Down Expand Up @@ -625,6 +651,86 @@ class FalVideoModel implements VideoModelV3 {
if (videoFile) {
input.video_url = await fileToUrl(videoFile);
}
} else if (isReferenceVideo) {
// Reference-to-video: prompt + optional start/end images + reference images
// Elements and multi_prompt are passed via providerOptions.fal
if (prompt) {
input.prompt = prompt;
}

if (files) {
const imageFiles = files.filter((f) =>
getMediaType(f)?.startsWith("image/"),
);
// First image → start_image_url, second → end_image_url
if (imageFiles[0]) {
input.start_image_url = await fileToUrl(imageFiles[0]);
}
if (imageFiles[1]) {
input.end_image_url = await fileToUrl(imageFiles[1]);
}
// Additional images (3+) → image_urls for style/appearance reference
if (imageFiles.length > 2) {
const additionalUrls: string[] = [];
for (let i = 2; i < imageFiles.length; i++) {
additionalUrls.push(await fileToUrl(imageFiles[i]!));
}
input.image_urls = additionalUrls;
}
}

// Duration as string integer for Kling O3
const normalized = normalizeProviderInput(this.modelId, { duration });
input.duration = normalized.duration;

if (!input.aspect_ratio) {
input.aspect_ratio = aspectRatio ?? "16:9";
}

// Default to generating audio
if (input.generate_audio === undefined) {
input.generate_audio = true;
}
} else if (isV2VReference) {
// Video-to-video reference: reference video + prompt → new video preserving motion/camera
// Elements and image_urls are passed via providerOptions.fal
if (prompt) {
input.prompt = prompt;
}

const videoFile = files?.find((f) =>
getMediaType(f)?.startsWith("video/"),
);
if (videoFile) {
input.video_url = await fileToUrl(videoFile);
}

// Reference images from file inputs (for style/appearance)
if (files) {
const imageFiles = files.filter((f) =>
getMediaType(f)?.startsWith("image/"),
);
if (imageFiles.length > 0) {
const imageUrls: string[] = [];
for (const imgFile of imageFiles) {
imageUrls.push(await fileToUrl(imgFile));
}
input.image_urls = imageUrls;
}
}

// Duration as string integer for Kling O3
const normalized = normalizeProviderInput(this.modelId, { duration });
input.duration = normalized.duration;

if (!input.aspect_ratio) {
input.aspect_ratio = aspectRatio ?? "auto";
}

// Default to keeping original audio from reference video
if (input.keep_audio === undefined) {
input.keep_audio = true;
}
} else {
// Standard video generation
input.prompt = prompt;
Expand Down Expand Up @@ -825,6 +931,22 @@ class FalVideoModel implements VideoModelV3 {

return VIDEO_UPSCALE_MODELS[this.modelId] ?? this.modelId;
}

private resolveReferenceVideoEndpoint(): string {
if (this.modelId.startsWith("raw:")) {
return this.modelId.slice(4);
}

return REFERENCE_VIDEO_MODELS[this.modelId] ?? this.modelId;
}

private resolveV2VReferenceEndpoint(): string {
if (this.modelId.startsWith("raw:")) {
return this.modelId.slice(4);
}

return V2V_REFERENCE_MODELS[this.modelId] ?? this.modelId;
}
}

class FalImageModel implements ImageModelV3 {
Expand Down
10 changes: 10 additions & 0 deletions src/ai-sdk/providers/model-rules.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ const ModelDurationRules: Record<string, z.ZodType> = {
"kling-v3": z.object({ duration: stringIntDuration(3, 15, 5) }),
"kling-v3-standard": z.object({ duration: stringIntDuration(3, 15, 5) }),

// Kling O3 4K: same rules as v3
"kling-v3-4k": z.object({ duration: stringIntDuration(3, 15, 5) }),

// Kling O3 reference-to-video: same duration range
"kling-v3-ref": z.object({ duration: stringIntDuration(3, 15, 5) }),
"kling-v3-4k-ref": z.object({ duration: stringIntDuration(3, 15, 5) }),

// Kling O3 video-to-video reference: same duration range
"kling-v3-v2v-ref": z.object({ duration: stringIntDuration(3, 15, 5) }),

// Kling v2.6: same rules as v3
"kling-v2.6": z.object({ duration: stringIntDuration(3, 15, 5) }),

Expand Down
4 changes: 4 additions & 0 deletions src/core/registry/resolver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ const ALIASES: Record<string, string> = {
stt: "speech-to-text",
voice: "text-to-speech",

// Video-to-video
v2v: "video-to-video",
vid2vid: "video-to-video",

// Video editing
concat: "merge",
join: "merge",
Expand Down
28 changes: 24 additions & 4 deletions src/definitions/actions/video.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ const videoInputSchema = z.object({
image: filePathSchema
.optional()
.describe("Input image (enables image-to-video)"),
video: filePathSchema
.optional()
.describe("Input video for video-to-video (preserves motion/camera style)"),
duration: videoDurationSchema
.default(5)
.describe("Video duration in seconds"),
Expand All @@ -42,7 +45,7 @@ const schema: ZodSchema<typeof videoInputSchema, typeof videoOutputSchema> = {
export const definition: ActionDefinition<typeof schema> = {
type: "action",
name: "video",
description: "Generate video from text or image",
description: "Generate video from text, image, or video",
schema,
routes: [
{
Expand All @@ -51,12 +54,29 @@ export const definition: ActionDefinition<typeof schema> = {
},
],
execute: async (inputs) => {
// inputs is now fully typed as VideoInput - no more `as` cast!
const { prompt, image, duration, aspectRatio } = inputs;
const { prompt, image, video, duration, aspectRatio } = inputs;

let result: { data?: { video?: { url?: string }; duration?: number } };

if (image) {
if (video && image) {
// Video + image → motion control (transfer motion to character)
console.log(
"[action/video] generating motion control video (image + video)",
);
result = await falProvider.motionControl({
prompt,
imageUrl: image,
videoUrl: video,
});
} else if (video) {
// Video only → video-to-video reference (preserve motion/camera)
console.log("[action/video] generating video-to-video reference");
result = await falProvider.videoToVideoReference({
prompt,
videoUrl: video,
duration,
});
} else if (image) {
console.log("[action/video] generating video from image");
result = await falProvider.imageToVideo({
prompt,
Expand Down
26 changes: 24 additions & 2 deletions src/definitions/models/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,15 @@ import type {
export { definition as elevenlabsTts } from "./elevenlabs";
export { definition as flux } from "./flux";
export { definition as heygenAvatar } from "./heygen";
export { definition as kling } from "./kling";
export {
definition as kling,
kling4kDefinition as kling4k,
kling4kRefDefinition as kling4kRef,
klingRefDefinition as klingRef,
klingV2VRefDefinition as klingV2VRef,
klingV3MotionDefinition as klingV3Motion,
klingV3MotionStdDefinition as klingV3MotionStd,
} from "./kling";
export { definition as llama } from "./llama";
export { definition as ltxA2v } from "./ltx-a2v";
export { definition as nanoBanana2 } from "./nano-banana-2";
Expand Down Expand Up @@ -39,7 +47,15 @@ export { definition as whisper } from "./whisper";
import { definition as elevenlabsDefinition } from "./elevenlabs";
import { definition as fluxDefinition } from "./flux";
import { definition as heygenAvatarDefinition } from "./heygen";
import { definition as klingDefinition } from "./kling";
import {
kling4kDefinition,
kling4kRefDefinition,
definition as klingDefinition,
klingRefDefinition,
klingV2VRefDefinition,
klingV3MotionDefinition,
klingV3MotionStdDefinition,
} from "./kling";
import { definition as llamaDefinition } from "./llama";
import { definition as ltxA2vDefinition } from "./ltx-a2v";
import { definition as nanoBanana2Definition } from "./nano-banana-2";
Expand All @@ -65,6 +81,12 @@ import { definition as whisperDefinition } from "./whisper";

export const allModels = [
klingDefinition,
kling4kDefinition,
klingRefDefinition,
kling4kRefDefinition,
klingV2VRefDefinition,
klingV3MotionDefinition,
klingV3MotionStdDefinition,
fluxDefinition,
nanoBananaProDefinition,
nanoBanana2Definition,
Expand Down
Loading
Loading