diff --git a/src/react/elements.ts b/src/react/elements.ts
index e3ff30b3..c620ca7a 100644
--- a/src/react/elements.ts
+++ b/src/react/elements.ts
@@ -2,6 +2,7 @@ import {
   resolveImageElement,
   resolveMusicElement,
   resolveSpeechElement,
+  resolveTalkingHeadElement,
   resolveVideoElement,
 } from "./resolve";
 import type { ResolvedElement } from "./resolved-element";
@@ -141,11 +142,14 @@ export function Speech(
 
 export function TalkingHead(
   props: TalkingHeadProps,
-): VargElement<"talking-head"> {
-  return createElement(
+): VargElement<"talking-head"> & PromiseLike> { // NOTE(review): type arguments look stripped by extraction here and on `Record` below — confirm against the repository
+  const element = createElement(
     "talking-head",
     props as Record,
-    props.children,
+    undefined, // children are no longer forwarded (this patch also removes `children` from TalkingHeadProps)
+  );
+  return makeThenable(element, (el) => // makes the element awaitable: awaiting it runs resolveTalkingHeadElement
+    resolveTalkingHeadElement(el, el.props as unknown as TalkingHeadProps),
   );
 }
 
diff --git a/src/react/renderers/clip.ts b/src/react/renderers/clip.ts
index d52cc898..a4a5c4e5 100644
--- a/src/react/renderers/clip.ts
+++ b/src/react/renderers/clip.ts
@@ -26,6 +26,7 @@ import { renderSlider } from "./slider";
 import { renderSpeech } from "./speech";
 import { renderSubtitle } from "./subtitle";
 import { renderSwipe } from "./swipe";
+import { renderTalkingHead } from "./talking-head";
 import { renderTitle } from "./title";
 import { resolvePath } from "./utils";
 import { renderVideo } from "./video";
@@ -151,6 +152,27 @@ async function renderClipLayers(
         break;
       }
 
+      case "talking-head": { // render the element to a video file, then queue it as a video layer
+        pending.push({
+          type: "async",
+          promise: renderTalkingHead(
+            element as VargElement<"talking-head">,
+            ctx,
+          )
+            .then((file) => ctx.backend.resolvePath(file)) // turn the generated file into a path via the backend
+            .then(
+              (path) =>
+                ({
+                  type: "video",
+                  path,
+                  resizeMode: "cover", // NOTE(review): fixed layer settings; `position`/`size` from the element's props are not read here — confirm that is intended
+                  mixVolume: 1,
+                }) as VideoLayer,
+            ),
+        });
+        break;
+      }
+
       case "music": {
         const props = element.props as MusicProps;
         pending.push({
diff --git a/src/react/renderers/talking-head.test.ts b/src/react/renderers/talking-head.test.ts
new file mode 100644
index 00000000..2400d68e
--- /dev/null
+++ b/src/react/renderers/talking-head.test.ts
@@ -0,0 +1,362 @@
+import { describe, expect, test } from "bun:test";
+import { mkdtempSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import type { ImageModelV3, SpeechModelV3 } from "@ai-sdk/provider";
+import { withCache } from "../../ai-sdk/cache";
+import { File } from "../../ai-sdk/file";
+import { fileCache } from "../../ai-sdk/file-cache";
+import { localBackend } from "../../ai-sdk/providers/editly";
+import type { VideoModelV3 } from "../../ai-sdk/video-model";
+import { Image, type Speech, TalkingHead } from "../elements";
+import { ResolvedElement } from "../resolved-element";
+import { renderClip } from "./clip";
+import type { RenderContext } from "./context";
+import { renderTalkingHead } from "./talking-head";
+
+function makeTempDir(): string { // creates a fresh directory under the OS temp dir and returns its path
+  return mkdtempSync(join(tmpdir(), "varg-talking-head-test-"));
+}
+
+function cleanupTempDir(dir: string) { // recursively removes a directory created by makeTempDir
+  rmSync(dir, { recursive: true, force: true });
+}
+
+function createImageModel(): ImageModelV3 { // stub image model whose doGenerate returns fixed bytes
+  return {
+    specificationVersion: "v3",
+    provider: "test",
+    modelId: "test-image",
+    maxImagesPerCall: 1,
+    async doGenerate() {
+      return {
+        images: [new Uint8Array([0x89, 0x50, 0x4e, 0x47, 1, 2, 3])], // starts with the bytes 0x89 'P' 'N' 'G'
+        warnings: [],
+        response: {
+          timestamp: new Date(),
+          modelId: "test-image",
+          headers: undefined,
+        },
+      };
+    },
+  };
+}
+
+function createVideoModel(): VideoModelV3 { // stub video model whose doGenerate returns fixed bytes
+  return {
+    specificationVersion: "v3",
+    provider: "test",
+    modelId: "test-video",
+    maxVideosPerCall: 1,
+    async doGenerate() {
+      return {
+        videos: [new Uint8Array([0, 0, 0, 0x1c, 0x66, 0x74, 0x79, 0x70])], // bytes 4..7 spell "ftyp"
+        warnings: [],
+        response: {
+          timestamp: new Date(),
+          modelId: "test-video",
+          headers: undefined,
+        },
+      };
+    },
+  };
+}
+
+function createSpeechModel(): SpeechModelV3 { // stub speech model whose doGenerate returns fixed bytes
+  return {
+    specificationVersion: "v3",
+    provider: "test",
+    modelId: "test-speech",
+    async doGenerate() {
+      return {
+        audio: new Uint8Array([0xff, 0xfb, 0x90, 4, 5, 6]),
+        warnings: [],
+        response: {
+          timestamp: new Date(),
+          modelId: "test-speech",
+          headers: undefined,
+        },
+      };
+    },
+  };
+}
+
+type GenerateImageOptions = Parameters[0]; // NOTE(review): the type argument of `Parameters` looks stripped by extraction — confirm
+type GenerateVideoOptions = Parameters[0]; // NOTE(review): same as above
+
+function createContext( // builds a RenderContext whose generators are cached stubs that count their calls
+  cacheDir: string,
+  counters: { imageCalls: number; videoCalls: number; speechCalls: number }, // speechCalls is never incremented in this file
+): RenderContext {
+  const storage = fileCache({ dir: cacheDir });
+
+  const generateImage = withCache(
+    async (_opts: GenerateImageOptions) => {
+      counters.imageCalls += 1; // counts invocations of the wrapped stub itself
+      return {
+        images: [
+          {
+            uint8Array: new Uint8Array([0x89, 0x50, 0x4e, 0x47, 1, 2, 3]),
+            mimeType: "image/png",
+          },
+        ],
+        warnings: [],
+      };
+    },
+    { storage },
+  );
+
+  const generateVideo = withCache(
+    async (_opts: GenerateVideoOptions) => {
+      counters.videoCalls += 1; // counts invocations of the wrapped stub itself
+      const data = new Uint8Array([0, 0, 0, 0x1c, 0x66, 0x74, 0x79, 0x70]);
+      return {
+        video: { uint8Array: data, mimeType: "video/mp4" },
+        videos: [{ uint8Array: data, mimeType: "video/mp4" }],
+        warnings: [],
+      };
+    },
+    { storage },
+  );
+
+  return {
+    width: 1080,
+    height: 1920,
+    fps: 30,
+    cache: storage,
+    generateImage: generateImage as unknown as RenderContext["generateImage"],
+    generateVideo: generateVideo as unknown as RenderContext["generateVideo"],
+    tempFiles: [],
+    pendingFiles: new Map>(), // NOTE(review): the Map's type arguments look stripped by extraction — confirm
+    backend: localBackend,
+    generatedFiles: [],
+    defaults: {
+      image: createImageModel(),
+      video: createVideoModel(),
+      speech: createSpeechModel(),
+    },
+  };
+}
+
+/** Helper: create a mock resolved image element */
+function createResolvedImage(): ResolvedElement<"image"> {
+  const mockFile = File.fromGenerated({
+    uint8Array: new Uint8Array([0x89, 0x50, 0x4e, 0x47, 1, 2, 3]),
+    mediaType: "image/png",
+  });
+  return new ResolvedElement<"image">(
+    { type: "image", props: { prompt: "test" }, children: [] },
+    { file: mockFile, duration: 0 },
+  );
+}
+
+/** Helper: create a mock resolved speech element */
+function createResolvedSpeech(): ResolvedElement<"speech"> {
+  const mockFile = File.fromGenerated({
+    uint8Array: new Uint8Array([0xff, 0xfb, 0x90, 4, 5, 6]),
+    mediaType: "audio/mpeg",
+  });
+  return new ResolvedElement<"speech">(
+    { type: "speech", props: { voice: "rachel" }, children: ["Hello world!"] },
+    { file: mockFile, duration: 3 },
+  );
+}
+
+describe("TalkingHead element", () => { // element construction and the clip renderer's handling of a resolved element
+  test("creates correct element structure with image/audio props", () => {
+    const image = Image({
+      prompt: "young woman, casual outfit",
+      model: createImageModel(),
+    });
+    const element = TalkingHead({
+      image,
+      audio: createResolvedSpeech(),
+      model: createVideoModel(),
+    });
+
+    expect(element.type).toBe("talking-head");
+    expect(element.props.image).toBeDefined();
+    expect(element.props.audio).toBeDefined();
+  });
+
+  test("is thenable (awaitable)", () => {
+    const element = TalkingHead({
+      image: createResolvedImage() as unknown as ReturnType, // NOTE(review): the type argument of `ReturnType` looks stripped by extraction (also on later occurrences) — confirm
+      audio: createResolvedSpeech() as unknown as ReturnType,
+      model: createVideoModel(),
+    });
+
+    expect(typeof (element as any).then).toBe("function"); // only checks that `then` exists; the element is never awaited here
+  });
+
+  test("clip renderer handles pre-resolved TalkingHead element", async () => {
+    const cacheDir = makeTempDir();
+    const counters = { imageCalls: 0, videoCalls: 0, speechCalls: 0 };
+
+    try {
+      const ctx = createContext(cacheDir, counters);
+
+      // Create a pre-resolved TalkingHead element (simulating `await TalkingHead(...)`)
+      const mockVideoFile = File.fromGenerated({
+        uint8Array: new Uint8Array([0, 0, 0, 0x1c, 0x66, 0x74, 0x79, 0x70]),
+        mediaType: "video/mp4",
+      });
+
+      const resolvedTalkingHead = new ResolvedElement<"talking-head">(
+        {
+          type: "talking-head",
+          props: {
+            image: createResolvedImage(),
+            audio: createResolvedSpeech(),
+          },
+          children: [],
+        },
+        {
+          file: mockVideoFile,
+          duration: 5,
+        },
+      );
+
+      // Use it inside a Clip
+      const clip = {
+        type: "clip" as const,
+        props: { duration: 5 },
+        children: [resolvedTalkingHead],
+      };
+
+      const result = await renderClip(clip as any, ctx);
+
+      // The clip should contain a video layer
+      expect(result.layers.length).toBeGreaterThan(0);
+      expect(result.layers.some((l) => l.type === "video")).toBe(true);
+    } finally {
+      cleanupTempDir(cacheDir); // always remove the per-test cache directory
+    }
+  });
+});
+
+describe("renderTalkingHead", () => { // validation errors first, then the two successful render paths
+  test("throws when no image prop provided", async () => {
+    const cacheDir = makeTempDir();
+    const counters = { imageCalls: 0, videoCalls: 0, speechCalls: 0 };
+
+    try {
+      const ctx = createContext(cacheDir, counters);
+      const element = TalkingHead({
+        audio: createResolvedSpeech() as unknown as ReturnType,
+        model: createVideoModel(),
+        // no image — should throw
+      } as any);
+
+      await expect(renderTalkingHead(element as any, ctx)).rejects.toThrow(
+        "TalkingHead requires 'image' prop",
+      );
+    } finally {
+      cleanupTempDir(cacheDir);
+    }
+  });
+
+  test("throws when no audio prop provided", async () => {
+    const cacheDir = makeTempDir();
+    const counters = { imageCalls: 0, videoCalls: 0, speechCalls: 0 };
+
+    try {
+      const ctx = createContext(cacheDir, counters);
+      const element = TalkingHead({
+        image: createResolvedImage() as unknown as ReturnType,
+        model: createVideoModel(),
+        // no audio — should throw
+      } as any);
+
+      await expect(renderTalkingHead(element as any, ctx)).rejects.toThrow(
+        "TalkingHead requires 'audio' prop",
+      );
+    } finally {
+      cleanupTempDir(cacheDir);
+    }
+  });
+
+  test("throws when no model and no defaults.video", async () => {
+    const cacheDir = makeTempDir();
+    const counters = { imageCalls: 0, videoCalls: 0, speechCalls: 0 };
+
+    try {
+      const ctx = createContext(cacheDir, counters);
+      // Remove defaults.video to test error
+      ctx.defaults = { image: createImageModel(), speech: createSpeechModel() };
+
+      const element = TalkingHead({
+        image: createResolvedImage() as unknown as ReturnType,
+        audio: createResolvedSpeech() as unknown as ReturnType,
+        // no model
+      } as any);
+
+      await expect(renderTalkingHead(element as any, ctx)).rejects.toThrow(
+        "TalkingHead requires 'model' prop",
+      );
+    } finally {
+      cleanupTempDir(cacheDir);
+    }
+  });
+
+  test("renders with pre-resolved image and audio (skips generation)", async () => {
+    const cacheDir = makeTempDir();
+    const counters = { imageCalls: 0, videoCalls: 0, speechCalls: 0 };
+
+    try {
+      const ctx = createContext(cacheDir, counters);
+
+      const resolvedImage = createResolvedImage();
+      const resolvedSpeech = createResolvedSpeech();
+
+      const element = TalkingHead({
+        image: resolvedImage as unknown as ReturnType,
+        audio: resolvedSpeech as unknown as ReturnType,
+        model: createVideoModel(),
+      } as any);
+
+      const result = await renderTalkingHead(element as any, ctx);
+
+      // Should NOT have called generateImage (image was pre-resolved)
+      expect(counters.imageCalls).toBe(0);
+      // Should have called generateVideo (for lipsync)
+      expect(counters.videoCalls).toBe(1);
+      expect(result).toBeDefined();
+    } finally {
+      cleanupTempDir(cacheDir);
+    }
+  });
+
+  test("renders with lazy (non-awaited) image element", async () => {
+    const cacheDir = makeTempDir();
+    const counters = { imageCalls: 0, videoCalls: 0, speechCalls: 0 };
+
+    try {
+      const ctx = createContext(cacheDir, counters);
+
+      // Lazy image — not awaited, so it's a raw VargElement
+      const lazyImage = Image({
+        prompt: "young woman, casual outfit",
+        model: createImageModel(),
+      });
+
+      const resolvedSpeech = createResolvedSpeech();
+
+      const element = TalkingHead({
+        image: lazyImage,
+        audio: resolvedSpeech as unknown as ReturnType,
+        model: createVideoModel(),
+      } as any);
+
+      const result = await renderTalkingHead(element as any, ctx);
+
+      // Should have called generateImage (lazy image needed rendering)
+      expect(counters.imageCalls).toBe(1);
+      // Should have called generateVideo (for lipsync)
+      expect(counters.videoCalls).toBe(1);
+      expect(result).toBeDefined();
+    } finally {
+      cleanupTempDir(cacheDir);
+    }
+  });
+});
diff --git a/src/react/renderers/talking-head.ts b/src/react/renderers/talking-head.ts
new file mode 100644
index 00000000..b7792c42
--- /dev/null
+++ b/src/react/renderers/talking-head.ts
@@ -0,0 +1,118 @@
+import type { File } from "../../ai-sdk/file";
+import { ResolvedElement } from "../resolved-element";
+import type { TalkingHeadProps, VargElement } from "../types";
+import type { RenderContext } from "./context";
+import { renderImage } from "./image";
+import { renderSpeech } from "./speech";
+import { renderVideo } from "./video";
+
+/**
+ * Render a TalkingHead element into a video file.
+ *
+ * Pipeline:
+ * 1. Resolve the character image from `image` prop (VargElement or ResolvedElement)
+ * 2. Resolve the speech audio from `audio` prop (VargElement or ResolvedElement)
+ * 3. Generate a lipsync video (image + audio → video)
+ *
+ * The lipsync model is `lipsyncModel`, falling back to `model` and then to
+ * `ctx.defaults.video`, so supplying only `lipsyncModel` is sufficient.
+ *
+ * The result is a video File suitable for use as a VideoLayer.
+ *
+ * @param element - Talking-head element. If it is already a ResolvedElement,
+ *   its file is recorded in `ctx.generatedFiles` and returned unchanged.
+ * @param ctx - Render context providing `defaults` and `generatedFiles`.
+ * @returns The lipsync video file.
+ * @throws Error when no model is available, or `image` / `audio` is missing.
+ */
+export async function renderTalkingHead(
+  element: VargElement<"talking-head">,
+  ctx: RenderContext,
+): Promise<File> {
+  // If already resolved via `await TalkingHead(...)`, reuse the pre-generated file
+  if (element instanceof ResolvedElement) {
+    ctx.generatedFiles.push(element.meta.file);
+    return element.meta.file;
+  }
+
+  const props = element.props as TalkingHeadProps;
+
+  // `lipsyncModel` overrides `model`, so either one satisfies the model
+  // requirement; the context default is the last resort.
+  const lipsyncModel = props.lipsyncModel ?? props.model ?? ctx.defaults?.video;
+  if (!lipsyncModel) {
+    throw new Error(
+      "TalkingHead requires 'model' prop (or set defaults.video in render options)",
+    );
+  }
+
+  if (!props.image) {
+    throw new Error("TalkingHead requires 'image' prop (an Image element)");
+  }
+
+  if (!props.audio) {
+    throw new Error("TalkingHead requires 'audio' prop (a Speech element)");
+  }
+
+  // Step 1 & 2: Resolve character image and speech audio in parallel
+  const [characterFile, speechFile] = await Promise.all([
+    resolveImageProp(props.image, ctx),
+    resolveAudioProp(props.audio, ctx),
+  ]);
+
+  // The two reads are independent, so do them concurrently as well.
+  const [characterImageData, speechAudioData] = await Promise.all([
+    characterFile.arrayBuffer(),
+    speechFile.arrayBuffer(),
+  ]);
+
+  // Step 3: Generate lipsync video (image + audio → video) by handing a
+  // synthetic video element to the video renderer.
+  const videoElement: VargElement<"video"> = {
+    type: "video",
+    props: {
+      prompt: {
+        images: [characterImageData],
+        audio: speechAudioData,
+      },
+      model: lipsyncModel,
+      keepAudio: true,
+      providerOptions: { fal: { resolution: props.resolution ?? "720p" } },
+    },
+    children: [],
+  };
+
+  return renderVideo(videoElement, ctx);
+}
+
+/**
+ * Resolve an image prop — either a pre-resolved ResolvedElement<"image">
+ * or a lazy VargElement<"image"> that needs rendering.
+ */
+async function resolveImageProp(
+  image: VargElement<"image">,
+  ctx: RenderContext,
+): Promise<File> {
+  if (image instanceof ResolvedElement) {
+    ctx.generatedFiles.push(image.meta.file);
+    return image.meta.file;
+  }
+
+  return renderImage(image, ctx);
+}
+
+/**
+ * Resolve an audio prop — either a pre-resolved ResolvedElement<"speech">
+ * or a lazy VargElement<"speech"> that needs rendering.
+ */
+async function resolveAudioProp(
+  audio: VargElement<"speech">,
+  ctx: RenderContext,
+): Promise<File> {
+  if (audio instanceof ResolvedElement) {
+    ctx.generatedFiles.push(audio.meta.file);
+    return audio.meta.file;
+  }
+
+  return renderSpeech(audio, ctx);
+}
diff --git a/src/react/resolve.ts b/src/react/resolve.ts
index 2df8dd7b..28600cd2 100644
--- a/src/react/resolve.ts
+++ b/src/react/resolve.ts
@@ -37,6 +37,7 @@ import type {
   ImageProps,
   MusicProps,
   SpeechProps,
+  TalkingHeadProps,
   VargElement,
 } from "./types";
 
@@ -748,3 +749,104 @@ export async function resolveMusicElement(
     duration,
   });
 }
+
+// ---------------------------------------------------------------------------
+// TalkingHead
+// ---------------------------------------------------------------------------
+/**
+ * Resolve a TalkingHead element by combining an image and a speech track
+ * into a lipsync video. Returns a ResolvedElement<"talking-head"> wrapping the
+ * final video.
+ *
+ * Pipeline:
+ * 1. Resolve the image from `image` prop (generate or reuse pre-resolved)
+ * 2. Resolve the speech from `audio` prop (generate or reuse pre-resolved)
+ * 3. Generate lipsync video from image + audio via `lipsyncModel` ?? `model`
+ *
+ * @param element - The talking-head element being awaited; it is wrapped by
+ *   the returned ResolvedElement.
+ * @param props - The element's props.
+ * @returns The resolved element holding the video file and its probed duration.
+ * @throws Error when neither `lipsyncModel` nor `model` is set, or when
+ *   `image` / `audio` is missing.
+ */
+export async function resolveTalkingHeadElement(
+  element: VargElement<"talking-head">,
+  props: TalkingHeadProps,
+): Promise<ResolvedElement<"talking-head">> {
+  // `lipsyncModel` overrides `model`, so either one is enough.
+  const lipsyncModel = props.lipsyncModel ?? props.model;
+  if (!lipsyncModel) {
+    throw new Error(
+      "await TalkingHead() requires 'model' prop for lipsync video generation",
+    );
+  }
+
+  if (!props.image) {
+    throw new Error(
+      "await TalkingHead() requires 'image' prop (an Image element).",
+    );
+  }
+
+  if (!props.audio) {
+    throw new Error(
+      "await TalkingHead() requires 'audio' prop (a Speech element).",
+    );
+  }
+
+  // Steps 1 & 2: Resolve image and speech in parallel. A ResolvedElement is
+  // used directly; a lazy element is resolved through its own resolver.
+  const [resolvedImage, resolvedSpeech] = await Promise.all([
+    props.image instanceof ResolvedElement
+      ? props.image
+      : resolveImageElement(props.image, props.image.props as ImageProps),
+    props.audio instanceof ResolvedElement
+      ? props.audio
+      : resolveSpeechElement(props.audio, props.audio.props as SpeechProps),
+  ]);
+  const [characterBuffer, speechBuffer] = await Promise.all([
+    resolvedImage.file.arrayBuffer(),
+    resolvedSpeech.file.arrayBuffer(),
+  ]);
+  const characterBytes = new Uint8Array(characterBuffer);
+  const speechBytes = new Uint8Array(speechBuffer);
+
+  // Step 3: Generate lipsync video (image + audio → video)
+  // NOTE(review): unlike renderTalkingHead, `props.resolution` is not
+  // forwarded here — confirm whether this path should honour it.
+  const generateVideo = getCachedGenerateVideo();
+
+  const { video } = await generateVideo({
+    model: lipsyncModel as Parameters<typeof generateVideo>[0]["model"],
+    prompt: {
+      images: [characterBytes],
+      audio: speechBytes,
+    },
+    duration: 0, // duration determined by audio length
+  });
+
+  const mediaType = video.mimeType ?? "video/mp4";
+  const modelId =
+    typeof lipsyncModel === "string" ? lipsyncModel : lipsyncModel.modelId;
+
+  // `||` rather than `??`: an empty text label should also use the fallback.
+  const promptLabel =
+    getTextContent(element.children) || "talking-head lipsync";
+
+  const file = File.fromGenerated({
+    uint8Array: video.uint8Array,
+    mediaType,
+    url: (video as { url?: string }).url,
+  }).withMetadata({
+    type: "video",
+    model: modelId,
+    prompt: `talking-head: ${promptLabel.slice(0, 100)}`,
+  });
+
+  const duration = await probeDuration(file);
+
+  return new ResolvedElement(element, {
+    file,
+    duration,
+  });
+}
diff --git a/src/react/types.ts b/src/react/types.ts
index 012c11c2..dd1e2756 100644
--- a/src/react/types.ts
+++ b/src/react/types.ts
@@ -198,16 +198,20 @@ export interface SpeechProps extends BaseProps, VolumeProps {
 }
 
 export interface TalkingHeadProps extends BaseProps {
-  character?: string;
-  src?: string;
-  voice?: string;
+  /** Pre-resolved or lazy image element to use as the character face. */
+  image?: VargElement<"image">;
+  /** Pre-resolved or lazy speech element to use as the audio track. */
+  audio?: VargElement<"speech">;
+  /** Lipsync video model (e.g. fal.videoModel("sync-v2-pro")). */
   model?: VideoModelV3;
+  /** Lipsync model override; takes precedence over `model` and may be set alone. */
   lipsyncModel?: VideoModelV3;
+  /** Resolution passed to lipsync generation by renderTalkingHead (default: "720p"). */
+  resolution?: "480p" | "720p" | "1080p";
   position?:
     | Position
     | { left?: string; right?: string; top?: string; bottom?: string };
   size?: { width: string; height: string };
-  children?: string;
 }
 
 export interface TitleProps extends BaseProps {