From 2f4e8e63ead2c1548c13cf85287b2c3e412ae0ae Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Wed, 8 Jan 2025 16:47:43 +0530 Subject: [PATCH 01/13] feat:initial implementation to add support for LTX-Video model --- runner/app/pipelines/image_to_video.py | 35 ++++++++++++++++++++++++-- runner/app/routes/image_to_video.py | 19 ++++++++++++++ runner/dl_checkpoints.sh | 5 ++-- 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/runner/app/pipelines/image_to_video.py b/runner/app/pipelines/image_to_video.py index 96841ba16..4f5e3c647 100644 --- a/runner/app/pipelines/image_to_video.py +++ b/runner/app/pipelines/image_to_video.py @@ -5,7 +5,7 @@ import PIL import torch -from diffusers import StableVideoDiffusionPipeline +from diffusers import LTXImageToVideoPipeline, StableVideoDiffusionPipeline from huggingface_hub import file_download from PIL import ImageFile @@ -22,6 +22,8 @@ class ImageToVideoPipeline(Pipeline): def __init__(self, model_id: str): + self.pipeline_name = "" + self.model_id = model_id kwargs = {"cache_dir": get_model_dir()} @@ -41,7 +43,28 @@ def __init__(self, model_id: str): kwargs["torch_dtype"] = torch.float16 kwargs["variant"] = "fp16" - self.ldm = StableVideoDiffusionPipeline.from_pretrained(model_id, **kwargs) + try: + if any(substring in model_id.lower() for substring in ("ltx-video", "ltx")): + logger.info("Loading LTXImageToVideoPipeline for model_id: %s", model_id) + self.pipeline_name = "LTXImageToVideoPipeline" + self.ldm = LTXImageToVideoPipeline.from_pretrained(model_id, **kwargs) + else: + logger.info("Loading StableVideoDiffusionPipeline for model_id: %s", model_id) + self.pipeline_name = "StableVideoDiffusionPipeline" + self.ldm = StableVideoDiffusionPipeline.from_pretrained(model_id, **kwargs) + except Exception as loading_error: + logger.error("Failed to load %s : %s." 
%(self.pipeline_name,loading_error)) + # Trying to load the LTXImageToVideoPipeline if the StableVideoDiffusionPipeline fails to load and there is a chance that model name doesn't match the if condition for LTX-Video + # (for future extra models support) + try: + logger.info("Trying LTXImageToVideoPipeline for model_id: %s", model_id) + self.pipeline_name = "LTXImageToVideoPipeline" + self.ldm = LTXImageToVideoPipeline.from_pretrained(model_id, **kwargs) + except Exception as loading_error: + logger.error("Failed to load both LTXImageToVideoPipeline and StableVideoDiffusionPipeline: %s. Please ensure the model ID is compatible.", loading_error) + raise loading_error + + self.ldm.to(get_torch_device()) sfast_enabled = os.getenv("SFAST", "").strip().lower() == "true" @@ -113,6 +136,14 @@ def __call__( seed = kwargs.pop("seed", None) safety_check = kwargs.pop("safety_check", True) + if self.pipeline_name == "LTXImageToVideoPipeline": + del kwargs["fps"] + del kwargs["motion_bucket_id"] + del kwargs["noise_aug_strength"] + elif self.pipeline_name == "StableVideoDiffusionPipeline": + del kwargs["prompt"] + del kwargs["negative_prompt"] + if "decode_chunk_size" not in kwargs: # Decrease decode_chunk_size to reduce memory usage. kwargs["decode_chunk_size"] = 4 diff --git a/runner/app/routes/image_to_video.py b/runner/app/routes/image_to_video.py index eb64a3ef1..7a410c6a3 100644 --- a/runner/app/routes/image_to_video.py +++ b/runner/app/routes/image_to_video.py @@ -74,6 +74,19 @@ async def image_to_video( UploadFile, File(description="Uploaded image to generate a video from."), ], + prompt: Annotated[ + str, + Form(description="Text prompt(s) to guide video generation for prompt accepting models.") + ] = "", + negative_prompt: Annotated[ + str, + Form( + description=( + "Text prompt(s) to guide what to exclude from video generation for prompt accepting models. " + "Ignored if guidance_scale < 1." 
+ ) + ), + ] = "", model_id: Annotated[ str, Form(description="Hugging Face model ID used for video generation.") ] = "", @@ -123,6 +136,9 @@ async def image_to_video( ) ), ] = 25, # NOTE: Hardcoded due to varying pipeline values. + num_frames: Annotated[ + int, Form(description="The number of video frames to generate.") + ] = 25, # NOTE: Added `25` as default value to consider for `stable-video-diffusion-img2vid-xt` model having smaller default value than LTX-V in its pipeline. pipeline: Pipeline = Depends(get_pipeline), token: HTTPAuthorizationCredentials = Depends(HTTPBearer(auto_error=False)), ): @@ -159,6 +175,9 @@ async def image_to_video( try: batch_frames, has_nsfw_concept = pipeline( image=Image.open(image.file).convert("RGB"), + prompt=prompt, + negative_prompt=negative_prompt, + num_frames=num_frames, height=height, width=width, fps=fps, diff --git a/runner/dl_checkpoints.sh b/runner/dl_checkpoints.sh index ccfa95d42..ad7818c06 100755 --- a/runner/dl_checkpoints.sh +++ b/runner/dl_checkpoints.sh @@ -78,8 +78,9 @@ function download_all_models() { huggingface-cli download SG161222/Realistic_Vision_V6.0_B1_noVAE --include "*.fp16.safetensors" "*.json" "*.txt" "*.bin" --exclude ".onnx" ".onnx_data" --cache-dir models huggingface-cli download black-forest-labs/FLUX.1-schnell --include "*.safetensors" "*.json" "*.txt" "*.model" --exclude ".onnx" ".onnx_data" --cache-dir models - # Download image-to-video models. - huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --include "*.fp16.safetensors" "*.json" --cache-dir models + # Download image-to-video models. + huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --include "*.fp16.safetensors" "*.json" --cache-dir models + huggingface-cli download Lightricks/LTX-Video --include "*.safetensors" "*.json" "*.txt" --exclude ".onnx" ".onnx_data" --cache-dir models # Download image-to-text models. 
huggingface-cli download Salesforce/blip-image-captioning-large --include "*.safetensors" "*.json" --cache-dir models From 9f1737db77c9b3ca977acc8a67d774a907b5a1cc Mon Sep 17 00:00:00 2001 From: Brad P Date: Tue, 18 Mar 2025 11:00:17 -0500 Subject: [PATCH 02/13] chore:add extra needed inputs for LTX-Video model --- runner/gateway.openapi.yaml | 16 ++++++++++++++++ runner/openapi.yaml | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/runner/gateway.openapi.yaml b/runner/gateway.openapi.yaml index e4272356a..abe628f64 100644 --- a/runner/gateway.openapi.yaml +++ b/runner/gateway.openapi.yaml @@ -674,6 +674,17 @@ components: format: binary title: Image description: Uploaded image to generate a video from. + prompt: + type: string + title: Prompt + description: Text prompt(s) to guide video generation for prompt accepting models. + default: '' + negative_prompt: + type: string + title: Negative Prompt + description: Text prompt(s) to guide what to exclude from video generation for prompt accepting models. + Ignored if guidance_scale < 1. + default: '' model_id: type: string title: Model Id @@ -722,6 +733,11 @@ components: description: Number of denoising steps. More steps usually lead to higher quality images but slower inference. Modulated by strength. default: 25 + num_frames: + type: integer + title: Num Frames + description: The number of video frames to generate. + default: 25 type: object required: - image diff --git a/runner/openapi.yaml b/runner/openapi.yaml index 196f98f03..6e6309160 100644 --- a/runner/openapi.yaml +++ b/runner/openapi.yaml @@ -709,6 +709,17 @@ components: format: binary title: Image description: Uploaded image to generate a video from. + prompt: + type: string + title: Prompt + description: Text prompt(s) to guide video generation for prompt accepting models. 
+ default: '' + negative_prompt: + type: string + title: Negative Prompt + description: Text prompt(s) to guide what to exclude from video generation for prompt accepting models. + Ignored if guidance_scale < 1. + default: '' model_id: type: string title: Model Id @@ -757,6 +768,11 @@ components: description: Number of denoising steps. More steps usually lead to higher quality images but slower inference. Modulated by strength. default: 25 + num_frames: + type: integer + title: Num Frames + description: The number of video frames to generate. + default: 25 type: object required: - image From c8f189408356a8044c97c96bfecb1d5fb802742d Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Fri, 10 Jan 2025 21:02:10 +0530 Subject: [PATCH 03/13] chore:add suggested changes for kwargs deletion --- runner/app/pipelines/image_to_video.py | 29 +++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/runner/app/pipelines/image_to_video.py b/runner/app/pipelines/image_to_video.py index 4f5e3c647..2ae4570eb 100644 --- a/runner/app/pipelines/image_to_video.py +++ b/runner/app/pipelines/image_to_video.py @@ -1,7 +1,8 @@ import logging +import inspect import os import time -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Type import PIL import torch @@ -136,14 +137,6 @@ def __call__( seed = kwargs.pop("seed", None) safety_check = kwargs.pop("safety_check", True) - if self.pipeline_name == "LTXImageToVideoPipeline": - del kwargs["fps"] - del kwargs["motion_bucket_id"] - del kwargs["noise_aug_strength"] - elif self.pipeline_name == "StableVideoDiffusionPipeline": - del kwargs["prompt"] - del kwargs["negative_prompt"] - if "decode_chunk_size" not in kwargs: # Decrease decode_chunk_size to reduce memory usage. 
kwargs["decode_chunk_size"] = 4
 
@@ -163,6 +156,13 @@ def __call__(
         ):
             del kwargs["num_inference_steps"]
 
+        if self.pipeline_name == "LTXImageToVideoPipeline":
+            pipeline_class = LTXImageToVideoPipeline
+        elif self.pipeline_name == "StableVideoDiffusionPipeline":
+            pipeline_class = StableVideoDiffusionPipeline
+
+        kwargs = self._filter_valid_kwargs(pipeline_class, kwargs)
+
         if safety_check:
             _, has_nsfw_concept = self._safety_checker.check_nsfw_images([image])
         else:
@@ -177,5 +177,14 @@ def __call__(
 
         return outputs.frames, has_nsfw_concept
 
+    @staticmethod
+    def _filter_valid_kwargs(pipeline_class: Type, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Filters the kwargs to just include keys that are necessary for the pipeline_class.
+        """
+
+        valid_kwargs = inspect.signature(pipeline_class.__call__).parameters.keys()
+        return {k: v for k, v in kwargs.items() if k in valid_kwargs}
+
     def __str__(self) -> str:
-        return f"ImageToVideoPipeline model_id={self.model_id}"
+        return f"ImageToVideoPipeline model_id={self.model_id}"
\ No newline at end of file

From 4f3b839c1a31cf557b8acb1771b88c7fec02f875 Mon Sep 17 00:00:00 2001
From: RUFFY-369
Date: Fri, 10 Jan 2025 21:04:38 +0530
Subject: [PATCH 04/13] style:add line at EOF

---
 runner/app/pipelines/image_to_video.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runner/app/pipelines/image_to_video.py b/runner/app/pipelines/image_to_video.py
index 2ae4570eb..8c02543ee 100644
--- a/runner/app/pipelines/image_to_video.py
+++ b/runner/app/pipelines/image_to_video.py
@@ -187,4 +187,4 @@ def _filter_valid_kwargs(pipeline_class: Type, kwargs: Dict[str, Any]) -> Dict[s
         return {k: v for k, v in kwargs.items() if k in valid_kwargs}
 
     def __str__(self) -> str:
-        return f"ImageToVideoPipeline model_id={self.model_id}"
\ No newline at end of file
+        return f"ImageToVideoPipeline model_id={self.model_id}"

From 2f7407775cb2d968d88620e51de0f6ff674b2038 Mon Sep 17 00:00:00 2001
From: RUFFY-369
Date: Sat, 11 Jan 2025 02:12:14
+0530 Subject: [PATCH 05/13] chore:disable deepcache and sfast for LTXImageToVideoPipeline as it is not supported --- runner/app/pipelines/image_to_video.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/runner/app/pipelines/image_to_video.py b/runner/app/pipelines/image_to_video.py index 8c02543ee..7d7007908 100644 --- a/runner/app/pipelines/image_to_video.py +++ b/runner/app/pipelines/image_to_video.py @@ -76,7 +76,9 @@ def __init__(self, model_id: str): "as it may lead to suboptimal performance. Please disable one of them." ) - if sfast_enabled: + if sfast_enabled and self.pipeline_name == "LTXImageToVideoPipeline": + logger.warning("StableFast optimization is not compatible with LTXImageToVideoPipeline so,skipping.") + elif sfast_enabled: logger.info( "ImageToVideoPipeline will be dynamically compiled with stable-fast " "for %s", @@ -119,9 +121,11 @@ def __init__(self, model_id: str): ) logger.info("Total warmup time: %s seconds", total_time) - if deepcache_enabled: + if deepcache_enabled and self.pipeline_name == "LTXImageToVideoPipeline": + logger.warning("DeepCache optimization is not compatible with LTXImageToVideoPipeline so,skipping.") + elif deepcache_enabled: logger.info( - "TextToImagePipeline will be optimized with DeepCache for %s", + "ImageToVideoPipeline will be optimized with DeepCache for %s", model_id, ) from app.pipelines.optim.deepcache import enable_deepcache From d60cdcf68aab84c7de37e98d632704617ef49ae2 Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Sat, 11 Jan 2025 02:19:06 +0530 Subject: [PATCH 06/13] chore:upgrade diffusers requirement --- runner/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner/requirements.txt b/runner/requirements.txt index 0f37d12a2..4708a92a2 100644 --- a/runner/requirements.txt +++ b/runner/requirements.txt @@ -1,4 +1,4 @@ -diffusers==0.31.0 +diffusers==0.32.1 accelerate==0.30.1 transformers==4.43.3 fastapi==0.111.0 From 
c0f5ffbe9dc59b05cd091623359adede504f0867 Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Sat, 11 Jan 2025 15:40:55 +0530 Subject: [PATCH 07/13] chore:suggested changes to make pipeline more generic --- runner/app/pipelines/image_to_video.py | 31 ++++++++------------------ 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/runner/app/pipelines/image_to_video.py b/runner/app/pipelines/image_to_video.py index 7d7007908..0435de5d7 100644 --- a/runner/app/pipelines/image_to_video.py +++ b/runner/app/pipelines/image_to_video.py @@ -6,7 +6,7 @@ import PIL import torch -from diffusers import LTXImageToVideoPipeline, StableVideoDiffusionPipeline +from diffusers import DiffusionPipeline, LTXImageToVideoPipeline, StableVideoDiffusionPipeline from huggingface_hub import file_download from PIL import ImageFile @@ -44,30 +44,17 @@ def __init__(self, model_id: str): kwargs["torch_dtype"] = torch.float16 kwargs["variant"] = "fp16" - try: - if any(substring in model_id.lower() for substring in ("ltx-video", "ltx")): - logger.info("Loading LTXImageToVideoPipeline for model_id: %s", model_id) - self.pipeline_name = "LTXImageToVideoPipeline" - self.ldm = LTXImageToVideoPipeline.from_pretrained(model_id, **kwargs) - else: - logger.info("Loading StableVideoDiffusionPipeline for model_id: %s", model_id) - self.pipeline_name = "StableVideoDiffusionPipeline" - self.ldm = StableVideoDiffusionPipeline.from_pretrained(model_id, **kwargs) - except Exception as loading_error: - logger.error("Failed to load %s : %s." 
%(self.pipeline_name,loading_error)) - # Trying to load the LTXImageToVideoPipeline if the StableVideoDiffusionPipeline fails to load and there is a chance that model name doesn't match the if condition for LTX-Video - # (for future extra models support) - try: - logger.info("Trying LTXImageToVideoPipeline for model_id: %s", model_id) - self.pipeline_name = "LTXImageToVideoPipeline" - self.ldm = LTXImageToVideoPipeline.from_pretrained(model_id, **kwargs) - except Exception as loading_error: - logger.error("Failed to load both LTXImageToVideoPipeline and StableVideoDiffusionPipeline: %s. Please ensure the model ID is compatible.", loading_error) - raise loading_error - + logger.info("Loading DiffusionPipeline for model_id: %s", model_id) + self.ldm = DiffusionPipeline.from_pretrained(model_id, **kwargs) + + if any(substring in model_id.lower() for substring in ("ltx-video", "ltx")): + logger.info("Adjusting to LTXImageToVideoPipeline for model_id: %s", model_id) + self.ldm = LTXImageToVideoPipeline.from_pipe(self.ldm) self.ldm.to(get_torch_device()) + self.pipeline_name = type(self.ldm).__name__ + sfast_enabled = os.getenv("SFAST", "").strip().lower() == "true" deepcache_enabled = os.getenv("DEEPCACHE", "").strip().lower() == "true" if sfast_enabled and deepcache_enabled: From cd32c055324045c7ce8c958c39806d5d7b13dd84 Mon Sep 17 00:00:00 2001 From: Brad P Date: Tue, 18 Mar 2025 11:27:56 -0500 Subject: [PATCH 08/13] add model offload for efficient ram usage --- runner/app/pipelines/image_to_video.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runner/app/pipelines/image_to_video.py b/runner/app/pipelines/image_to_video.py index 0435de5d7..dbd9a98b6 100644 --- a/runner/app/pipelines/image_to_video.py +++ b/runner/app/pipelines/image_to_video.py @@ -50,8 +50,9 @@ def __init__(self, model_id: str): if any(substring in model_id.lower() for substring in ("ltx-video", "ltx")): logger.info("Adjusting to LTXImageToVideoPipeline for model_id: %s", 
model_id) self.ldm = LTXImageToVideoPipeline.from_pipe(self.ldm) - - self.ldm.to(get_torch_device()) + self.ldm.enable_model_cpu_offload() + else: + self.ldm.to(get_torch_device()) self.pipeline_name = type(self.ldm).__name__ From 61d1783a7f8d623b1d246fc0dbfeef906a726bfe Mon Sep 17 00:00:00 2001 From: Brad P Date: Mon, 24 Mar 2025 10:37:24 -0500 Subject: [PATCH 09/13] fix --- runner/app/pipelines/image_to_video.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/runner/app/pipelines/image_to_video.py b/runner/app/pipelines/image_to_video.py index dbd9a98b6..205eea9ca 100644 --- a/runner/app/pipelines/image_to_video.py +++ b/runner/app/pipelines/image_to_video.py @@ -51,6 +51,10 @@ def __init__(self, model_id: str): logger.info("Adjusting to LTXImageToVideoPipeline for model_id: %s", model_id) self.ldm = LTXImageToVideoPipeline.from_pipe(self.ldm) self.ldm.enable_model_cpu_offload() + self.ldm.vae.enable_tiling() + LOW_VRAM = os.getenv("USE_LOW_VRAM", "false") + if LOW_VRAM == "true": + self.ldm.enable_sequential_cpu_offload() else: self.ldm.to(get_torch_device()) From 4e93f04e771a2c5047caf6d8992ca027159ef1c9 Mon Sep 17 00:00:00 2001 From: Brad P Date: Thu, 27 Mar 2025 03:52:46 -0500 Subject: [PATCH 10/13] change to VAE slicing to save on some vram --- runner/app/pipelines/image_to_video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner/app/pipelines/image_to_video.py b/runner/app/pipelines/image_to_video.py index 205eea9ca..ec1a854ef 100644 --- a/runner/app/pipelines/image_to_video.py +++ b/runner/app/pipelines/image_to_video.py @@ -51,7 +51,7 @@ def __init__(self, model_id: str): logger.info("Adjusting to LTXImageToVideoPipeline for model_id: %s", model_id) self.ldm = LTXImageToVideoPipeline.from_pipe(self.ldm) self.ldm.enable_model_cpu_offload() - self.ldm.vae.enable_tiling() + self.ldm.vae.enable_slicing() LOW_VRAM = os.getenv("USE_LOW_VRAM", "false") if LOW_VRAM == "true": self.ldm.enable_sequential_cpu_offload() From 
b3e9a2e17f0bf10b63f661a991975df987efd205 Mon Sep 17 00:00:00 2001 From: Brad P Date: Thu, 27 Mar 2025 03:53:10 -0500 Subject: [PATCH 11/13] update openapi generated spec for description updates --- runner/gateway.openapi.yaml | 7 ++++--- runner/openapi.yaml | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/runner/gateway.openapi.yaml b/runner/gateway.openapi.yaml index abe628f64..682cf6103 100644 --- a/runner/gateway.openapi.yaml +++ b/runner/gateway.openapi.yaml @@ -677,13 +677,14 @@ components: prompt: type: string title: Prompt - description: Text prompt(s) to guide video generation for prompt accepting models. + description: Text prompt(s) to guide video generation for prompt accepting + models. default: '' negative_prompt: type: string title: Negative Prompt - description: Text prompt(s) to guide what to exclude from video generation for prompt accepting models. - Ignored if guidance_scale < 1. + description: Text prompt(s) to guide what to exclude from video generation + for prompt accepting models. Ignored if guidance_scale < 1. default: '' model_id: type: string diff --git a/runner/openapi.yaml b/runner/openapi.yaml index 6e6309160..80a2bdd96 100644 --- a/runner/openapi.yaml +++ b/runner/openapi.yaml @@ -712,13 +712,14 @@ components: prompt: type: string title: Prompt - description: Text prompt(s) to guide video generation for prompt accepting models. + description: Text prompt(s) to guide video generation for prompt accepting + models. default: '' negative_prompt: type: string title: Negative Prompt - description: Text prompt(s) to guide what to exclude from video generation for prompt accepting models. - Ignored if guidance_scale < 1. + description: Text prompt(s) to guide what to exclude from video generation + for prompt accepting models. Ignored if guidance_scale < 1. 
default: '' model_id: type: string From 9a9cddece9c55fb8e02221f4aac7713cdc4fb6c0 Mon Sep 17 00:00:00 2001 From: Brad | ad-astra <99882368+ad-astra-video@users.noreply.github.com> Date: Thu, 17 Apr 2025 09:14:54 -0500 Subject: [PATCH 12/13] Update requirements.txt update to latest diffusers and transformers --- runner/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runner/requirements.txt b/runner/requirements.txt index 4708a92a2..849ff590c 100644 --- a/runner/requirements.txt +++ b/runner/requirements.txt @@ -1,6 +1,6 @@ -diffusers==0.32.1 +diffusers==0.33.1 accelerate==0.30.1 -transformers==4.43.3 +transformers==4.51.3 fastapi==0.111.0 pydantic==2.7.2 Pillow==10.3.0 From bcf2454d2c994d46a1536abf37fad3e887e7fd34 Mon Sep 17 00:00:00 2001 From: Brad P Date: Thu, 29 May 2025 15:18:42 -0500 Subject: [PATCH 13/13] fix dependencies issues --- runner/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner/requirements.txt b/runner/requirements.txt index 571b18cf1..bf71a1a02 100644 --- a/runner/requirements.txt +++ b/runner/requirements.txt @@ -6,7 +6,7 @@ pydantic==2.7.2 Pillow==10.3.0 python-multipart==0.0.9 uvicorn==0.30.0 -huggingface_hub==0.23.2 +huggingface_hub>=0.27.0 xformers==0.0.23 triton>=2.1.0 peft==0.11.1