From f2726b5fadfff55a38f1ae75ab21405336991d29 Mon Sep 17 00:00:00 2001 From: Victor Elias Date: Fri, 1 Aug 2025 21:06:48 +0000 Subject: [PATCH 1/7] Remove controlnets to test sdturbo perf (50FPS) --- runner/app/live/pipelines/streamdiffusion.py | 100 +++++++++---------- runner/app/live/trickle/decoder.py | 2 +- 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/runner/app/live/pipelines/streamdiffusion.py b/runner/app/live/pipelines/streamdiffusion.py index aa0464293..195e818e5 100644 --- a/runner/app/live/pipelines/streamdiffusion.py +++ b/runner/app/live/pipelines/streamdiffusion.py @@ -77,54 +77,54 @@ class Config: # ControlNet settings controlnets: Optional[List[ControlNetConfig]] = [ - ControlNetConfig( - model_id="thibaud/controlnet-sd21-openpose-diffusers", - conditioning_scale=0.711, - preprocessor="pose_tensorrt", - preprocessor_params={}, - enabled=True, - control_guidance_start=0.0, - control_guidance_end=1.0, - ), - ControlNetConfig( - model_id="thibaud/controlnet-sd21-hed-diffusers", - conditioning_scale=0.2, - preprocessor="soft_edge", - preprocessor_params={}, - enabled=True, - control_guidance_start=0.0, - control_guidance_end=1.0, - ), - ControlNetConfig( - model_id="thibaud/controlnet-sd21-canny-diffusers", - conditioning_scale=0.2, - preprocessor="canny", - preprocessor_params={ - "low_threshold": 100, - "high_threshold": 200 - }, - enabled=True, - control_guidance_start=0.0, - control_guidance_end=1.0, - ), - ControlNetConfig( - model_id="thibaud/controlnet-sd21-depth-diffusers", - conditioning_scale=0.5, - preprocessor="depth_tensorrt", - preprocessor_params={}, - enabled=True, - control_guidance_start=0.0, - control_guidance_end=1.0, - ), - ControlNetConfig( - model_id="thibaud/controlnet-sd21-color-diffusers", - conditioning_scale=0.2, - preprocessor="passthrough", - preprocessor_params={}, - enabled=True, - control_guidance_start=0.0, - control_guidance_end=1.0, - ) + # ControlNetConfig( + # model_id="thibaud/controlnet-sd21-openpose-diffusers", + # conditioning_scale=0.711, + # preprocessor="pose_tensorrt", + # preprocessor_params={}, + # enabled=True, + # control_guidance_start=0.0, + # control_guidance_end=1.0, + # ), + # ControlNetConfig( + # model_id="thibaud/controlnet-sd21-hed-diffusers", + # conditioning_scale=0.2, + # preprocessor="soft_edge", + # preprocessor_params={}, + # enabled=True, + # control_guidance_start=0.0, + # control_guidance_end=1.0, + # ), + # ControlNetConfig( + # model_id="thibaud/controlnet-sd21-canny-diffusers", + # conditioning_scale=0.2, + # preprocessor="canny", + # preprocessor_params={ + # "low_threshold": 100, + # "high_threshold": 200 + # }, + # enabled=True, + # control_guidance_start=0.0, + # control_guidance_end=1.0, + # ), + # ControlNetConfig( + # model_id="thibaud/controlnet-sd21-depth-diffusers", + # conditioning_scale=0.5, + # preprocessor="depth_tensorrt", + # preprocessor_params={}, + # enabled=True, + # control_guidance_start=0.0, + # control_guidance_end=1.0, + # ), + # ControlNetConfig( + # model_id="thibaud/controlnet-sd21-color-diffusers", + # conditioning_scale=0.2, + # preprocessor="passthrough", + # preprocessor_params={}, + # enabled=True, + # control_guidance_start=0.0, + # control_guidance_end=1.0, + # ) ] @model_validator(mode="after") @@ -177,8 +177,8 @@ def process_tensor_sync(self, img_tensor: torch.Tensor): img_tensor = cast(torch.Tensor, self.pipe.stream.image_processor.denormalize(img_tensor)) img_tensor = self.pipe.preprocess_image(img_tensor) - # Noop if ControlNets are not enabled - self.pipe.update_control_image_efficient(img_tensor) + if self.params and self.params.controlnets: + self.pipe.update_control_image_efficient(img_tensor) if self.first_frame: self.first_frame = False diff --git a/runner/app/live/trickle/decoder.py b/runner/app/live/trickle/decoder.py index d240bdf96..d9b5796c4 100644 --- a/runner/app/live/trickle/decoder.py +++ b/runner/app/live/trickle/decoder.py @@ -9,7 +9,7 @@ from .frame import InputFrame -MAX_FRAMERATE=24 +MAX_FRAMERATE=120 def decode_av(pipe_input, frame_callback, put_metadata, target_width, target_height): """ From 4812d3f61e46e8cb675c97df209806fca26c2e4e Mon Sep 17 00:00:00 2001 From: Victor Elias Date: Fri, 1 Aug 2025 21:14:28 +0000 Subject: [PATCH 2/7] Move denormalization to outer process (~50FPS) --- runner/app/live/pipelines/streamdiffusion.py | 4 ++-- runner/app/live/streamer/process.py | 13 +++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/runner/app/live/pipelines/streamdiffusion.py b/runner/app/live/pipelines/streamdiffusion.py index 195e818e5..d1297133f 100644 --- a/runner/app/live/pipelines/streamdiffusion.py +++ b/runner/app/live/pipelines/streamdiffusion.py @@ -173,8 +173,8 @@ def process_tensor_sync(self, img_tensor: torch.Tensor): # The incoming frame.tensor is (B, H, W, C) in range [-1, 1] while the # VaeImageProcessor inside the wrapper expects (B, C, H, W) in [0, 1]. - img_tensor = img_tensor.permute(0, 3, 1, 2) - img_tensor = cast(torch.Tensor, self.pipe.stream.image_processor.denormalize(img_tensor)) + # img_tensor = img_tensor.permute(0, 3, 1, 2) + # img_tensor = cast(torch.Tensor, self.pipe.stream.image_processor.denormalize(img_tensor)) img_tensor = self.pipe.preprocess_image(img_tensor) if self.params and self.params.controlnets: diff --git a/runner/app/live/streamer/process.py b/runner/app/live/streamer/process.py index 45ad3e4ae..3cc531842 100644 --- a/runner/app/live/streamer/process.py +++ b/runner/app/live/streamer/process.py @@ -14,6 +14,8 @@ from log import config_logging, config_logging_fields, log_timing from trickle import InputFrame, AudioFrame, VideoFrame, OutputFrame, VideoOutput, AudioOutput +from streamdiffusion.image_utils import denormalize + class PipelineProcess: @staticmethod def start(pipeline_name: str, params: dict): @@ -96,8 +98,15 @@ def reset_stream(self, request_id: str, manifest_id: str, stream_id: str): # TODO: Once audio is implemented, combined send_input with input_loop # We don't need additional queueing as comfystream already maintains a queue def send_input(self, frame: InputFrame): - if isinstance(frame, VideoFrame) and not frame.tensor.is_cuda and torch.cuda.is_available(): - frame = frame.replace_tensor(frame.tensor.cuda()) + if isinstance(frame, VideoFrame): + img_tensor = frame.tensor + if not img_tensor.is_cuda and torch.cuda.is_available(): + img_tensor = img_tensor.cuda() + img_tensor = img_tensor.permute(0, 3, 1, 2) + img_tensor = denormalize(img_tensor) + # img_tensor = self.pipe.preprocess_image(img_tensor) + frame = frame.replace_tensor(img_tensor) + self._try_queue_put(self.input_queue, frame) async def recv_output(self) -> OutputFrame | None: From 48495ef8069076551b9d482e00c9d3823f4dd535 Mon Sep 17 00:00:00 2001 From: Victor Elias Date: Fri, 1 Aug 2025 21:27:29 +0000 Subject: [PATCH 3/7] Move all preprocessing to outer process (~50FPS) --- runner/app/live/pipelines/streamdiffusion.py | 2 +- runner/app/live/streamer/process.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/runner/app/live/pipelines/streamdiffusion.py b/runner/app/live/pipelines/streamdiffusion.py index d1297133f..6b66c815a 100644 --- a/runner/app/live/pipelines/streamdiffusion.py +++ b/runner/app/live/pipelines/streamdiffusion.py @@ -175,7 +175,7 @@ def process_tensor_sync(self, img_tensor: torch.Tensor): # VaeImageProcessor inside the wrapper expects (B, C, H, W) in [0, 1]. # img_tensor = img_tensor.permute(0, 3, 1, 2) # img_tensor = cast(torch.Tensor, self.pipe.stream.image_processor.denormalize(img_tensor)) - img_tensor = self.pipe.preprocess_image(img_tensor) + # img_tensor = self.pipe.preprocess_image(img_tensor) if self.params and self.params.controlnets: self.pipe.update_control_image_efficient(img_tensor) diff --git a/runner/app/live/streamer/process.py b/runner/app/live/streamer/process.py index 3cc531842..32bafd5e2 100644 --- a/runner/app/live/streamer/process.py +++ b/runner/app/live/streamer/process.py @@ -14,7 +14,7 @@ from log import config_logging, config_logging_fields, log_timing from trickle import InputFrame, AudioFrame, VideoFrame, OutputFrame, VideoOutput, AudioOutput -from streamdiffusion.image_utils import denormalize +from diffusers.image_processor import VaeImageProcessor class PipelineProcess: @staticmethod @@ -42,6 +42,8 @@ def __init__(self, pipeline_name: str): self.start_time = 0.0 self.request_id = "" + self.image_processor = VaeImageProcessor() + def is_alive(self): return self.process.is_alive() @@ -103,8 +105,8 @@ def send_input(self, frame: InputFrame): if not img_tensor.is_cuda and torch.cuda.is_available(): img_tensor = img_tensor.cuda() img_tensor = img_tensor.permute(0, 3, 1, 2) - img_tensor = denormalize(img_tensor) - # img_tensor = self.pipe.preprocess_image(img_tensor) + img_tensor = self.image_processor.denormalize(img_tensor) + img_tensor = self.image_processor.preprocess(img_tensor) frame = frame.replace_tensor(img_tensor) self._try_queue_put(self.input_queue, frame) From 0c06d4d2c1145a58a939c586e3cdfa529303302b Mon Sep 17 00:00:00 2001 From: Victor Elias Date: Fri, 1 Aug 2025 21:27:51 +0000 Subject: [PATCH 4/7] Skip normalization back-and-forth (~50FPS) Actually won like 0.5FPS maybe but too subtle --- runner/app/live/streamer/process.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runner/app/live/streamer/process.py b/runner/app/live/streamer/process.py index 32bafd5e2..48e5ad84c 100644 --- a/runner/app/live/streamer/process.py +++ b/runner/app/live/streamer/process.py @@ -42,7 +42,7 @@ def __init__(self, pipeline_name: str): self.start_time = 0.0 self.request_id = "" - self.image_processor = VaeImageProcessor() + self.image_processor = VaeImageProcessor(do_normalize=False) def is_alive(self): return self.process.is_alive() @@ -105,7 +105,7 @@ def send_input(self, frame: InputFrame): if not img_tensor.is_cuda and torch.cuda.is_available(): img_tensor = img_tensor.cuda() img_tensor = img_tensor.permute(0, 3, 1, 2) - img_tensor = self.image_processor.denormalize(img_tensor) + # img_tensor = self.image_processor.denormalize(img_tensor) img_tensor = self.image_processor.preprocess(img_tensor) frame = frame.replace_tensor(img_tensor) From d9997b863ac73d60a67c21f46d8dc2b5b2e21f26 Mon Sep 17 00:00:00 2001 From: Victor Elias Date: Fri, 1 Aug 2025 21:45:06 +0000 Subject: [PATCH 5/7] Offload preprocessing to CPU (~51FPS) --- runner/app/live/streamer/process.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/runner/app/live/streamer/process.py b/runner/app/live/streamer/process.py index 48e5ad84c..9fd037db6 100644 --- a/runner/app/live/streamer/process.py +++ b/runner/app/live/streamer/process.py @@ -102,11 +102,13 @@ def reset_stream(self, request_id: str, manifest_id: str, stream_id: str): def send_input(self, frame: InputFrame): if isinstance(frame, VideoFrame): img_tensor = frame.tensor - if not img_tensor.is_cuda and torch.cuda.is_available(): - img_tensor = img_tensor.cuda() + if img_tensor.is_cuda: + img_tensor = img_tensor.cpu() img_tensor = img_tensor.permute(0, 3, 1, 2) # img_tensor = self.image_processor.denormalize(img_tensor) img_tensor = self.image_processor.preprocess(img_tensor) + if torch.cuda.is_available() and not img_tensor.is_cuda: + img_tensor = img_tensor.cuda() frame = frame.replace_tensor(img_tensor) self._try_queue_put(self.input_queue, frame) From 7baa4e63fe5a3a2d55b4e5da3c8cb9adee02d47b Mon Sep 17 00:00:00 2001 From: Victor Elias Date: Fri, 1 Aug 2025 21:54:20 +0000 Subject: [PATCH 6/7] Revert to 4812d3f --- runner/app/live/pipelines/streamdiffusion.py | 2 +- runner/app/live/streamer/process.py | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/runner/app/live/pipelines/streamdiffusion.py b/runner/app/live/pipelines/streamdiffusion.py index 6b66c815a..d1297133f 100644 --- a/runner/app/live/pipelines/streamdiffusion.py +++ b/runner/app/live/pipelines/streamdiffusion.py @@ -175,7 +175,7 @@ def process_tensor_sync(self, img_tensor: torch.Tensor): # VaeImageProcessor inside the wrapper expects (B, C, H, W) in [0, 1]. # img_tensor = img_tensor.permute(0, 3, 1, 2) # img_tensor = cast(torch.Tensor, self.pipe.stream.image_processor.denormalize(img_tensor)) - # img_tensor = self.pipe.preprocess_image(img_tensor) + img_tensor = self.pipe.preprocess_image(img_tensor) if self.params and self.params.controlnets: self.pipe.update_control_image_efficient(img_tensor) diff --git a/runner/app/live/streamer/process.py b/runner/app/live/streamer/process.py index 9fd037db6..3cc531842 100644 --- a/runner/app/live/streamer/process.py +++ b/runner/app/live/streamer/process.py @@ -14,7 +14,7 @@ from log import config_logging, config_logging_fields, log_timing from trickle import InputFrame, AudioFrame, VideoFrame, OutputFrame, VideoOutput, AudioOutput -from diffusers.image_processor import VaeImageProcessor +from streamdiffusion.image_utils import denormalize class PipelineProcess: @staticmethod @@ -42,8 +42,6 @@ def __init__(self, pipeline_name: str): self.start_time = 0.0 self.request_id = "" - self.image_processor = VaeImageProcessor(do_normalize=False) - def is_alive(self): return self.process.is_alive() @@ -102,13 +100,11 @@ def reset_stream(self, request_id: str, manifest_id: str, stream_id: str): def send_input(self, frame: InputFrame): if isinstance(frame, VideoFrame): img_tensor = frame.tensor - if img_tensor.is_cuda: - img_tensor = img_tensor.cpu() - img_tensor = img_tensor.permute(0, 3, 1, 2) - # img_tensor = self.image_processor.denormalize(img_tensor) - img_tensor = self.image_processor.preprocess(img_tensor) - if torch.cuda.is_available() and not img_tensor.is_cuda: + if not img_tensor.is_cuda and torch.cuda.is_available(): img_tensor = img_tensor.cuda() + img_tensor = img_tensor.permute(0, 3, 1, 2) + img_tensor = denormalize(img_tensor) + # img_tensor = self.pipe.preprocess_image(img_tensor) frame = frame.replace_tensor(img_tensor) self._try_queue_put(self.input_queue, frame) From ed73ecb85522565be140b36fb6759455da3bcb7b Mon Sep 17 00:00:00 2001 From: Victor Elias Date: Mon, 4 Aug 2025 15:23:51 +0000 Subject: [PATCH 7/7] WIPSDXL --- runner/app/live/pipelines/streamdiffusion.py | 9 +++++---- runner/app/live/streamer/process.py | 10 +++++++--- .../tools/streamdiffusion/build_tensorrt_internal.sh | 2 +- runner/dl_checkpoints.sh | 7 ++++--- runner/docker/Dockerfile.live-base-streamdiffusion | 4 ++-- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/runner/app/live/pipelines/streamdiffusion.py b/runner/app/live/pipelines/streamdiffusion.py index d1297133f..b4dfe7077 100644 --- a/runner/app/live/pipelines/streamdiffusion.py +++ b/runner/app/live/pipelines/streamdiffusion.py @@ -6,13 +6,13 @@ import torch from pydantic import BaseModel, Field, model_validator from streamdiffusion import StreamDiffusionWrapper -from streamdiffusion.controlnet.preprocessors import list_preprocessors +# from streamdiffusion.controlnet.preprocessors import list_preprocessors from .interface import Pipeline from trickle import VideoFrame, VideoOutput from trickle import DEFAULT_WIDTH, DEFAULT_HEIGHT -AVAILABLE_PREPROCESSORS = list_preprocessors() +AVAILABLE_PREPROCESSORS = [] class ControlNetConfig(BaseModel): """ControlNet configuration model""" @@ -39,7 +39,8 @@ class Config: model_id: Literal[ "stabilityai/sd-turbo", "KBlueLeaf/kohaku-v2.1", - ] = "stabilityai/sd-turbo" + "stabilityai/sdxl-turbo", + ] = "stabilityai/sdxl-turbo" # Generation parameters prompt: str | List[Tuple[str, float]] = "an anime render of a girl with purple hair, masterpiece" @@ -175,7 +176,7 @@ def process_tensor_sync(self, img_tensor: torch.Tensor): # VaeImageProcessor inside the wrapper expects (B, C, H, W) in [0, 1]. # img_tensor = img_tensor.permute(0, 3, 1, 2) # img_tensor = cast(torch.Tensor, self.pipe.stream.image_processor.denormalize(img_tensor)) - img_tensor = self.pipe.preprocess_image(img_tensor) + # img_tensor = self.pipe.preprocess_image(img_tensor) if self.params and self.params.controlnets: self.pipe.update_control_image_efficient(img_tensor) diff --git a/runner/app/live/streamer/process.py b/runner/app/live/streamer/process.py index 3cc531842..0c1ca467e 100644 --- a/runner/app/live/streamer/process.py +++ b/runner/app/live/streamer/process.py @@ -14,7 +14,7 @@ from log import config_logging, config_logging_fields, log_timing from trickle import InputFrame, AudioFrame, VideoFrame, OutputFrame, VideoOutput, AudioOutput -from streamdiffusion.image_utils import denormalize +from diffusers.image_processor import VaeImageProcessor class PipelineProcess: @staticmethod @@ -42,6 +42,8 @@ def __init__(self, pipeline_name: str): self.start_time = 0.0 self.request_id = "" + self.image_processor = VaeImageProcessor() + def is_alive(self): return self.process.is_alive() @@ -103,8 +105,8 @@ def send_input(self, frame: InputFrame): if not img_tensor.is_cuda and torch.cuda.is_available(): img_tensor = img_tensor.cuda() img_tensor = img_tensor.permute(0, 3, 1, 2) - img_tensor = denormalize(img_tensor) - # img_tensor = self.pipe.preprocess_image(img_tensor) + img_tensor = self.image_processor.denormalize(img_tensor) + # img_tensor = self.image_processor.preprocess(img_tensor) frame = frame.replace_tensor(img_tensor) self._try_queue_put(self.input_queue, frame) @@ -179,6 +181,7 @@ async def _initialize_pipeline(self): return pipeline except Exception as e: self._report_error(f"Error loading pipeline: {e}") + logging.exception(e) if not params: # Already tried loading with default params raise @@ -191,6 +194,7 @@ async def _initialize_pipeline(self): return pipeline except Exception as e: self._report_error(f"Error loading pipeline with default params: {e}") + logging.exception(e) raise async def _run_pipeline_loops(self): diff --git a/runner/app/tools/streamdiffusion/build_tensorrt_internal.sh b/runner/app/tools/streamdiffusion/build_tensorrt_internal.sh index 5773a979b..a0fc75372 100755 --- a/runner/app/tools/streamdiffusion/build_tensorrt_internal.sh +++ b/runner/app/tools/streamdiffusion/build_tensorrt_internal.sh @@ -10,7 +10,7 @@ set -e CONDA_PYTHON="/workspace/miniconda3/envs/comfystream/bin/python" MODELS="stabilityai/sd-turbo KBlueLeaf/kohaku-v2.1" TIMESTEPS="3 4" # This is basically the supported sizes for the t_index_list -DIMENSIONS="512x512" # Engines are now compiled for the 384-1024 range, but keep this in case it's useful in the future +DIMENSIONS="1024x1024" # Engines are now compiled for the 384-1024 range, but keep this in case it's useful in the future CONTROLNETS="" # Default empty, will be set from command line # Function to display help diff --git a/runner/dl_checkpoints.sh b/runner/dl_checkpoints.sh index b6eaec439..c421cac65 100755 --- a/runner/dl_checkpoints.sh +++ b/runner/dl_checkpoints.sh @@ -131,6 +131,7 @@ function download_streamdiffusion_live_models() { # StreamDiffusion huggingface-cli download KBlueLeaf/kohaku-v2.1 --include "*.safetensors" "*.json" "*.txt" --exclude ".onnx" ".onnx_data" --cache-dir models huggingface-cli download stabilityai/sd-turbo --include "*.safetensors" "*.json" "*.txt" --exclude ".onnx" ".onnx_data" --cache-dir models + huggingface-cli download stabilityai/sdxl-turbo --include "*.safetensors" "*.json" "*.txt" --exclude ".onnx" ".onnx_data" --cache-dir models # ControlNet models huggingface-cli download thibaud/controlnet-sd21-openpose-diffusers --include "*.bin" "*.json" "*.txt" --exclude ".onnx" ".onnx_data" --cache-dir models @@ -213,9 +214,9 @@ function build_streamdiffusion_tensorrt() { docker run --rm -v ./models:/models --gpus all -l TensorRT-engines $AI_RUNNER_STREAMDIFFUSION_IMAGE \ bash -c "./app/tools/streamdiffusion/build_tensorrt_internal.sh \ - --models 'stabilityai/sd-turbo KBlueLeaf/kohaku-v2.1' \ - --timesteps '1 2 3 4' \ - --controlnets 'thibaud/controlnet-sd21-openpose-diffusers thibaud/controlnet-sd21-hed-diffusers thibaud/controlnet-sd21-canny-diffusers thibaud/controlnet-sd21-depth-diffusers thibaud/controlnet-sd21-color-diffusers' \ + --models 'stabilityai/sdxl-turbo' \ + --timesteps '1 2 3' \ + --controlnets '' \ --build-depth-anything \ --build-pose \ && \ diff --git a/runner/docker/Dockerfile.live-base-streamdiffusion b/runner/docker/Dockerfile.live-base-streamdiffusion index 83cfc86e4..619e2e7f3 100644 --- a/runner/docker/Dockerfile.live-base-streamdiffusion +++ b/runner/docker/Dockerfile.live-base-streamdiffusion @@ -29,8 +29,8 @@ RUN conda run -n comfystream pip install --no-cache-dir --force-reinstall \ conda run -n comfystream pip install --no-cache-dir \ xformers==0.0.30 --no-deps -# Install StreamDiffusion @ v0.0.1-cnet.4 into the comfystream environment -RUN conda run -n comfystream pip install git+https://github.com/livepeer/StreamDiffusion.git@v0.0.1-cnet.4#egg=streamdiffusion[tensorrt] +# Install StreamDiffusion @ 902036d into the comfystream environment +RUN conda run -n comfystream pip install git+https://github.com/livepeer/StreamDiffusion.git@902036df74ad4a63b2d179a0e6dcc955e6f54c98#egg=streamdiffusion[tensorrt] # Pin versions of ONNX runtime which are too loose on streamdiffusion setup.py RUN conda run -n comfystream pip install --no-cache-dir \