livepeer · victorges · Aug 1, 2025 · Aug 1, 2025 · Aug 1, 2025 · Aug 1, 2025
diff --git a/runner/app/live/pipelines/streamdiffusion.py b/runner/app/live/pipelines/streamdiffusion.py
@@ -6,13 +6,13 @@
 import torch
 from pydantic import BaseModel, Field, model_validator
 from streamdiffusion import StreamDiffusionWrapper
-from streamdiffusion.controlnet.preprocessors import list_preprocessors
+# from streamdiffusion.controlnet.preprocessors import list_preprocessors
 
 from .interface import Pipeline
 from trickle import VideoFrame, VideoOutput
 from trickle import DEFAULT_WIDTH, DEFAULT_HEIGHT
 
-AVAILABLE_PREPROCESSORS = list_preprocessors()
+AVAILABLE_PREPROCESSORS = []  # NOTE(review): preprocessor discovery is disabled together with the list_preprocessors import; confirm this is temporary
 
 class ControlNetConfig(BaseModel):
     """ControlNet configuration model"""
@@ -39,7 +39,8 @@ class Config:
     model_id: Literal[
         "stabilityai/sd-turbo",
         "KBlueLeaf/kohaku-v2.1",
-    ] = "stabilityai/sd-turbo"
+        "stabilityai/sdxl-turbo",
+    ] = "stabilityai/sdxl-turbo"
 
     # Generation parameters
     prompt: str | List[Tuple[str, float]] = "an anime render of a girl with purple hair, masterpiece"
@@ -77,54 +78,54 @@ class Config:
 
     # ControlNet settings
     controlnets: Optional[List[ControlNetConfig]] = [
-        ControlNetConfig(
-            model_id="thibaud/controlnet-sd21-openpose-diffusers",
-            conditioning_scale=0.711,
-            preprocessor="pose_tensorrt",
-            preprocessor_params={},
-            enabled=True,
-            control_guidance_start=0.0,
-            control_guidance_end=1.0,
-        ),
-        ControlNetConfig(
-            model_id="thibaud/controlnet-sd21-hed-diffusers",
-            conditioning_scale=0.2,
-            preprocessor="soft_edge",
-            preprocessor_params={},
-            enabled=True,
-            control_guidance_start=0.0,
-            control_guidance_end=1.0,
-        ),
-        ControlNetConfig(
-            model_id="thibaud/controlnet-sd21-canny-diffusers",
-            conditioning_scale=0.2,
-            preprocessor="canny",
-            preprocessor_params={
-                "low_threshold": 100,
-                "high_threshold": 200
-            },
-            enabled=True,
-            control_guidance_start=0.0,
-            control_guidance_end=1.0,
-        ),
-        ControlNetConfig(
-            model_id="thibaud/controlnet-sd21-depth-diffusers",
-            conditioning_scale=0.5,
-            preprocessor="depth_tensorrt",
-            preprocessor_params={},
-            enabled=True,
-            control_guidance_start=0.0,
-            control_guidance_end=1.0,
-        ),
-        ControlNetConfig(
-            model_id="thibaud/controlnet-sd21-color-diffusers",
-            conditioning_scale=0.2,
-            preprocessor="passthrough",
-            preprocessor_params={},
-            enabled=True,
-            control_guidance_start=0.0,
-            control_guidance_end=1.0,
-        )
+        # ControlNetConfig(
+        #     model_id="thibaud/controlnet-sd21-openpose-diffusers",
+        #     conditioning_scale=0.711,
+        #     preprocessor="pose_tensorrt",
+        #     preprocessor_params={},
+        #     enabled=True,
+        #     control_guidance_start=0.0,
+        #     control_guidance_end=1.0,
+        # ),
+        # ControlNetConfig(
+        #     model_id="thibaud/controlnet-sd21-hed-diffusers",
+        #     conditioning_scale=0.2,
+        #     preprocessor="soft_edge",
+        #     preprocessor_params={},
+        #     enabled=True,
+        #     control_guidance_start=0.0,
+        #     control_guidance_end=1.0,
+        # ),
+        # ControlNetConfig(
+        #     model_id="thibaud/controlnet-sd21-canny-diffusers",
+        #     conditioning_scale=0.2,
+        #     preprocessor="canny",
+        #     preprocessor_params={
+        #         "low_threshold": 100,
+        #         "high_threshold": 200
+        #     },
+        #     enabled=True,
+        #     control_guidance_start=0.0,
+        #     control_guidance_end=1.0,
+        # ),
+        # ControlNetConfig(
+        #     model_id="thibaud/controlnet-sd21-depth-diffusers",
+        #     conditioning_scale=0.5,
+        #     preprocessor="depth_tensorrt",
+        #     preprocessor_params={},
+        #     enabled=True,
+        #     control_guidance_start=0.0,
+        #     control_guidance_end=1.0,
+        # ),
+        # ControlNetConfig(
+        #     model_id="thibaud/controlnet-sd21-color-diffusers",
+        #     conditioning_scale=0.2,
+        #     preprocessor="passthrough",
+        #     preprocessor_params={},
+        #     enabled=True,
+        #     control_guidance_start=0.0,
+        #     control_guidance_end=1.0,
+        # )
     ]
 
     @model_validator(mode="after")
@@ -173,12 +174,12 @@ def process_tensor_sync(self, img_tensor: torch.Tensor):
 
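-        # The incoming frame.tensor is (B, H, W, C) in range [-1, 1] while the
-        # VaeImageProcessor inside the wrapper expects (B, C, H, W) in [0, 1].
+        # The incoming tensor is expected to already be (B, C, H, W) in [0, 1]:
+        # PipelineProcess.send_input now permutes and denormalizes each video frame.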
-        img_tensor = img_tensor.permute(0, 3, 1, 2)
-        img_tensor = cast(torch.Tensor, self.pipe.stream.image_processor.denormalize(img_tensor))
-        img_tensor = self.pipe.preprocess_image(img_tensor)
+        # img_tensor = img_tensor.permute(0, 3, 1, 2)
+        # img_tensor = cast(torch.Tensor, self.pipe.stream.image_processor.denormalize(img_tensor))
+        # img_tensor = self.pipe.preprocess_image(img_tensor)
 
-        # Noop if ControlNets are not enabled
-        self.pipe.update_control_image_efficient(img_tensor)
+        if self.params and self.params.controlnets:
+            self.pipe.update_control_image_efficient(img_tensor)
 
         if self.first_frame:
             self.first_frame = False

diff --git a/runner/app/live/streamer/process.py b/runner/app/live/streamer/process.py
@@ -14,6 +14,8 @@
 from log import config_logging, config_logging_fields, log_timing
 from trickle import InputFrame, AudioFrame, VideoFrame, OutputFrame, VideoOutput, AudioOutput
 
+from diffusers.image_processor import VaeImageProcessor
+
 class PipelineProcess:
     @staticmethod
     def start(pipeline_name: str, params: dict):
@@ -40,6 +42,8 @@ def __init__(self, pipeline_name: str):
         self.start_time = 0.0
         self.request_id = ""
 
+        self.image_processor = VaeImageProcessor()
+
     def is_alive(self):
         return self.process.is_alive()
 
@@ -96,8 +100,15 @@ def reset_stream(self, request_id: str, manifest_id: str, stream_id: str):
     # TODO: Once audio is implemented, combined send_input with input_loop
     # We don't need additional queueing as comfystream already maintains a queue
     def send_input(self, frame: InputFrame):
-        if isinstance(frame, VideoFrame) and not frame.tensor.is_cuda and torch.cuda.is_available():
-            frame = frame.replace_tensor(frame.tensor.cuda())
+        if isinstance(frame, VideoFrame):  # only video frames are converted; any other frame is queued unchanged
+            img_tensor = frame.tensor
+            if not img_tensor.is_cuda and torch.cuda.is_available():
+                img_tensor = img_tensor.cuda()  # move the tensor to the GPU when CUDA is available
+            img_tensor = img_tensor.permute(0, 3, 1, 2)  # assumes (B, H, W, C) input, giving (B, C, H, W); TODO confirm
+            img_tensor = self.image_processor.denormalize(img_tensor)  # presumably maps [-1, 1] to [0, 1]; verify against VaeImageProcessor
+            # NOTE(review): preprocess() is left disabled here: img_tensor = self.image_processor.preprocess(img_tensor)
+            frame = frame.replace_tensor(img_tensor)
+
         self._try_queue_put(self.input_queue, frame)
 
     async def recv_output(self) -> OutputFrame | None:
@@ -170,6 +181,7 @@ async def _initialize_pipeline(self):
                 return pipeline
         except Exception as e:
             self._report_error(f"Error loading pipeline: {e}")
+            logging.exception(e)
             if not params:
                 # Already tried loading with default params
                 raise
@@ -182,6 +194,7 @@ async def _initialize_pipeline(self):
                     return pipeline
             except Exception as e:
                 self._report_error(f"Error loading pipeline with default params: {e}")
+                logging.exception(e)
                 raise
 
     async def _run_pipeline_loops(self):

diff --git a/runner/app/live/trickle/decoder.py b/runner/app/live/trickle/decoder.py
@@ -9,7 +9,7 @@
 
 from .frame import InputFrame
 
-MAX_FRAMERATE=24
+MAX_FRAMERATE=120
 
 def decode_av(pipe_input, frame_callback, put_metadata, target_width, target_height):
     """

diff --git a/runner/app/tools/streamdiffusion/build_tensorrt_internal.sh b/runner/app/tools/streamdiffusion/build_tensorrt_internal.sh
@@ -10,7 +10,7 @@ set -e
 CONDA_PYTHON="/workspace/miniconda3/envs/comfystream/bin/python"
 MODELS="stabilityai/sd-turbo KBlueLeaf/kohaku-v2.1"
 TIMESTEPS="3 4" # This is basically the supported sizes for the t_index_list
-DIMENSIONS="512x512" # Engines are now compiled for the 384-1024 range, but keep this in case it's useful in the future
+DIMENSIONS="1024x1024" # Engines are now compiled for the 384-1024 range, but keep this in case it's useful in the future
 CONTROLNETS="" # Default empty, will be set from command line
 
 # Function to display help

diff --git a/runner/dl_checkpoints.sh b/runner/dl_checkpoints.sh
@@ -131,6 +131,7 @@ function download_streamdiffusion_live_models() {
   # StreamDiffusion
   huggingface-cli download KBlueLeaf/kohaku-v2.1 --include "*.safetensors" "*.json" "*.txt" --exclude ".onnx" ".onnx_data" --cache-dir models
   huggingface-cli download stabilityai/sd-turbo --include "*.safetensors" "*.json" "*.txt" --exclude ".onnx" ".onnx_data" --cache-dir models
+  huggingface-cli download stabilityai/sdxl-turbo --include "*.safetensors" "*.json" "*.txt" --exclude ".onnx" ".onnx_data" --cache-dir models
 
   # ControlNet models
   huggingface-cli download thibaud/controlnet-sd21-openpose-diffusers --include "*.bin" "*.json" "*.txt" --exclude ".onnx" ".onnx_data" --cache-dir models
@@ -213,9 +214,9 @@ function build_streamdiffusion_tensorrt() {
 
   docker run --rm -v ./models:/models --gpus all -l TensorRT-engines $AI_RUNNER_STREAMDIFFUSION_IMAGE \
     bash -c "./app/tools/streamdiffusion/build_tensorrt_internal.sh \
-              --models 'stabilityai/sd-turbo KBlueLeaf/kohaku-v2.1' \
-              --timesteps '1 2 3 4' \
-              --controlnets 'thibaud/controlnet-sd21-openpose-diffusers thibaud/controlnet-sd21-hed-diffusers thibaud/controlnet-sd21-canny-diffusers thibaud/controlnet-sd21-depth-diffusers thibaud/controlnet-sd21-color-diffusers' \
+              --models 'stabilityai/sdxl-turbo' \
+              --timesteps '1 2 3' \
+              --controlnets '' \
               --build-depth-anything \
               --build-pose \
               && \

diff --git a/runner/docker/Dockerfile.live-base-streamdiffusion b/runner/docker/Dockerfile.live-base-streamdiffusion
@@ -29,8 +29,8 @@ RUN conda run -n comfystream pip install --no-cache-dir --force-reinstall \
     conda run -n comfystream pip install --no-cache-dir \
     xformers==0.0.30 --no-deps
 
-# Install StreamDiffusion @ v0.0.1-cnet.4 into the comfystream environment
-RUN conda run -n comfystream pip install git+https://github.com/livepeer/StreamDiffusion.git@v0.0.1-cnet.4#egg=streamdiffusion[tensorrt]
+# Install StreamDiffusion @ 902036d into the comfystream environment
+RUN conda run -n comfystream pip install git+https://github.com/livepeer/StreamDiffusion.git@902036df74ad4a63b2d179a0e6dcc955e6f54c98#egg=streamdiffusion[tensorrt]
 
 # Pin versions of ONNX runtime which are too loose on streamdiffusion setup.py
 RUN conda run -n comfystream pip install --no-cache-dir \