From 32aca9794cd6e7cf989c2c8962b7d98b1407b53b Mon Sep 17 00:00:00 2001
From: sa_ddam213
Date: Thu, 20 Nov 2025 10:00:19 +1300
Subject: [PATCH 1/3] QwenText/QwenImage pipelines

---
 .../Enums/PipelineType.cs                     |   3 +-
 .../Models/TransformerQwenModel.cs            |  62 +++
 .../Pipelines/Qwen/QwenBase.cs                | 451 ++++++++++++++++++
 .../Pipelines/Qwen/QwenConfig.cs              | 174 +++++++
 .../Pipelines/Qwen/QwenPipeline.cs            | 112 +++++
 .../Pipelines/Qwen/QwenConfig.cs              |   9 +
 .../Pipelines/Qwen/QwenPipeline.cs            | 255 ++++++++++
 7 files changed, 1065 insertions(+), 1 deletion(-)
 create mode 100644 TensorStack.StableDiffusion/Models/TransformerQwenModel.cs
 create mode 100644 TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs
 create mode 100644 TensorStack.StableDiffusion/Pipelines/Qwen/QwenConfig.cs
 create mode 100644 TensorStack.StableDiffusion/Pipelines/Qwen/QwenPipeline.cs
 create mode 100644 TensorStack.TextGeneration/Pipelines/Qwen/QwenConfig.cs
 create mode 100644 TensorStack.TextGeneration/Pipelines/Qwen/QwenPipeline.cs

diff --git a/TensorStack.StableDiffusion/Enums/PipelineType.cs b/TensorStack.StableDiffusion/Enums/PipelineType.cs
index c7aa346..1bd81a7 100644
--- a/TensorStack.StableDiffusion/Enums/PipelineType.cs
+++ b/TensorStack.StableDiffusion/Enums/PipelineType.cs
@@ -11,6 +11,7 @@ public enum PipelineType
         StableCascade = 10,
         LatentConsistency = 20,
         Flux = 30,
-        Nitro = 40
+        Nitro = 40,
+        Qwen = 50
     }
 }
diff --git a/TensorStack.StableDiffusion/Models/TransformerQwenModel.cs b/TensorStack.StableDiffusion/Models/TransformerQwenModel.cs
new file mode 100644
index 0000000..0f18ebe
--- /dev/null
+++ b/TensorStack.StableDiffusion/Models/TransformerQwenModel.cs
@@ -0,0 +1,62 @@
+// Copyright (c) TensorStack. All rights reserved.
+// Licensed under the Apache 2.0 License.
+using System.Threading;
+using System.Threading.Tasks;
+using TensorStack.Common;
+using TensorStack.Common.Tensor;
+using TensorStack.StableDiffusion.Config;
+
+namespace TensorStack.StableDiffusion.Models
+{
+    /// <summary>
+    /// TransformerModel: QwenImageTransformer2DModel
+    /// </summary>
+    public class TransformerQwenModel : TransformerModel
+    {
+        /// <summary>
+        /// Initializes a new instance of the <see cref="TransformerQwenModel"/> class.
+        /// </summary>
+        /// <param name="configuration">The configuration.</param>
+        public TransformerQwenModel(TransformerModelConfig configuration)
+            : base(configuration) { }
+
+
+        /// <summary>
+        /// Runs the Transformer model with the specified inputs
+        /// </summary>
+        /// <param name="timestep">The timestep.</param>
+        /// <param name="hiddenStates">The hidden states.</param>
+        /// <param name="encoderHiddenStates">The encoder hidden states.</param>
+        /// <param name="imgShapes">The image shapes.</param>
+        /// <param name="cancellationToken">The cancellation token that can be used by other objects or threads to receive notice of cancellation.</param>
+        public async Task<Tensor<float>> RunAsync(int timestep, Tensor<float> hiddenStates, Tensor<float> encoderHiddenStates, Tensor<long> imgShapes, CancellationToken cancellationToken = default)
+        {
+            if (!Transformer.IsLoaded())
+                await Transformer.LoadAsync(cancellationToken: cancellationToken);
+
+            var txtSequenceLength = encoderHiddenStates.Dimensions[1];
+            var encoderHiddenStatesMask = new Tensor<long>([1, txtSequenceLength]);
+            encoderHiddenStatesMask.Fill(1);
+            using (var transformerParams = new ModelParameters(Transformer.Metadata, cancellationToken))
+            {
+                // Inputs
+                transformerParams.AddInput(hiddenStates);
+                transformerParams.AddScalarInput(timestep);
+                transformerParams.AddInput(encoderHiddenStatesMask);
+                transformerParams.AddInput(encoderHiddenStates);
+                transformerParams.AddInput(imgShapes);
+                transformerParams.AddScalarInput(txtSequenceLength);
+
+                // Outputs
+                transformerParams.AddOutput(hiddenStates.Dimensions);
+
+                // Inference
+                using (var results = await Transformer.RunInferenceAsync(transformerParams))
+                {
+                    return results[0].ToTensor();
+                }
+            }
+        }
+
+    }
+}
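
A minimal sketch of how this wrapper is driven per denoising step; the real call site is QwenBase.RunInferenceAsync further down in this patch, and `config`, `scheduler`, `promptEmbeds` and the shape values here are illustrative, shown for the 1024x1024 / 128-token default:

    // Hypothetical standalone step loop; packed latents are [1, 4096, 64]
    // (a 64x64 token grid of 64 channels), prompt embeddings [1, 128, 3584].
    var transformer = new TransformerQwenModel(config.Transformer);
    var latents = scheduler.CreateRandomSample([1, 4096, 64]);
    foreach (var timestep in scheduler.GetTimesteps())
    {
        var noisePred = await transformer.RunAsync(
            timestep,
            scheduler.ScaleInput(timestep, latents),    // scaled latent input
            promptEmbeds,                               // conditional text embeddings
            new Tensor<long>([1, 64, 64]),              // image shapes placeholder
            cancellationToken: cancellationToken);
        latents = scheduler.Step(timestep, noisePred, latents).Sample;
    }

The same call shape survives patch 3 below, which only drops the mask and sequence-length bindings inside the wrapper.
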
diff --git a/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs
new file mode 100644
index 0000000..fbb4e9d
--- /dev/null
+++ b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs
@@ -0,0 +1,451 @@
+// Copyright (c) TensorStack. All rights reserved.
+// Licensed under the Apache 2.0 License.
+using Microsoft.Extensions.Logging;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Threading;
+using System.Threading.Tasks;
+using TensorStack.Common;
+using TensorStack.Common.Tensor;
+using TensorStack.StableDiffusion.Common;
+using TensorStack.StableDiffusion.Enums;
+using TensorStack.StableDiffusion.Models;
+using TensorStack.StableDiffusion.Schedulers;
+using TensorStack.TextGeneration.Tokenizers;
+using QwenTextConfig = TensorStack.TextGeneration.Pipelines.Qwen.QwenConfig;
+using QwenTextPipeline = TensorStack.TextGeneration.Pipelines.Qwen.QwenPipeline;
+
+namespace TensorStack.StableDiffusion.Pipelines.Qwen
+{
+    public abstract class QwenBase : PipelineBase
+    {
+        /// <summary>
+        /// Initializes a new instance of the <see cref="QwenBase"/> class.
+        /// </summary>
+        /// <param name="transformer">The transformer.</param>
+        /// <param name="textEncoder">The text encoder.</param>
+        /// <param name="autoEncoder">The automatic encoder.</param>
+        /// <param name="logger">The logger.</param>
+        public QwenBase(TransformerQwenModel transformer, QwenTextPipeline textEncoder, AutoEncoderModel autoEncoder, ILogger logger = default) : base(logger)
+        {
+            Transformer = transformer;
+            TextEncoder = textEncoder;
+            AutoEncoder = autoEncoder;
+            Initialize();
+            Logger?.LogInformation("[QwenPipeline] Name: {Name}", Name);
+        }
+
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="QwenBase"/> class.
+        /// </summary>
+        /// <param name="configuration">The configuration.</param>
+        /// <param name="logger">The logger.</param>
+        public QwenBase(QwenConfig configuration, ILogger logger = default) : this(
+            new TransformerQwenModel(configuration.Transformer),
+            new QwenTextPipeline(new QwenTextConfig
+            {
+                OutputLastHiddenStates = true,
+                DecoderConfig = configuration.TextEncoder,
+                Tokenizer = new BPETokenizer(configuration.Tokenizer),
+            }),
+            new AutoEncoderModel(configuration.AutoEncoder),
+            logger)
+        {
+            Name = configuration.Name;
+        }
+
+
+        /// <summary>
+        /// Gets the type of the pipeline.
+        /// </summary>
+        public override PipelineType PipelineType => PipelineType.Qwen;
+
+        /// <summary>
+        /// Gets the friendly name.
+        /// </summary>
+        public override string Name { get; init; } = nameof(PipelineType.Qwen);
+
+        /// <summary>
+        /// Gets the TextEncoder.
+        /// </summary>
+        public QwenTextPipeline TextEncoder { get; init; }
+
+        /// <summary>
+        /// Gets the transformer.
+        /// </summary>
+        public TransformerQwenModel Transformer { get; init; }
+
+        /// <summary>
+        /// Gets the automatic encoder.
+        /// </summary>
+        public AutoEncoderModel AutoEncoder { get; init; }
+
+
+        /// <summary>
+        /// Loads the pipeline.
+        /// </summary>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        public Task LoadAsync(CancellationToken cancellationToken = default)
+        {
+            // Qwen pipelines are lazy loaded on first run
+            return Task.CompletedTask;
+        }
+
+
+        /// <summary>
+        /// Unloads the pipeline.
+        /// </summary>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        public async Task UnloadAsync(CancellationToken cancellationToken = default)
+        {
+            await Task.WhenAll
+            (
+                Transformer.UnloadAsync(),
+                TextEncoder.UnloadAsync(cancellationToken),
+                AutoEncoder.EncoderUnloadAsync(),
+                AutoEncoder.DecoderUnloadAsync()
+            );
+            Logger?.LogInformation("[{PipeLineType}] Pipeline Unloaded", PipelineType);
+        }
+
+
+        /// <summary>
+        /// Validates the options.
+        /// </summary>
+        /// <param name="options">The options.</param>
+        protected override void ValidateOptions(GenerateOptions options)
+        {
+            base.ValidateOptions(options);
+            if (!Transformer.HasControlNet && options.HasControlNet)
+                throw new ArgumentException("Model does not support ControlNet");
+        }
+
+
+        /// <summary>
+        /// Creates the prompt input embeddings.
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        protected async Task<PromptResult> CreatePromptAsync(IPipelineOptions options, CancellationToken cancellationToken = default)
+        {
+            var cachedPrompt = GetPromptCache(options);
+            if (cachedPrompt is not null)
+                return cachedPrompt;
+
+            // Conditional Prompt
+            var promptEmbeds = await TextEncoder.GetLastHiddenState(new TextGeneration.Common.GenerateOptions
+            {
+                Seed = options.Seed,
+                Prompt = options.Prompt,
+                MinLength = 128,
+                MaxLength = 128
+            }, cancellationToken);
+
+            // Unconditional prompt
+            var negativePromptEmbeds = await TextEncoder.GetLastHiddenState(new TextGeneration.Common.GenerateOptions
+            {
+                Seed = options.Seed,
+                Prompt = options.NegativePrompt,
+                MinLength = 128,
+                MaxLength = 128
+            }, cancellationToken);
+
+            return SetPromptCache(options, new PromptResult(promptEmbeds, default, negativePromptEmbeds, default));
+        }
+
+
+        /// <summary>
+        /// Decode the model latents to image
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <param name="latents">The latents.</param>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        protected async Task<ImageTensor> DecodeLatentsAsync(IPipelineOptions options, Tensor<float> latents, CancellationToken cancellationToken = default)
+        {
+            var timestamp = Logger.LogBegin(LogLevel.Debug, "[DecodeLatentsAsync] Begin AutoEncoder Decode");
+            var decoderResult = await AutoEncoder.DecodeAsync(latents, cancellationToken: cancellationToken);
+            if (options.IsLowMemoryEnabled || options.IsLowMemoryDecoderEnabled)
+                await AutoEncoder.DecoderUnloadAsync();
+
+            Logger.LogEnd(LogLevel.Debug, timestamp, "[DecodeLatentsAsync] AutoEncoder Decode Complete");
+            return decoderResult.AsImageTensor();
+        }
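
CreatePromptAsync pins both prompts to a fixed 128-token window (MinLength = MaxLength = 128), so the conditional and unconditional embeddings always share the shape [1, 128, 3584] and can be swapped per guidance pass without re-padding. A rough standalone equivalent of one encoder call (prompt text illustrative):

    var promptEmbeds = await TextEncoder.GetLastHiddenState(new TextGeneration.Common.GenerateOptions
    {
        Prompt = "A cat holding a sign that says hello world",
        MinLength = 128,   // short prompts are padded with EOS (see TokenizePromptAsync below)
        MaxLength = 128    // long prompts are capped
    }, cancellationToken);
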
+
+
+        /// <summary>
+        /// Encode the image to model latents
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        private async Task<Tensor<float>> EncodeLatentsAsync(IPipelineOptions options, CancellationToken cancellationToken = default)
+        {
+            var timestamp = Logger.LogBegin(LogLevel.Debug, "[EncodeLatentsAsync] Begin AutoEncoder Encode");
+            var cacheResult = GetEncoderCache(options);
+            if (cacheResult is not null)
+            {
+                Logger.LogEnd(LogLevel.Debug, timestamp, "[EncodeLatentsAsync] AutoEncoder Encode Complete, Cached Result.");
+                return cacheResult;
+            }
+
+            var inputTensor = options.InputImage.ResizeImage(options.Width, options.Height);
+            var encoderResult = await AutoEncoder.EncodeAsync(inputTensor, cancellationToken: cancellationToken);
+            if (options.IsLowMemoryEnabled || options.IsLowMemoryEncoderEnabled)
+                await AutoEncoder.EncoderUnloadAsync();
+
+            Logger.LogEnd(LogLevel.Debug, timestamp, "[EncodeLatentsAsync] AutoEncoder Encode Complete");
+            return SetEncoderCache(options, encoderResult);
+        }
+
+
+        /// <summary>
+        /// Run Transformer model inference
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <param name="scheduler">The scheduler.</param>
+        /// <param name="prompt">The prompt.</param>
+        /// <param name="progressCallback">The progress callback.</param>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        protected async Task<Tensor<float>> RunInferenceAsync(IPipelineOptions options, IScheduler scheduler, PromptResult prompt, IProgress progressCallback = null, CancellationToken cancellationToken = default)
+        {
+            var timestamp = Logger.LogBegin(LogLevel.Debug, "[RunInferenceAsync] Begin Transformer Inference");
+
+            // Prompt
+            var isGuidanceEnabled = IsGuidanceEnabled(options);
+            var conditionalEmbeds = prompt.PromptEmbeds;
+            var unconditionalEmbeds = prompt.NegativePromptEmbeds;
+
+            // Latents
+            var latents = await CreateLatentInputAsync(options, scheduler, cancellationToken);
+
+            // Create ImgShapes
+            var imgShapes = new Tensor<long>([1, 64, 64]); // TODO:
+
+            // Load Model
+            await LoadTransformerAsync(options, progressCallback, cancellationToken);
+
+            // Timesteps
+            var timesteps = scheduler.GetTimesteps();
+            for (int i = 0; i < timesteps.Count; i++)
+            {
+                var timestep = timesteps[i];
+                var steptime = Stopwatch.GetTimestamp();
+                cancellationToken.ThrowIfCancellationRequested();
+
+                // Inputs
+                var latentInput = scheduler.ScaleInput(timestep, latents);
+
+                // Inference
+                var conditional = await Transformer.RunAsync
+                (
+                    timestep,
+                    latentInput,
+                    conditionalEmbeds,
+                    imgShapes,
+                    cancellationToken: cancellationToken
+                );
+
+                // Guidance
+                if (isGuidanceEnabled)
+                {
+                    var unconditional = await Transformer.RunAsync
+                    (
+                        timestep,
+                        latentInput,
+                        unconditionalEmbeds,
+                        imgShapes,
+                        cancellationToken: cancellationToken
+                    );
+                    conditional = ApplyGuidance(conditional, unconditional, options.GuidanceScale);
+                }
+
+                // Scheduler
+                var stepResult = scheduler.Step(timestep, conditional, latents);
+
+                // Result
+                latents = stepResult.Sample;
+
+                // Progress
+                if (scheduler.IsFinalOrder)
+                    progressCallback.Notify(scheduler.CurrentStep, scheduler.TotalSteps, latents, steptime);
+
+                Logger.LogEnd(LogLevel.Debug, steptime, $"[RunInferenceAsync] Step: {i + 1}/{timesteps.Count}");
+            }
+
+            // Unload
+            if (options.IsLowMemoryEnabled || options.IsLowMemoryComputeEnabled)
+                await Transformer.UnloadAsync();
+
+            Logger.LogEnd(LogLevel.Debug, timestamp, "[RunInferenceAsync] Transformer Inference Complete");
+            return UnpackLatents(latents, options.Width, options.Height);
+        }
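
ApplyGuidance is inherited from PipelineBase and not part of this patch; the conventional classifier-free guidance blend it is expected to compute is `uncond + scale * (cond - uncond)`. A minimal element-wise sketch under that assumption (helper name hypothetical):

    static Tensor<float> ApplyGuidanceSketch(Tensor<float> cond, Tensor<float> uncond, float scale)
    {
        // output = uncond + scale * (cond - uncond); scale = 1 returns cond unchanged
        var result = new Tensor<float>(cond.Dimensions);
        var c = cond.Memory.Span;
        var u = uncond.Memory.Span;
        var r = result.Memory.Span;
        for (int i = 0; i < r.Length; i++)
            r[i] = u[i] + scale * (c[i] - u[i]);
        return result;
    }

Note that IsGuidanceEnabled below returns false for GuidanceScale <= 1, which skips the unconditional pass entirely and halves the per-step transformer cost.
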
+
+
+        /// <summary>
+        /// Create latent input.
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <param name="scheduler">The scheduler.</param>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        private async Task<Tensor<float>> CreateLatentInputAsync(IPipelineOptions options, IScheduler scheduler, CancellationToken cancellationToken = default)
+        {
+            if (options.HasInputImage)
+            {
+                var timestep = scheduler.GetStartTimestep();
+                var encoderResult = await EncodeLatentsAsync(options, cancellationToken);
+                var noiseTensor = scheduler.CreateRandomSample(encoderResult.Dimensions);
+                return PackLatents(scheduler.ScaleNoise(timestep, encoderResult, noiseTensor));
+            }
+
+            var height = options.Height * 2 / AutoEncoder.LatentChannels;
+            var width = options.Width * 2 / AutoEncoder.LatentChannels;
+            return PackLatents(scheduler.CreateRandomSample([1, AutoEncoder.LatentChannels, height, width]));
+        }
+
+
+        /// <summary>
+        /// Gets the model optimizations.
+        /// </summary>
+        /// <param name="generateOptions">The generate options.</param>
+        /// <param name="progressCallback">The progress callback.</param>
+        private ModelOptimization GetOptimizations(IPipelineOptions generateOptions, IProgress progressCallback = null)
+        {
+            var optimizations = new ModelOptimization(Optimization.None);
+            if (Transformer.HasOptimizationsChanged(optimizations))
+            {
+                progressCallback.Notify("Optimizing Pipeline...");
+            }
+            return optimizations;
+        }
+
+
+        /// <summary>
+        /// Determines whether classifier-free guidance is enabled
+        /// </summary>
+        /// <param name="options">The options.</param>
+        private bool IsGuidanceEnabled(IPipelineOptions options)
+        {
+            return options.GuidanceScale > 1;
+        }
+
+
+        /// <summary>
+        /// Load Transformer with optimizations
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <param name="progressCallback">The progress callback.</param>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        private async Task<ModelMetadata> LoadTransformerAsync(IPipelineOptions options, IProgress progressCallback = null, CancellationToken cancellationToken = default)
+        {
+            var optimizations = GetOptimizations(options, progressCallback);
+            return await Transformer.LoadAsync(optimizations, cancellationToken);
+        }
+
+        /// <summary>
+        /// Packs the latents.
+        /// </summary>
+        /// <param name="latents">The latents.</param>
+        /// <returns></returns>
+        protected Tensor<float> PackLatents(Tensor<float> latents)
+        {
+            var height = latents.Dimensions[2] / 2;
+            var width = latents.Dimensions[3] / 2;
+            latents = latents.Reshape([1, AutoEncoder.LatentChannels, height, 2, width, 2]);
+            latents = latents.Permute([0, 2, 4, 1, 3, 5]);
+            latents = latents.Reshape([1, height * width, AutoEncoder.LatentChannels * 4]);
+            return latents;
+        }
+
+
+        /// <summary>
+        /// Unpacks the latents.
+        /// </summary>
+        /// <param name="latents">The latents.</param>
+        /// <param name="width">The width.</param>
+        /// <param name="height">The height.</param>
+        /// <returns></returns>
+        protected Tensor<float> UnpackLatents(Tensor<float> latents, int width, int height)
+        {
+            var channels = latents.Dimensions[2];
+            height = height / AutoEncoder.LatentChannels;
+            width = width / AutoEncoder.LatentChannels;
+            latents = latents.Reshape([1, height, width, channels / 4, 2, 2]);
+            latents = latents.Permute([0, 3, 1, 4, 2, 5]);
+            latents = latents.Reshape([1, channels / (2 * 2), height * 2, width * 2]);
+            return latents;
+        }
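
A shape walk-through of the 2x2 patchify performed by PackLatents, for the default 1024x1024 output (LatentChannels = 16, VAE scale 16, so 128x128 latents):

    // [1, 16, 128, 128]      VAE latents (NCHW)
    // [1, 16, 64, 2, 64, 2]  split height/width into 2x2 patches
    // [1, 64, 64, 16, 2, 2]  move the patch dims next to the channels
    // [1, 4096, 64]          4096 tokens of 16 * 2 * 2 = 64 channels

which lines up with Transformer.InChannels = 64 / OutChannels = 16 in the QwenConfig defaults later in this patch; UnpackLatents inverts the same three steps. One subtlety: both methods derive the latent grid as width / LatentChannels (= width / 16), which only equals width / (VAE scale) because LatentChannels and Scale are both 16 for this model.
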
+
+
+        /// <summary>
+        /// Checks the state of the pipeline.
+        /// </summary>
+        /// <param name="options">The options.</param>
+        protected override async Task CheckPipelineState(IPipelineOptions options)
+        {
+            // Check Transformer/ControlNet status
+            if (options.HasControlNet && Transformer.IsLoaded())
+                await Transformer.UnloadAsync();
+            if (!options.HasControlNet && Transformer.IsControlNetLoaded())
+                await Transformer.UnloadControlNetAsync();
+
+            // Check LowMemory status
+            if ((options.IsLowMemoryEnabled || options.IsLowMemoryTextEncoderEnabled)) // TODO
+                await TextEncoder.UnloadAsync();
+            if ((options.IsLowMemoryEnabled || options.IsLowMemoryComputeEnabled) && Transformer.IsLoaded())
+                await Transformer.UnloadAsync();
+            if ((options.IsLowMemoryEnabled || options.IsLowMemoryComputeEnabled) && Transformer.IsControlNetLoaded())
+                await Transformer.UnloadControlNetAsync();
+            if ((options.IsLowMemoryEnabled || options.IsLowMemoryEncoderEnabled) && AutoEncoder.IsEncoderLoaded())
+                await AutoEncoder.EncoderUnloadAsync();
+            if ((options.IsLowMemoryEnabled || options.IsLowMemoryDecoderEnabled) && AutoEncoder.IsDecoderLoaded())
+                await AutoEncoder.DecoderUnloadAsync();
+        }
+
+
+        /// <summary>
+        /// Configures the supported schedulers.
+        /// </summary>
+        protected override IReadOnlyList<SchedulerType> ConfigureSchedulers()
+        {
+            return [SchedulerType.FlowMatchEulerDiscrete, SchedulerType.FlowMatchEulerDynamic];
+        }
+
+
+        /// <summary>
+        /// Configures the default SchedulerOptions.
+        /// </summary>
+        protected override GenerateOptions ConfigureDefaultOptions()
+        {
+            var options = new GenerateOptions
+            {
+                Steps = 28,
+                Shift = 1f,
+                Width = 1024,
+                Height = 1024,
+                GuidanceScale = 3.5f,
+                Scheduler = SchedulerType.FlowMatchEulerDiscrete
+            };
+            return options;
+        }
+
+
+        /// <summary>
+        /// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources.
+        /// </summary>
+        private bool _disposed;
+        protected override void Dispose(bool disposing)
+        {
+            if (_disposed)
+                return;
+            if (disposing)
+            {
+                TextEncoder?.Dispose();
+                Transformer?.Dispose();
+                AutoEncoder?.Dispose();
+            }
+            _disposed = true;
+        }
+    }
+}
diff --git a/TensorStack.StableDiffusion/Pipelines/Qwen/QwenConfig.cs b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenConfig.cs
new file mode 100644
index 0000000..96afdb1
--- /dev/null
+++ b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenConfig.cs
@@ -0,0 +1,174 @@
+// Copyright (c) TensorStack. All rights reserved.
+// Licensed under the Apache 2.0 License.
+using System;
+using System.IO;
+using System.Linq;
+using TensorStack.Common;
+using TensorStack.StableDiffusion.Config;
+using TensorStack.StableDiffusion.Enums;
+using TensorStack.TextGeneration.Common;
+using TensorStack.TextGeneration.Tokenizers;
+
+namespace TensorStack.StableDiffusion.Pipelines.Qwen
+{
+    public record QwenConfig : PipelineConfig
+    {
+        /// <summary>
+        /// Initializes a new instance of the <see cref="QwenConfig"/> class.
+        /// </summary>
+        public QwenConfig()
+        {
+            Tokenizer = new TokenizerConfig
+            {
+                BOS = 151643,
+                EOS = 151645
+            };
+            TextEncoder = new DecoderConfig
+            {
+                NumHeads = 28,
+                NumLayers = 28,
+                NumKVHeads = 4,
+                HiddenSize = 3584,
+                VocabSize = 152064
+            };
+            Transformer = new TransformerModelConfig
+            {
+                InChannels = 64,
+                OutChannels = 16,
+                JointAttention = 3584,
+                PooledProjection = 768,
+                IsOptimizationSupported = true
+            };
+            AutoEncoder = new AutoEncoderModelConfig
+            {
+                Scale = 16,
+                LatentChannels = 16,
+                ScaleFactor = 1
+            };
+        }
+
+        public string Name { get; init; } = "Qwen";
+        public override PipelineType Pipeline { get; } = PipelineType.Qwen;
+        public TokenizerConfig Tokenizer { get; init; }
+        public DecoderConfig TextEncoder { get; init; }
+        public TransformerModelConfig Transformer { get; init; }
+        public AutoEncoderModelConfig AutoEncoder { get; init; }
+
+
+        /// <summary>
+        /// Sets the execution provider for all models.
+        /// </summary>
+        /// <param name="executionProvider">The execution provider.</param>
+        public override void SetProvider(ExecutionProvider executionProvider)
+        {
+            TextEncoder.SetProvider(executionProvider);
+            Transformer.SetProvider(executionProvider);
+            AutoEncoder.SetProvider(executionProvider);
+        }
+
+
+        /// <summary>
+        /// Saves the configuration to file.
+        /// </summary>
+        /// <param name="configFile">The configuration file.</param>
+        /// <param name="useRelativePaths">if set to <c>true</c> use relative paths.</param>
+        public override void Save(string configFile, bool useRelativePaths = true)
+        {
+            ConfigService.Serialize(configFile, this, useRelativePaths);
+        }
+
+
+        /// <summary>
+        /// Create Qwen configuration from default values
+        /// </summary>
+        /// <param name="name">The name.</param>
+        /// <param name="modelType">Type of the model.</param>
+        /// <param name="executionProvider">The execution provider.</param>
+        /// <returns>QwenConfig.</returns>
+        public static QwenConfig FromDefault(string name, ModelType modelType, ExecutionProvider executionProvider = default)
+        {
+            var config = new QwenConfig { Name = name };
+            config.Transformer.ModelType = modelType;
+            config.SetProvider(executionProvider);
+            return config;
+        }
+
+
+        /// <summary>
+        /// Create Qwen configuration from JSON file
+        /// </summary>
+        /// <param name="configFile">The configuration file.</param>
+        /// <param name="executionProvider">The execution provider.</param>
+        /// <returns>QwenConfig.</returns>
+        public static QwenConfig FromFile(string configFile, ExecutionProvider executionProvider = default)
+        {
+            var config = ConfigService.Deserialize<QwenConfig>(configFile);
+            config.SetProvider(executionProvider);
+            return config;
+        }
+
+
+        /// <summary>
+        /// Create Qwen configuration from folder structure
+        /// </summary>
+        /// <param name="modelFolder">The model folder.</param>
+        /// <param name="modelType">Type of the model.</param>
+        /// <param name="executionProvider">The execution provider.</param>
+        public static QwenConfig FromFolder(string modelFolder, ModelType modelType, ExecutionProvider executionProvider = default)
+        {
+            return CreateFromFolder(modelFolder, default, modelType, executionProvider);
+        }
+
+
+        /// <summary>
+        /// Create Qwen configuration from folder structure
+        /// </summary>
+        /// <param name="modelFolder">The model folder.</param>
+        /// <param name="variant">The variant.</param>
+        /// <param name="modelType">Type of the model.</param>
+        /// <param name="executionProvider">The execution provider.</param>
+        /// <returns>QwenConfig.</returns>
+        public static QwenConfig FromFolder(string modelFolder, string variant, ModelType modelType, ExecutionProvider executionProvider = default)
+        {
+            return CreateFromFolder(modelFolder, variant, modelType, executionProvider);
+        }
+
+
+        /// <summary>
+        /// Create Qwen configuration from folder structure
+        /// </summary>
+        /// <param name="modelFolder">The model folder.</param>
+        /// <param name="variant">The variant.</param>
+        /// <param name="executionProvider">The execution provider.</param>
+        /// <returns>QwenConfig.</returns>
+        public static QwenConfig FromFolder(string modelFolder, string variant, ExecutionProvider executionProvider = default)
+        {
+            string[] typeOptions = ["Turbo", "Distilled", "Dist"];
+            var modelType = typeOptions.Any(v => variant.Contains(v, StringComparison.OrdinalIgnoreCase))
+                ? ModelType.Turbo
+                : ModelType.Base;
+            return CreateFromFolder(modelFolder, variant, modelType, executionProvider);
+        }
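
CreateFromFolder below assumes a diffusers-style ONNX export layout, with the folder name becoming the pipeline name:

    // QwenImage/
    //   tokenizer/               BPE tokenizer files
    //   text_encoder/model.onnx
    //   transformer/model.onnx   (+ optional controlnet.onnx)
    //   vae_encoder/model.onnx
    //   vae_decoder/model.onnx

Variant detection is purely name-based: a variant string containing "Turbo", "Distilled" or "Dist" maps to ModelType.Turbo, anything else to ModelType.Base.
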
+
+
+        /// <summary>
+        /// Create Qwen configuration from folder structure
+        /// </summary>
+        /// <param name="modelFolder">The model folder.</param>
+        /// <param name="variant">The variant.</param>
+        /// <param name="modelType">Type of the model.</param>
+        /// <param name="executionProvider">The execution provider.</param>
+        /// <returns>QwenConfig.</returns>
+        private static QwenConfig CreateFromFolder(string modelFolder, string variant, ModelType modelType, ExecutionProvider executionProvider)
+        {
+            var config = FromDefault(Path.GetFileNameWithoutExtension(modelFolder), modelType, executionProvider);
+            config.Tokenizer.Path = Path.Combine(modelFolder, "tokenizer");
+            config.TextEncoder.Path = GetVariantPath(modelFolder, "text_encoder", "model.onnx", variant);
+            config.Transformer.Path = GetVariantPath(modelFolder, "transformer", "model.onnx", variant);
+            config.AutoEncoder.DecoderModelPath = GetVariantPath(modelFolder, "vae_decoder", "model.onnx", variant);
+            config.AutoEncoder.EncoderModelPath = GetVariantPath(modelFolder, "vae_encoder", "model.onnx", variant);
+            var controlNetPath = GetVariantPath(modelFolder, "transformer", "controlnet.onnx", variant);
+            if (File.Exists(controlNetPath))
+                config.Transformer.ControlNetPath = controlNetPath;
+            return config;
+        }
+    }
+}
diff --git a/TensorStack.StableDiffusion/Pipelines/Qwen/QwenPipeline.cs b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenPipeline.cs
new file mode 100644
index 0000000..cfc3de4
--- /dev/null
+++ b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenPipeline.cs
@@ -0,0 +1,112 @@
+// Copyright (c) TensorStack. All rights reserved.
+// Licensed under the Apache 2.0 License.
+using Microsoft.Extensions.Logging;
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+using TensorStack.Common;
+using TensorStack.Common.Pipeline;
+using TensorStack.Common.Tensor;
+using TensorStack.StableDiffusion.Common;
+using TensorStack.StableDiffusion.Enums;
+using TensorStack.StableDiffusion.Models;
+using QwenTextPipeline = TensorStack.TextGeneration.Pipelines.Qwen.QwenPipeline;
+
+namespace TensorStack.StableDiffusion.Pipelines.Qwen
+{
+    public class QwenPipeline : QwenBase, IPipeline<GenerateOptions, ImageTensor>
+    {
+        /// <summary>
+        /// Initializes a new instance of the <see cref="QwenPipeline"/> class.
+        /// </summary>
+        /// <param name="transformer">The transformer.</param>
+        /// <param name="textEncoder">The text encoder.</param>
+        /// <param name="autoEncoder">The automatic encoder.</param>
+        /// <param name="logger">The logger.</param>
+        public QwenPipeline(TransformerQwenModel transformer, QwenTextPipeline textEncoder, AutoEncoderModel autoEncoder, ILogger logger = null)
+            : base(transformer, textEncoder, autoEncoder, logger) { }
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="QwenPipeline"/> class.
+        /// </summary>
+        /// <param name="configuration">The configuration.</param>
+        /// <param name="logger">The logger.</param>
+        public QwenPipeline(QwenConfig configuration, ILogger logger = null)
+            : base(configuration, logger) { }
+
+
+        /// <summary>
+        /// Run ImageTensor pipeline.
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <param name="progressCallback">The progress callback.</param>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        public async Task<ImageTensor> RunAsync(GenerateOptions options, IProgress progressCallback = null, CancellationToken cancellationToken = default)
+        {
+            ValidateOptions(options);
+
+            var prompt = await CreatePromptAsync(options, cancellationToken);
+            using (var scheduler = CreateScheduler(options))
+            {
+                var latents = await RunInferenceAsync(options, scheduler, prompt, progressCallback, cancellationToken);
+                return await DecodeLatentsAsync(options, latents, cancellationToken);
+            }
+        }
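
A minimal end-to-end usage sketch (the folder path and `provider` instance are illustrative; the option values simply restate the ConfigureDefaultOptions defaults):

    var pipeline = QwenPipeline.FromFolder(@"M:\Models\QwenImage", ModelType.Base, provider);
    var image = await pipeline.RunAsync(new GenerateOptions
    {
        Prompt = "A photo of a corgi wearing sunglasses",
        NegativePrompt = "blurry, low quality",
        Seed = 42,
        Steps = 28,
        Width = 1024,
        Height = 1024,
        GuidanceScale = 3.5f,
        Scheduler = SchedulerType.FlowMatchEulerDiscrete
    });
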
+
+
+        /// <summary>
+        /// Create Qwen pipeline from QwenConfig file
+        /// </summary>
+        /// <param name="configFile">The configuration file.</param>
+        /// <param name="executionProvider">The execution provider.</param>
+        /// <param name="logger">The logger.</param>
+        /// <returns>QwenPipeline.</returns>
+        public static QwenPipeline FromConfig(string configFile, ExecutionProvider executionProvider, ILogger logger = default)
+        {
+            return new QwenPipeline(QwenConfig.FromFile(configFile, executionProvider), logger);
+        }
+
+
+        /// <summary>
+        /// Create Qwen pipeline from folder structure
+        /// </summary>
+        /// <param name="modelFolder">The model folder.</param>
+        /// <param name="modelType">Type of the model.</param>
+        /// <param name="executionProvider">The execution provider.</param>
+        /// <param name="logger">The logger.</param>
+        /// <returns>QwenPipeline.</returns>
+        public static QwenPipeline FromFolder(string modelFolder, ModelType modelType, ExecutionProvider executionProvider, ILogger logger = default)
+        {
+            return new QwenPipeline(QwenConfig.FromFolder(modelFolder, modelType, executionProvider), logger);
+        }
+
+
+        /// <summary>
+        /// Create Qwen pipeline from folder structure
+        /// </summary>
+        /// <param name="modelFolder">The model folder.</param>
+        /// <param name="variant">The variant.</param>
+        /// <param name="modelType">Type of the model.</param>
+        /// <param name="executionProvider">The execution provider.</param>
+        /// <param name="logger">The logger.</param>
+        /// <returns>QwenPipeline.</returns>
+        public static QwenPipeline FromFolder(string modelFolder, string variant, ModelType modelType, ExecutionProvider executionProvider, ILogger logger = default)
+        {
+            return new QwenPipeline(QwenConfig.FromFolder(modelFolder, variant, modelType, executionProvider), logger);
+        }
+
+
+        /// <summary>
+        /// Create Qwen pipeline from folder structure
+        /// </summary>
+        /// <param name="modelFolder">The model folder.</param>
+        /// <param name="variant">The variant.</param>
+        /// <param name="executionProvider">The execution provider.</param>
+        /// <param name="logger">The logger.</param>
+        /// <returns>QwenPipeline.</returns>
+        public static QwenPipeline FromFolder(string modelFolder, string variant, ExecutionProvider executionProvider, ILogger logger = default)
+        {
+            return new QwenPipeline(QwenConfig.FromFolder(modelFolder, variant, executionProvider), logger);
+        }
+    }
+}
diff --git a/TensorStack.TextGeneration/Pipelines/Qwen/QwenConfig.cs b/TensorStack.TextGeneration/Pipelines/Qwen/QwenConfig.cs
new file mode 100644
index 0000000..eee8524
--- /dev/null
+++ b/TensorStack.TextGeneration/Pipelines/Qwen/QwenConfig.cs
@@ -0,0 +1,9 @@
+using TensorStack.TextGeneration.Common;
+
+namespace TensorStack.TextGeneration.Pipelines.Qwen
+{
+    public record QwenConfig : TransformerConfig
+    {
+        public bool OutputLastHiddenStates { get; set; }
+    }
+}
diff --git a/TensorStack.TextGeneration/Pipelines/Qwen/QwenPipeline.cs b/TensorStack.TextGeneration/Pipelines/Qwen/QwenPipeline.cs
new file mode 100644
index 0000000..e1964f7
--- /dev/null
+++ b/TensorStack.TextGeneration/Pipelines/Qwen/QwenPipeline.cs
@@ -0,0 +1,255 @@
+// Copyright (c) TensorStack. All rights reserved.
+// Licensed under the Apache 2.0 License.
+
+using System;
+using System.IO;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+using TensorStack.Common;
+using TensorStack.Common.Pipeline;
+using TensorStack.Common.Tensor;
+using TensorStack.TextGeneration.Cache;
+using TensorStack.TextGeneration.Common;
+using TensorStack.TextGeneration.Processing;
+using TensorStack.TextGeneration.Tokenizers;
+
+namespace TensorStack.TextGeneration.Pipelines.Qwen
+{
+    public class QwenPipeline : DecoderPipeline,
+        IPipeline<GenerateOptions, GenerateResult>,
+        IPipeline<SearchOptions, GenerateResult[]>
+    {
+        /// <summary>
+        /// Initializes a new instance of the <see cref="QwenPipeline"/> class.
+        /// </summary>
+        /// <param name="configuration">The pipeline configuration.</param>
+        public QwenPipeline(QwenConfig configuration)
+            : base(configuration.Tokenizer, configuration.DecoderConfig)
+        {
+            Configuration = configuration;
+        }
+
+        public QwenConfig Configuration { get; }
+
+
+        /// <summary>
+        /// Runs the GreedySearch inference
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <param name="progressCallback">The progress callback.</param>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        /// <returns></returns>
+        public virtual async Task<GenerateResult> RunAsync(GenerateOptions options, IProgress progressCallback = null, CancellationToken cancellationToken = default)
+        {
+            await TokenizePromptAsync(options);
+            var sequence = await GreedySearchAsync(options, progressCallback, cancellationToken);
+            using (sequence)
+            {
+                return new GenerateResult
+                {
+                    Score = sequence.Score,
+                    Result = Tokenizer.Decode(sequence.Tokens),
+                    Tokens = sequence.Tokens,
+                    LastHiddenState = sequence.LastHiddenState
+                };
+            }
+        }
+
+
+        /// <summary>
+        /// Runs the BeamSearch inference
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <param name="progressCallback">The progress callback.</param>
+        /// <param name="cancellationToken">The cancellation token that can be used by other objects or threads to receive notice of cancellation.</param>
+        public async Task<GenerateResult[]> RunAsync(SearchOptions options, IProgress progressCallback = null, CancellationToken cancellationToken = default)
+        {
+            await TokenizePromptAsync(options);
+
+            var sequences = await BeamSearchAsync(options, progressCallback, cancellationToken);
+            var results = new GenerateResult[sequences.Length];
+            for (int beam = 0; beam < sequences.Length; beam++)
+            {
+                var sequence = sequences[beam];
+                using (sequence)
+                {
+                    results[beam] = new GenerateResult
+                    {
+                        Beam = beam,
+                        Score = sequence.Score,
+                        PenaltyScore = sequence.PenaltyScore,
+                        Result = Tokenizer.Decode(sequence.Tokens),
+                        Tokens = sequence.Tokens,
+                        LastHiddenState = sequence.LastHiddenState
+                    };
+                }
+            }
+            return results;
+        }
+
+
+        /// <summary>
+        /// Gets the LastHiddenState.
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        public async Task<Tensor<float>> GetLastHiddenState(GenerateOptions options, CancellationToken cancellationToken = default)
+        {
+            await TokenizePromptAsync(options);
+            using (var sequence = await InitializeAsync(options))
+            {
+                return sequence.LastHiddenState;
+            }
+        }
+
+
+        /// <summary>
+        /// Tokenize the prompt
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <returns>A Task representing the asynchronous operation.</returns>
+        protected override async Task TokenizePromptAsync(GenerateOptions options)
+        {
+            var tokenizerResult = await Tokenizer.EncodeAsync(options.Prompt);
+            var inputIds = tokenizerResult.InputIds.Span.Pad(Tokenizer.EOS, options.MinLength);
+            var mask = tokenizerResult.Mask.Span.Pad(0, options.MinLength);
+            TokenizerOutput = new TokenizerResult(inputIds, mask);
+        }
+
+
+        /// <summary>
+        /// Gets the token processors.
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <returns>ITokenProcessor[].</returns>
+        protected override ITokenProcessor[] GetTokenProcessors(GenerateOptions options)
+        {
+            return
+            [
+                new EOSTokenProcessor(options.MinLength, Tokenizer.EOS),
+                new MaxLengthTokenProcessor(options.MaxLength)
+            ];
+        }
+
+
+        /// <summary>
+        /// Initialize the Decoder cache
+        /// </summary>
+        /// <param name="options">The options.</param>
+        /// <returns>A Task&lt;Sequence&gt; representing the asynchronous operation.</returns>
+        protected override async Task<Sequence> InitializeAsync(GenerateOptions options)
+        {
+            var modelMetadata = await Decoder.LoadAsync();
+            var kvCache = new KVCacheDecoder(modelMetadata, DecoderConfig.NumHeads, DecoderConfig.NumLayers, DecoderConfig.HiddenSize, DecoderConfig.NumKVHeads, options.MaxLength);
+            var sequence = new Sequence(kvCache, Tokenizer.BOS);
+            sequence.Initialize(0);
+
+            var position = TokenizerOutput.Length;
+            var inputIds = TokenizerOutput.InputIds;
+            var positionIds = GetPositionIds(modelMetadata, 0, position);
+            var attentionMask = new Tensor<long>([1, position], 1);
+            RunDecoderInternal(modelMetadata, sequence, inputIds, positionIds, attentionMask, false);
+            return sequence;
+        }
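
The two RunAsync overloads above give greedy and beam decoding; a usage sketch (pipeline construction via the Create factory shown further below; SearchOptions members beyond those used here are not part of this patch):

    var result = await pipeline.RunAsync(new GenerateOptions
    {
        Prompt = "Describe the colour of the sky.",
        MaxLength = 256
    });
    Console.WriteLine(result.Result);       // decoded text

    var beams = await pipeline.RunAsync(new SearchOptions
    {
        Prompt = "Describe the colour of the sky.",
        MaxLength = 256
    });
    Console.WriteLine(beams[0].Result);     // first beam
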
+
+
+        /// <summary>
+        /// Run decoder model
+        /// </summary>
+        /// <param name="sequence">The sequence.</param>
+        /// <returns>A Task&lt;Tensor`1&gt; representing the asynchronous operation.</returns>
+        protected override async Task<Tensor<float>> RunDecoderAsync(Sequence sequence)
+        {
+            var modelMetadata = await Decoder.LoadAsync();
+            var position = TokenizerOutput.Length + sequence.Tokens.Count;
+            var inputIds = new Tensor<long>([1, 1], sequence.Tokens[^1]);
+            var positionIds = GetPositionIds(modelMetadata, position);
+            var attentionMask = new Tensor<long>([1, position], 1);
+            return RunDecoderInternal(modelMetadata, sequence, inputIds, positionIds, attentionMask, true);
+        }
+
+
+        /// <summary>
+        /// Runs the decoder
+        /// </summary>
+        /// <param name="modelMetadata">The model metadata.</param>
+        /// <param name="sequence">The sequence.</param>
+        /// <param name="inputIds">The input ids.</param>
+        /// <param name="positionIds">The position ids.</param>
+        /// <param name="attentionMask">The attention mask.</param>
+        /// <param name="useBranchCache">if set to <c>true</c> [use branch cache].</param>
+        private Tensor<float> RunDecoderInternal(ModelMetadata modelMetadata, Sequence sequence, Tensor<long> inputIds, Tensor<long> positionIds, Tensor<long> attentionMask, bool useBranchCache)
+        {
+            using (var parameters = new ModelParameters(modelMetadata))
+            {
+                // Inputs
+                parameters.AddInput(inputIds);
+                parameters.AddInput(attentionMask);
+                if (positionIds != null)
+                    parameters.AddInput(positionIds);
+
+                foreach (var pastKeyValue in sequence.Cache)
+                    parameters.AddInput(pastKeyValue, false);
+
+                // Outputs
+                foreach (var output in modelMetadata.Outputs)
+                    parameters.AddOutput();
+
+                // Result
+                var modelResult = Decoder.RunInference(parameters);
+                using (var logitsResult = modelResult[0])
+                {
+                    var dimension = logitsResult.GetDimensions();
+                    var logits = logitsResult.ToTensor(dimension[1..]);
+                    var lastHiddenState = Configuration.OutputLastHiddenStates ? modelResult[^1].ToTensor() : default;
+                    var presentKeyValues = Configuration.OutputLastHiddenStates ? modelResult.ToArray()[1..^1] : modelResult.ToArray()[1..];
+                    sequence.UpdateCache(presentKeyValues, useBranchCache, lastHiddenState);
+                    return logits;
+                }
+            }
+        }
+
+
+        /// <summary>
+        /// Creates the QwenPipeline
+        /// </summary>
+        /// <param name="provider">The provider.</param>
+        /// <param name="modelPath">The model path.</param>
+        /// <param name="model">The decoder model.</param>
+        /// <returns>QwenPipeline.</returns>
+        public static QwenPipeline Create(ExecutionProvider provider, string modelPath, string model = "model.onnx")
+        {
+            // Qwen-2.5-VL
+            // https://huggingface.co/Qwen/Qwen-Image/blob/main/text_encoder/config.json
+            var numHeads = 28;
+            var numLayers = 28;
+            var hiddenSize = 3584;
+            var numKVHeads = 4;
+            var vocabSize = 152064;
+            var config = new QwenConfig
+            {
+                OutputLastHiddenStates = true,
+                Tokenizer = new BPETokenizer(new TokenizerConfig
+                {
+                    BOS = 151643,
+                    EOS = 151645,
+                    Path = modelPath
+                }),
+                DecoderConfig = new DecoderConfig
+                {
+                    Path = Path.Combine(modelPath, model),
+                    VocabSize = vocabSize,
+                    NumHeads = numHeads,
+                    NumLayers = numLayers,
+                    HiddenSize = hiddenSize,
+                    NumKVHeads = numKVHeads
+                }
+            };
+
+            config.DecoderConfig.SetProvider(provider);
+            return new QwenPipeline(config);
+        }
+
+    }
+}
\ No newline at end of file
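
The Create factory above also lets the text encoder run standalone; a sketch of pulling the fixed-size prompt embedding the image pipeline consumes (path and `provider` illustrative; the shape follows from MinLength/MaxLength = 128 and HiddenSize = 3584):

    var textEncoder = QwenPipeline.Create(provider, @"M:\Models\QwenImage\text_encoder");
    var lastHiddenState = await textEncoder.GetLastHiddenState(new GenerateOptions
    {
        Prompt = "A photo of a corgi",
        MinLength = 128,
        MaxLength = 128
    });
    // expected lastHiddenState shape: [1, 128, 3584]
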
From 8ee1e354d04218497ff1125402a44fe790344eab Mon Sep 17 00:00:00 2001
From: sa_ddam213
Date: Thu, 20 Nov 2025 12:45:38 +1300
Subject: [PATCH 2/3] Add AutoEncoder LatentsMean/LatentsStd normalization

---
 .../Config/AutoEncoderConfig.cs               |  9 ++++-
 .../Models/AutoEncoderModel.cs                | 34 ++++++++++++++++
 .../Pipelines/Qwen/QwenBase.cs                |  2 +-
 .../Pipelines/Qwen/QwenConfig.cs              | 40 ++++++++++++++++++-
 4 files changed, 82 insertions(+), 3 deletions(-)

diff --git a/TensorStack.StableDiffusion/Config/AutoEncoderConfig.cs b/TensorStack.StableDiffusion/Config/AutoEncoderConfig.cs
index e85251a..dc81919 100644
--- a/TensorStack.StableDiffusion/Config/AutoEncoderConfig.cs
+++ b/TensorStack.StableDiffusion/Config/AutoEncoderConfig.cs
@@ -1,5 +1,6 @@
 // Copyright (c) TensorStack. All rights reserved.
 // Licensed under the Apache 2.0 License.
+using System.Text.Json.Serialization;
 using TensorStack.Common;
 
 namespace TensorStack.StableDiffusion.Config
@@ -14,5 +15,11 @@ public record AutoEncoderModelConfig : ModelConfig
         public int LatentChannels { get; set; } = 4;
         public string DecoderModelPath { get; set; }
         public string EncoderModelPath { get; set; }
-    }
+
+        [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
+        public float[] LatentsStd { get; set; }
+
+        [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
+        public float[] LatentsMean { get; set; }
+    }
 }
diff --git a/TensorStack.StableDiffusion/Models/AutoEncoderModel.cs b/TensorStack.StableDiffusion/Models/AutoEncoderModel.cs
index d50be16..1049b33 100644
--- a/TensorStack.StableDiffusion/Models/AutoEncoderModel.cs
+++ b/TensorStack.StableDiffusion/Models/AutoEncoderModel.cs
@@ -142,6 +142,7 @@ public virtual async Task<Tensor<float>> DecodeAsync(Tensor<float> inputTensor,
             if (!disableShift)
                 inputTensor.Add(ShiftFactor);
 
+            ApplyNormalization(inputTensor, Configuration.LatentsMean, Configuration.LatentsStd);
             var outputDimensions = new[] { 1, OutChannels, inputTensor.Dimensions[2] * Scale, inputTensor.Dimensions[3] * Scale };
             using (var modelParameters = new ModelParameters(Decoder.Metadata, cancellationToken))
             {
@@ -195,6 +196,39 @@ public virtual async Task<Tensor<float>> EncodeAsync(ImageTensor inputTensor, bo
         }
 
 
+        /// <summary>
+        /// Applies per-channel normalization to a latent tensor in-place, equivalent to:
+        /// latents = latents / latentsStd + latentsMean
+        /// </summary>
+        /// <param name="latents">The latents.</param>
+        /// <param name="latentsMean">Per-channel mean values. Length must equal the number of channels in <paramref name="latents"/>.</param>
+        /// <param name="latentsStd">Per-channel standard deviation values. Length must equal the number of channels in <paramref name="latents"/>. Each value is inverted (1 / std) before applying to the tensor.</param>
+        private static void ApplyNormalization(Tensor<float> latents, ReadOnlySpan<float> latentsMean, ReadOnlySpan<float> latentsStd)
+        {
+            if (latentsMean.IsEmpty || latentsStd.IsEmpty)
+                return;
+
+            var dimensions = latents.Dimensions;
+            var channels = dimensions[1];
+
+            Span<float> invStd = stackalloc float[channels];
+            for (int c = 0; c < channels; c++)
+                invStd[c] = 1f / latentsStd[c];
+
+            var data = latents.Memory.Span;
+            var strideC = data.Length / channels;
+
+            for (int c = 0; c < channels; c++)
+            {
+                var mean = latentsMean[c];
+                var inv = invStd[c];
+                var slice = data.Slice(c * strideC, strideC);
+                for (int i = 0; i < slice.Length; i++)
+                    slice[i] = slice[i] * inv + mean;
+            }
+        }
+
+
         /// <summary>
         /// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources.
         /// </summary>
diff --git a/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs
index fbb4e9d..de78e18 100644
--- a/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs
+++ b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs
@@ -165,7 +165,7 @@ protected async Task<PromptResult> CreatePromptAsync(IPipelineOptions options, C
         protected async Task<ImageTensor> DecodeLatentsAsync(IPipelineOptions options, Tensor<float> latents, CancellationToken cancellationToken = default)
         {
             var timestamp = Logger.LogBegin(LogLevel.Debug, "[DecodeLatentsAsync] Begin AutoEncoder Decode");
-            var decoderResult = await AutoEncoder.DecodeAsync(latents, cancellationToken: cancellationToken);
+            var decoderResult = await AutoEncoder.DecodeAsync(latents, disableShift: true, disableScale: true, cancellationToken: cancellationToken);
             if (options.IsLowMemoryEnabled || options.IsLowMemoryDecoderEnabled)
                 await AutoEncoder.DecoderUnloadAsync();
 
diff --git a/TensorStack.StableDiffusion/Pipelines/Qwen/QwenConfig.cs b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenConfig.cs
index 96afdb1..4d1bff4 100644
--- a/TensorStack.StableDiffusion/Pipelines/Qwen/QwenConfig.cs
+++ b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenConfig.cs
@@ -43,7 +43,45 @@ public QwenConfig()
             {
                 Scale = 16,
                 LatentChannels = 16,
-                ScaleFactor = 1
+                ScaleFactor = 1,
+                LatentsMean =
+                [
+                    -0.7571f,
+                    -0.7089f,
+                    -0.9113f,
+                    0.1075f,
+                    -0.1745f,
+                    0.9653f,
+                    -0.1517f,
+                    1.5508f,
+                    0.4134f,
+                    -0.0715f,
+                    0.5517f,
+                    -0.3632f,
+                    -0.1922f,
+                    -0.9497f,
+                    0.2503f,
+                    -0.2921f
+                ],
+                LatentsStd =
+                [
+                    2.8184f,
+                    1.4541f,
+                    2.3275f,
+                    2.6558f,
+                    1.2196f,
+                    1.7708f,
+                    2.6052f,
+                    2.0743f,
+                    3.2687f,
+                    2.1526f,
+                    2.8652f,
+                    1.5579f,
+                    1.6382f,
+                    1.1253f,
+                    2.8251f,
+                    1.916f
+                ]
             };
         }
 
From 1d95332e1e9c2d6fd114bb1a4063589269c7f4df Mon Sep 17 00:00:00 2001
From: sa_ddam213
Date: Thu, 20 Nov 2025 14:07:33 +1300
Subject: [PATCH 3/3] txt_seq_lens and encoder_hidden_states_mask inputs can be calculated in model

---
 .../Models/TransformerQwenModel.cs                     | 9 ++-------
 TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs | 2 +-
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/TensorStack.StableDiffusion/Models/TransformerQwenModel.cs b/TensorStack.StableDiffusion/Models/TransformerQwenModel.cs
index 0f18ebe..c88471f 100644
--- a/TensorStack.StableDiffusion/Models/TransformerQwenModel.cs
+++ b/TensorStack.StableDiffusion/Models/TransformerQwenModel.cs
@@ -29,23 +29,18 @@ public TransformerQwenModel(TransformerModelConfig configuration)
         /// <param name="encoderHiddenStates">The encoder hidden states.</param>
         /// <param name="imgShapes">The image shapes.</param>
         /// <param name="cancellationToken">The cancellation token that can be used by other objects or threads to receive notice of cancellation.</param>
-        public async Task<Tensor<float>> RunAsync(int timestep, Tensor<float> hiddenStates, Tensor<float> encoderHiddenStates, Tensor<long> imgShapes, CancellationToken cancellationToken = default)
+        public async Task<Tensor<float>> RunAsync(int timestep, Tensor<float> hiddenStates, Tensor<float> encoderHiddenStates, Tensor<long> imgShapes, CancellationToken cancellationToken = default)
         {
             if (!Transformer.IsLoaded())
                 await Transformer.LoadAsync(cancellationToken: cancellationToken);
 
-            var txtSequenceLength = encoderHiddenStates.Dimensions[1];
-            var encoderHiddenStatesMask = new Tensor<long>([1, txtSequenceLength]);
-            encoderHiddenStatesMask.Fill(1);
             using (var transformerParams = new ModelParameters(Transformer.Metadata, cancellationToken))
             {
                 // Inputs
                 transformerParams.AddInput(hiddenStates);
-                transformerParams.AddScalarInput(timestep);
-                transformerParams.AddInput(encoderHiddenStatesMask);
                 transformerParams.AddInput(encoderHiddenStates);
+                transformerParams.AddScalarInput(timestep);
                 transformerParams.AddInput(imgShapes);
-                transformerParams.AddScalarInput(txtSequenceLength);
 
                 // Outputs
                 transformerParams.AddOutput(hiddenStates.Dimensions);
diff --git a/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs
index de78e18..78c1c34 100644
--- a/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs
+++ b/TensorStack.StableDiffusion/Pipelines/Qwen/QwenBase.cs
@@ -220,7 +220,7 @@ protected async Task<Tensor<float>> RunInferenceAsync(IPipelineOptions options,
             var latents = await CreateLatentInputAsync(options, scheduler, cancellationToken);
 
             // Create ImgShapes
-            var imgShapes = new Tensor<long>([1, 64, 64]); // TODO:
+            var imgShapes = new Tensor<long>([1, 64, 64]); // TODO: H/W
 
             // Load Model
             await LoadTransformerAsync(options, progressCallback, cancellationToken);
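
A quick numeric check of the patch-2 normalization direction (the decode path applies x / std + mean per channel; an encode path would apply the inverse, (x - mean) * std):

    // channel 0: std = 2.8184, mean = -0.7571
    // raw latent   x = 1.0
    // normalized   1.0 / 2.8184 + (-0.7571) = -0.4023

With patch 2, QwenBase.DecodeLatentsAsync passes disableShift/disableScale so the generic ShiftFactor/ScaleFactor path in AutoEncoderModel is bypassed and only this per-channel mean/std table is applied to Qwen latents.
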