17 changes: 17 additions & 0 deletions diffsynth_engine/conf/models/ace/dit.json
@@ -0,0 +1,17 @@
{
"head_dim": 128,
"lyric_encoder_vocab_size": 6693,
"lyric_hidden_size": 1024,
"max_position": 32768,
"mlp_ratio": 2.5,
"num_heads": 20,
"num_layers": 24,
"out_channels": 8,
"patch_size": [
16,
1
],
"rope_theta": 1000000.0,
"speaker_embedding_dim": 512,
"text_embedding_dim": 768
}
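A minimal sketch of the transformer width these values imply, assuming the common convention that the hidden size is head_dim * num_heads and the MLP inner size is hidden_size * mlp_ratio (neither is stated explicitly in the file):

import json

with open("diffsynth_engine/conf/models/ace/dit.json") as f:
    cfg = json.load(f)

hidden_size = cfg["head_dim"] * cfg["num_heads"]   # 128 * 20 = 2560
mlp_inner = int(hidden_size * cfg["mlp_ratio"])    # 2560 * 2.5 = 6400
print(hidden_size, mlp_inner, cfg["num_layers"])   # 2560 6400 24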
7 changes: 7 additions & 0 deletions diffsynth_engine/conf/models/ace/t5.json
@@ -0,0 +1,7 @@
{
"d_ff": 2048,
"embed_dim": 768,
"num_heads": 12,
"num_encoder_layers": 12,
"vocab_size": 256384
}
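These values read like a T5-base-scale encoder (768-wide, 12 layers, 12 heads, d_ff 2048) with a large multilingual vocabulary. A quick sanity check, assuming the conventional per-head dimension of embed_dim / num_heads:

import json

with open("diffsynth_engine/conf/models/ace/t5.json") as f:
    cfg = json.load(f)

assert cfg["embed_dim"] % cfg["num_heads"] == 0
print(cfg["embed_dim"] // cfg["num_heads"])  # 64 dims per attention head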
64 changes: 64 additions & 0 deletions diffsynth_engine/conf/models/ace/vae/dcae.json
@@ -0,0 +1,64 @@
{
"attention_head_dim": 32,
"decoder_act_fns": "silu",
"decoder_block_out_channels": [
128,
256,
512,
1024
],
"decoder_block_types": [
"ResBlock",
"ResBlock",
"ResBlock",
"EfficientViTBlock"
],
"decoder_layers_per_block": [
3,
3,
3,
3
],
"decoder_norm_types": "rms_norm",
"decoder_qkv_multiscales": [
[],
[],
[
5
],
[
5
]
],
"encoder_block_out_channels": [
128,
256,
512,
1024
],
"encoder_block_types": [
"ResBlock",
"ResBlock",
"ResBlock",
"EfficientViTBlock"
],
"encoder_layers_per_block": [
2,
2,
3,
3
],
"encoder_qkv_multiscales": [
[],
[],
[
5
],
[
5
]
],
"in_channels": 2,
"latent_channels": 8,
"upsample_block_type": "interpolate"
}
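The encoder and decoder are described stage by stage, so the per-stage lists need to stay aligned. A small structural check (a sketch only; the model builder in the repo may validate this differently):

import json

with open("diffsynth_engine/conf/models/ace/vae/dcae.json") as f:
    cfg = json.load(f)

for side in ("encoder", "decoder"):
    stage_lists = [
        cfg[f"{side}_block_out_channels"],
        cfg[f"{side}_block_types"],
        cfg[f"{side}_layers_per_block"],
        cfg[f"{side}_qkv_multiscales"],
    ]
    # four stages per side; attention (EfficientViTBlock) appears only in the last stage
    assert all(len(lst) == 4 for lst in stage_lists), side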
76 changes: 76 additions & 0 deletions diffsynth_engine/conf/models/ace/vae/vocoder.json
@@ -0,0 +1,76 @@
{
"depths": [
3,
3,
9,
3
],
"dims": [
128,
256,
384,
512
],
"f_max": 16000,
"f_min": 40,
"hop_length": 512,
"input_channels": 128,
"kernel_sizes": [
7
],
"n_fft": 2048,
"n_mels": 128,
"num_mels": 512,
"post_conv_kernel_size": 13,
"pre_conv_kernel_size": 13,
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"resblock_kernel_sizes": [
3,
7,
11,
13
],
"sampling_rate": 44100,
"upsample_initial_channel": 1024,
"upsample_kernel_sizes": [
8,
8,
4,
4,
4,
4,
4
],
"upsample_rates": [
4,
4,
2,
2,
2,
2,
2
],
"use_template": false,
"win_length": 2048
}
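A useful consistency check for this vocoder config: the product of upsample_rates should equal hop_length, so each mel frame is expanded back to exactly hop_length audio samples. A short sketch (the frame-duration figure is derived arithmetic, not a value stored in the file):

import json
import math

with open("diffsynth_engine/conf/models/ace/vae/vocoder.json") as f:
    cfg = json.load(f)

assert math.prod(cfg["upsample_rates"]) == cfg["hop_length"]  # 4*4*2*2*2*2*2 = 512
print(cfg["hop_length"] / cfg["sampling_rate"])  # ~0.0116 s of audio per mel frame at 44.1 kHz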
4 changes: 4 additions & 0 deletions diffsynth_engine/configs/__init__.py
@@ -10,12 +10,14 @@
WanSpeech2VideoPipelineConfig,
QwenImagePipelineConfig,
HunyuanPipelineConfig,
ACEStepPipelineConfig,
BaseStateDicts,
SDStateDicts,
SDXLStateDicts,
FluxStateDicts,
WanStateDicts,
WanS2VStateDicts,
ACEStateDicts,
QwenImageStateDicts,
)
from .controlnet import ControlType, ControlNetParams
@@ -32,12 +34,14 @@
"WanSpeech2VideoPipelineConfig",
"QwenImagePipelineConfig",
"HunyuanPipelineConfig",
"ACEStepPipelineConfig",
"BaseStateDicts",
"SDStateDicts",
"SDXLStateDicts",
"FluxStateDicts",
"WanStateDicts",
"WanS2VStateDicts",
"ACEStateDicts",
"QwenImageStateDicts",
"ControlType",
"ControlNetParams",
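With these exports added, the new ACE-Step config and state-dict classes are importable from the package namespace. A minimal usage sketch:

from diffsynth_engine.configs import ACEStepPipelineConfig, ACEStateDicts

print(ACEStepPipelineConfig.__name__, ACEStateDicts.__name__)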
24 changes: 24 additions & 0 deletions diffsynth_engine/configs/pipeline.py
@@ -266,6 +266,22 @@ class HunyuanPipelineConfig(BaseConfig):
image_encoder_dtype: torch.dtype = torch.float16


@dataclass
class ACEStepPipelineConfig(AttentionConfig, OptimizationConfig, BaseConfig):
model_path: str | os.PathLike | List[str | os.PathLike]
model_dtype: torch.dtype = torch.bfloat16
dcae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
vocoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
vae_dtype: torch.dtype = torch.bfloat16
t5_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
t5_dtype: torch.dtype = torch.bfloat16

# default params set by model type
shift: Optional[float] = field(default=None, init=False) # RecifitedFlowScheduler shift factor
cfg_scale: Optional[float | Tuple[float, float]] = field(default=None, init=False) # default CFG scale
num_inference_steps: Optional[int] = field(default=None, init=False) # default inference steps


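A sketch of constructing the new config (paths are placeholders, not files shipped with the repo). Note that shift, cfg_scale, and num_inference_steps are declared with init=False, so callers do not pass them; they are filled in later from model-type defaults:

import torch
from diffsynth_engine.configs import ACEStepPipelineConfig

config = ACEStepPipelineConfig(
    model_path="/path/to/ace_step_dit.safetensors",          # placeholder path
    t5_path="/path/to/ace_step_text_encoder.safetensors",    # placeholder path
    model_dtype=torch.bfloat16,
)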
@dataclass
class BaseStateDicts:
pass
@@ -310,6 +326,14 @@ class WanS2VStateDicts:
audio_encoder: Dict[str, torch.Tensor]


@dataclass
class ACEStateDicts:
model: Dict[str, torch.Tensor]
t5: Dict[str, torch.Tensor]
dcae: Dict[str, torch.Tensor]
vocoder: Dict[str, torch.Tensor]


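A sketch of assembling the four ACE-Step state dicts before building a pipeline (file names are placeholders; safetensors.torch.load_file is a standard API, but this diff does not show how the pipeline itself loads weights):

from safetensors.torch import load_file
from diffsynth_engine.configs import ACEStateDicts

state_dicts = ACEStateDicts(
    model=load_file("ace_step_dit.safetensors"),  # placeholder file names
    t5=load_file("text_encoder.safetensors"),
    dcae=load_file("dcae.safetensors"),
    vocoder=load_file("vocoder.safetensors"),
)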
@dataclass
class QwenImageStateDicts:
model: Dict[str, torch.Tensor]