17 changes: 17 additions & 0 deletions diffsynth_engine/conf/models/ace/dit.json
@@ -0,0 +1,17 @@
{
"head_dim": 128,
"lyric_encoder_vocab_size": 6693,
"lyric_hidden_size": 1024,
"max_position": 32768,
"mlp_ratio": 2.5,
"num_heads": 20,
"num_layers": 24,
"out_channels": 8,
"patch_size": [
16,
1
],
"rope_theta": 1000000.0,
"speaker_embedding_dim": 512,
"text_embedding_dim": 768
}
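A minimal sketch of the transformer width these values imply, assuming the common convention that the hidden size is head_dim * num_heads and the MLP inner size is hidden_size * mlp_ratio (neither is stated explicitly in the file):

import json

with open("diffsynth_engine/conf/models/ace/dit.json") as f:
    cfg = json.load(f)

hidden_size = cfg["head_dim"] * cfg["num_heads"]   # 128 * 20 = 2560
mlp_inner = int(hidden_size * cfg["mlp_ratio"])    # 2560 * 2.5 = 6400
print(hidden_size, mlp_inner, cfg["num_layers"])   # 2560 6400 24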
7 changes: 7 additions & 0 deletions diffsynth_engine/conf/models/ace/t5.json
@@ -0,0 +1,7 @@
{
"d_ff": 2048,
"embed_dim": 768,
"num_heads": 12,
"num_encoder_layers": 12,
"vocab_size": 256384
}
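These values read like a T5-base-scale encoder (768-wide, 12 layers, 12 heads, d_ff 2048) with a large multilingual vocabulary. A quick sanity check, assuming the conventional per-head dimension of embed_dim / num_heads:

import json

with open("diffsynth_engine/conf/models/ace/t5.json") as f:
    cfg = json.load(f)

assert cfg["embed_dim"] % cfg["num_heads"] == 0
print(cfg["embed_dim"] // cfg["num_heads"])  # 64 dims per attention head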
64 changes: 64 additions & 0 deletions diffsynth_engine/conf/models/ace/vae/dcae.json
@@ -0,0 +1,64 @@
{
"attention_head_dim": 32,
"decoder_act_fns": "silu",
"decoder_block_out_channels": [
128,
256,
512,
1024
],
"decoder_block_types": [
"ResBlock",
"ResBlock",
"ResBlock",
"EfficientViTBlock"
],
"decoder_layers_per_block": [
3,
3,
3,
3
],
"decoder_norm_types": "rms_norm",
"decoder_qkv_multiscales": [
[],
[],
[
5
],
[
5
]
],
"encoder_block_out_channels": [
128,
256,
512,
1024
],
"encoder_block_types": [
"ResBlock",
"ResBlock",
"ResBlock",
"EfficientViTBlock"
],
"encoder_layers_per_block": [
2,
2,
3,
3
],
"encoder_qkv_multiscales": [
[],
[],
[
5
],
[
5
]
],
"in_channels": 2,
"latent_channels": 8,
"upsample_block_type": "interpolate"
}
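The encoder and decoder are described stage by stage, so the per-stage lists need to stay aligned. A small structural check (a sketch only; the model builder in the repo may validate this differently):

import json

with open("diffsynth_engine/conf/models/ace/vae/dcae.json") as f:
    cfg = json.load(f)

for side in ("encoder", "decoder"):
    stage_lists = [
        cfg[f"{side}_block_out_channels"],
        cfg[f"{side}_block_types"],
        cfg[f"{side}_layers_per_block"],
        cfg[f"{side}_qkv_multiscales"],
    ]
    # four stages per side; attention (EfficientViTBlock) appears only in the last stage
    assert all(len(lst) == 4 for lst in stage_lists), side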
76 changes: 76 additions & 0 deletions diffsynth_engine/conf/models/ace/vae/vocoder.json
@@ -0,0 +1,76 @@
{
"depths": [
3,
3,
9,
3
],
"dims": [
128,
256,
384,
512
],
"f_max": 16000,
"f_min": 40,
"hop_length": 512,
"input_channels": 128,
"kernel_sizes": [
7
],
"n_fft": 2048,
"n_mels": 128,
"num_mels": 512,
"post_conv_kernel_size": 13,
"pre_conv_kernel_size": 13,
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"resblock_kernel_sizes": [
3,
7,
11,
13
],
"sampling_rate": 44100,
"upsample_initial_channel": 1024,
"upsample_kernel_sizes": [
8,
8,
4,
4,
4,
4,
4
],
"upsample_rates": [
4,
4,
2,
2,
2,
2,
2
],
"use_template": false,
"win_length": 2048
}
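A useful consistency check for this vocoder config: the product of upsample_rates should equal hop_length, so each mel frame is expanded back to exactly hop_length audio samples. A short sketch (the frame-duration figure is derived arithmetic, not a value stored in the file):

import json
import math

with open("diffsynth_engine/conf/models/ace/vae/vocoder.json") as f:
    cfg = json.load(f)

assert math.prod(cfg["upsample_rates"]) == cfg["hop_length"]  # 4*4*2*2*2*2*2 = 512
print(cfg["hop_length"] / cfg["sampling_rate"])  # ~0.0116 s of audio per mel frame at 44.1 kHz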
4 changes: 4 additions & 0 deletions diffsynth_engine/configs/__init__.py
@@ -10,12 +10,14 @@
WanSpeech2VideoPipelineConfig,
QwenImagePipelineConfig,
HunyuanPipelineConfig,
ACEStepPipelineConfig,
BaseStateDicts,
SDStateDicts,
SDXLStateDicts,
FluxStateDicts,
WanStateDicts,
WanS2VStateDicts,
ACEStateDicts,
QwenImageStateDicts,
)
from .controlnet import ControlType, ControlNetParams
@@ -32,12 +34,14 @@
"WanSpeech2VideoPipelineConfig",
"QwenImagePipelineConfig",
"HunyuanPipelineConfig",
"ACEStepPipelineConfig",
"BaseStateDicts",
"SDStateDicts",
"SDXLStateDicts",
"FluxStateDicts",
"WanStateDicts",
"WanS2VStateDicts",
"ACEStateDicts",
"QwenImageStateDicts",
"ControlType",
"ControlNetParams",
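With these exports added, the new ACE-Step config and state-dict classes are importable from the package namespace. A minimal usage sketch:

from diffsynth_engine.configs import ACEStepPipelineConfig, ACEStateDicts

print(ACEStepPipelineConfig.__name__, ACEStateDicts.__name__)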
24 changes: 24 additions & 0 deletions diffsynth_engine/configs/pipeline.py
@@ -266,6 +266,22 @@ class HunyuanPipelineConfig(BaseConfig):
image_encoder_dtype: torch.dtype = torch.float16


@dataclass
class ACEStepPipelineConfig(AttentionConfig, OptimizationConfig, BaseConfig):
model_path: str | os.PathLike | List[str | os.PathLike]
model_dtype: torch.dtype = torch.bfloat16
dcae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
vocoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
vae_dtype: torch.dtype = torch.bfloat16
t5_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
t5_dtype: torch.dtype = torch.bfloat16

# default params set by model type
shift: Optional[float] = field(default=None, init=False) # RecifitedFlowScheduler shift factor
cfg_scale: Optional[float | Tuple[float, float]] = field(default=None, init=False) # default CFG scale
num_inference_steps: Optional[int] = field(default=None, init=False) # default inference steps


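A sketch of constructing the new config (paths are placeholders, not files shipped with the repo). Note that shift, cfg_scale, and num_inference_steps are declared with init=False, so callers do not pass them; they are filled in later from model-type defaults:

import torch
from diffsynth_engine.configs import ACEStepPipelineConfig

config = ACEStepPipelineConfig(
    model_path="/path/to/ace_step_dit.safetensors",          # placeholder path
    t5_path="/path/to/ace_step_text_encoder.safetensors",    # placeholder path
    model_dtype=torch.bfloat16,
)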
@dataclass
class BaseStateDicts:
pass
@@ -310,6 +326,14 @@ class WanS2VStateDicts:
audio_encoder: Dict[str, torch.Tensor]


@dataclass
class ACEStateDicts:
model: Dict[str, torch.Tensor]
t5: Dict[str, torch.Tensor]
dcae: Dict[str, torch.Tensor]
vocoder: Dict[str, torch.Tensor]


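A sketch of assembling the four ACE-Step state dicts before building a pipeline (file names are placeholders; safetensors.torch.load_file is a standard API, but this diff does not show how the pipeline itself loads weights):

from safetensors.torch import load_file
from diffsynth_engine.configs import ACEStateDicts

state_dicts = ACEStateDicts(
    model=load_file("ace_step_dit.safetensors"),  # placeholder file names
    t5=load_file("text_encoder.safetensors"),
    dcae=load_file("dcae.safetensors"),
    vocoder=load_file("vocoder.safetensors"),
)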
@dataclass
class QwenImageStateDicts:
model: Dict[str, torch.Tensor]