z-image-generation/image_generation.py at master · KatherLab/z-image-generation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import argparse
import torch
from diffusers import ZImagePipeline


def main():
    parser = argparse.ArgumentParser(description="Generate images using Z-Image-Turbo")
    parser.add_argument(
        "--prompt",
        "-p",
        type=str,
        default="Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights.",
        help="Text prompt for image generation"
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default="/tim/data/z-image-turbo/output.png",
        help="Output filename (default: output.png)"
    )
    parser.add_argument(
        "--width",
        "-w",
        type=int,
        default=1024,
        help="Image width (default: 1024)"
    )
    parser.add_argument(
        "--height",
        type=int,
        default=1024,
        help="Image height (default: 1024)"
    )
    parser.add_argument(
        "--steps",
        "-s",
        type=int,
        default=9,
        help="Number of inference steps (default: 9)"
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducibility (default: 42)"
    )

    args = parser.parse_args()

    # 1. Load the pipeline
    # Use bfloat16 for optimal performance on supported GPUs
    pipe = ZImagePipeline.from_pretrained(
        "Tongyi-MAI/Z-Image-Turbo",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=False,
    )
    pipe.to("cuda")

    # [Optional] Attention Backend
    # Diffusers uses SDPA by default. Switch to Flash Attention for better efficiency if supported:
    # pipe.transformer.set_attention_backend("flash")    # Enable Flash-Attention-2
    # pipe.transformer.set_attention_backend("_flash_3") # Enable Flash-Attention-3

    # [Optional] Model Compilation
    # Compiling the DiT model accelerates inference, but the first run will take longer to compile.
    # pipe.transformer.compile()

    # [Optional] CPU Offloading
    # Enable CPU offloading for memory-constrained devices.
    # pipe.enable_model_cpu_offload()

    # 2. Generate Image
    print(f"Generating image with prompt: {args.prompt}")
    image = pipe(
        prompt=args.prompt,
        height=args.height,
        width=args.width,
        num_inference_steps=args.steps,
        guidance_scale=0.0,  # Guidance should be 0 for the Turbo models
        generator=torch.Generator("cuda").manual_seed(args.seed),
    ).images[0]

    image.save(args.output)
    print(f"Image saved to: {args.output}")


if __name__ == "__main__":
    main()