Skip to content

Commit bc20a9b

Browse files
authored
Initiate Wan2.2 support (#28)
* init * fix * refine
1 parent 72f6e1a commit bc20a9b

File tree

5 files changed

+994
-0
lines changed

5 files changed

+994
-0
lines changed

visual-ai/Wan2.2/README.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Docker setup
2+
3+
Build docker image:
4+
5+
```bash
6+
bash build.sh
7+
```
8+
9+
Run docker image:
10+
11+
```bash
12+
export DOCKER_IMAGE=llm-scaler-visualai:latest-wan2.2
13+
export CONTAINER_NAME=wan-2.2
14+
export MODEL_DIR=<your_model_dir>
15+
sudo docker run -itd \
16+
--privileged \
17+
--net=host \
18+
--device=/dev/dri \
19+
-e no_proxy=localhost,127.0.0.1 \
20+
--name=$CONTAINER_NAME \
21+
-v $MODEL_DIR:/llm/models/ \
22+
--shm-size="16g" \
23+
--entrypoint=/bin/bash \
24+
$DOCKER_IMAGE
25+
26+
docker exec -it $CONTAINER_NAME bash
27+
```
28+
29+
Run the Wan 2.2 demo on a single B60 GPU:
30+
```bash
31+
python3 generate.py --task ti2v-5B --size 1280*704 --ckpt_dir /llm/models/Wan2.2-TI2V-5B/ --offload_model True --t5_cpu --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." --convert_model_dtype --frame_num 101 --sample_steps 50
32+
```
33+
34+
Run the Wan 2.2 demo on two B60 GPUs:
35+
```bash
36+
torchrun --nproc_per_node=2 generate.py --task ti2v-5B --size 1280*704 --ckpt_dir /llm/models/Wan2.2-TI2V-5B/ --ulysses_size 2 --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." --offload_model True --t5_cpu --convert_model_dtype --frame_num 101 --sample_steps 50
37+
```

visual-ai/Wan2.2/build.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
set -x
2+
3+
export HTTP_PROXY=<your_http_proxy>
4+
export HTTPS_PROXY=<your_https_proxy>
5+
6+
# Build the Wan2.2 image. The tag must match the DOCKER_IMAGE value used in
# README.md (llm-scaler-visualai:latest-wan2.2), otherwise `docker run` cannot find it.
docker build -f ./docker/Dockerfile . -t llm-scaler-visualai:latest-wan2.2 --build-arg https_proxy="$HTTPS_PROXY" --build-arg http_proxy="$HTTP_PROXY"

visual-ai/Wan2.2/docker/Dockerfile

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# ======== Base Stage ========
5+
FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu24.04 AS vllm-base
6+
7+
ARG https_proxy
8+
ARG http_proxy
9+
10+
# Add Intel oneAPI repo and PPA for GPU support
11+
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
12+
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
13+
add-apt-repository -y ppa:kobuk-team/intel-graphics-testing
14+
15+
# Install dependencies and Python 3.10
16+
RUN apt-get update -y && \
17+
apt-get install -y software-properties-common && \
18+
add-apt-repository ppa:deadsnakes/ppa && \
19+
apt-get update -y && \
20+
apt-get install -y python3.10 python3.10-distutils python3.10-dev && \
21+
curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
22+
apt-get install -y --no-install-recommends --fix-missing \
23+
curl \
24+
ffmpeg \
25+
git \
26+
libsndfile1 \
27+
libsm6 \
28+
libxext6 \
29+
libgl1 \
30+
lsb-release \
31+
numactl \
32+
wget \
33+
vim \
34+
linux-libc-dev && \
35+
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
36+
# Install Intel GPU runtime packages
37+
apt-get update -y && \
38+
apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing && \
39+
apt-get install -y intel-oneapi-dpcpp-ct=2025.0.1-17 && \
40+
apt-get clean && rm -rf /var/lib/apt/lists/*
41+
42+
# pin compute runtime version
43+
RUN mkdir /tmp/neo && \
44+
cd /tmp/neo && \
45+
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.12.5/intel-igc-core-2_2.12.5+19302_amd64.deb && \
46+
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.12.5/intel-igc-opencl-2_2.12.5+19302_amd64.deb && \
47+
wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-ocloc-dbgsym_25.22.33944.8-0_amd64.ddeb && \
48+
wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-ocloc_25.22.33944.8-0_amd64.deb && \
49+
wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-opencl-icd-dbgsym_25.22.33944.8-0_amd64.ddeb && \
50+
wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/intel-opencl-icd_25.22.33944.8-0_amd64.deb && \
51+
wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/libigdgmm12_22.7.0_amd64.deb && \
52+
wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/libze-intel-gpu1-dbgsym_25.22.33944.8-0_amd64.ddeb && \
53+
wget https://github.com/intel/compute-runtime/releases/download/25.22.33944.8/libze-intel-gpu1_25.22.33944.8-0_amd64.deb && \
54+
dpkg -i *.deb
55+
56+
WORKDIR /llm
57+
COPY ./patches/wan22_for_multi_arc.patch /tmp/
58+
COPY ./patches/0001-oneccl-align-global-V0.1.1.patch /tmp/
59+
60+
# Set environment variables early
61+
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
62+
63+
# ======= Add oneCCL build =======
64+
RUN apt-get update && apt-get install -y \
65+
cmake \
66+
g++ \
67+
&& rm -rf /var/lib/apt/lists/*
68+
69+
# Build oneCCL
70+
RUN git clone https://github.com/oneapi-src/oneCCL.git && \
71+
cd oneCCL && \
72+
git checkout def870543749186b6f38cdc865b44d52174c7492 && \
73+
git apply /tmp/0001-oneccl-align-global-V0.1.1.patch && \
74+
mkdir build && cd build && \
75+
export IGC_VISAOptions=-activeThreadsOnlyBarrier && \
76+
/usr/bin/cmake .. \
77+
-DCMAKE_INSTALL_PREFIX=_install \
78+
-DCMAKE_C_COMPILER=icx \
79+
-DCMAKE_CXX_COMPILER=icpx \
80+
-DCOMPUTE_BACKEND=dpcpp \
81+
-DCCL_ENABLE_ARCB=1 && \
82+
make -j && make install && \
83+
mv _install /opt/intel/oneapi/ccl/2021.15.3 && \
84+
cd /opt/intel/oneapi/ccl/ && \
85+
ln -snf 2021.15.3 latest
86+
87+
# Configure environment to source oneAPI
88+
RUN echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
89+
90+
SHELL ["bash", "-c"]
91+
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
92+
93+
ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib/python3.10/dist-packages/torch/lib:$LD_LIBRARY_PATH"
94+
95+
RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu && \
96+
pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
97+
pip install bigdl-core-xe-all==2.6.0 --extra-index-url https://download.pytorch.org/whl/xpu && \
98+
apt remove python3-blinker -y
99+
100+
RUN cd /llm && \
101+
git clone https://github.com/Wan-Video/Wan2.2.git && \
102+
cd ./Wan2.2 && \
103+
git checkout 031a9be56cec91e86d140d3d3a74280fb05a9b1c && \
104+
git apply /tmp/wan22_for_multi_arc.patch && \
105+
pip install -r requirements.txt && \
106+
pip install einops && \
107+
pip install cffi
108+
109+
WORKDIR /llm/Wan2.2
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
From 7f7a3d65541828d9889bfdec799bc23339e8e520 Mon Sep 17 00:00:00 2001
2+
From: YongZhuIntel <yong.zhu@intel.com>
3+
Date: Wed, 21 May 2025 09:37:06 +0800
4+
Subject: [PATCH] oneccl align global V0.1.1
5+
6+
base on public branch release/ccl_2021.15.3-arc(def870543749186b6f38cdc865b44d52174c7492)
7+
8+
Build:
9+
1. mkdir build; cd build
10+
2. source /opt/intel/oneapi/setvars.sh
11+
3. export IGC_VISAOptions=-activeThreadsOnlyBarrier
12+
4. cmake .. -DCMAKE_INSTALL_PREFIX=_install -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp -DCCL_ENABLE_ARCB=1 && make -j && make install
13+
14+
print bandwidth in benchmark
15+
---
16+
examples/benchmark/include/benchmark.hpp | 40 +++++++++++++++++++++---
17+
examples/benchmark/src/benchmark.cpp | 7 +++--
18+
2 files changed, 41 insertions(+), 6 deletions(-)
19+
20+
diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
21+
index 08a3625..bff6275 100644
22+
--- a/examples/benchmark/include/benchmark.hpp
23+
+++ b/examples/benchmark/include/benchmark.hpp
24+
@@ -377,7 +377,9 @@ void store_to_csv(const user_options_t& options,
25+
double max_time,
26+
double avg_time,
27+
double stddev,
28+
- double wait_avg_time) {
29+
+ double wait_avg_time,
30+
+ double algbw,
31+
+ double busbw) {
32+
std::ofstream csvf;
33+
csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::app);
34+
35+
@@ -396,7 +398,7 @@ void store_to_csv(const user_options_t& options,
36+
<< "," << ccl::get_datatype_size(dtype) << "," << elem_count << ","
37+
<< ccl::get_datatype_size(dtype) * elem_count << "," << buf_count << ","
38+
<< iter_count << "," << min_time << "," << max_time << "," << avg_time << ","
39+
- << stddev << "," << wait_avg_time << std::endl;
40+
+ << stddev << "," << wait_avg_time << "," << algbw << "," << busbw << std::endl;
41+
}
42+
csvf.close();
43+
}
44+
@@ -472,13 +474,41 @@ void print_timings(const ccl::communicator& comm,
45+
max_time /= iter_count;
46+
47+
size_t bytes = elem_count * ccl::get_datatype_size(dtype) * buf_count;
48+
+
49+
+ double algbw = bytes*1000/total_avg_time/1024/1024;
50+
+
51+
+ if (ncolls == 1) {
52+
+ if (options.coll_names.front() == "allgather" ||
53+
+ options.coll_names.front() == "allgatherv" ||
54+
+ options.coll_names.front() == "reducescatter" ||
55+
+ options.coll_names.front() == "alltoall" ||
56+
+ options.coll_names.front() == "alltoallv") {
57+
+ algbw = algbw * nranks;
58+
+ }
59+
+ }
60+
+
61+
+ double busbw = algbw;
62+
+ if (ncolls == 1) {
63+
+ if (options.coll_names.front() == "allreduce") {
64+
+ busbw = algbw * 2 * (nranks -1) / nranks;
65+
+ } else if (options.coll_names.front() == "allgather" ||
66+
+ options.coll_names.front() == "allgatherv" ||
67+
+ options.coll_names.front() == "reducescatter" ||
68+
+ options.coll_names.front() == "alltoall" ||
69+
+ options.coll_names.front() == "alltoallv") {
70+
+ busbw = algbw * (nranks -1) / nranks;
71+
+ }
72+
+ }
73+
+
74+
std::stringstream ss;
75+
ss << std::right << std::fixed << std::setw(COL_WIDTH) << bytes << std::setw(COL_WIDTH)
76+
<< elem_count * buf_count << std::setw(COL_WIDTH) << iter_count << std::setw(COL_WIDTH)
77+
<< std::setprecision(COL_PRECISION) << min_time << std::setw(COL_WIDTH)
78+
<< std::setprecision(COL_PRECISION) << max_time << std::setw(COL_WIDTH)
79+
<< std::setprecision(COL_PRECISION) << total_avg_time << std::setw(COL_WIDTH - 3)
80+
- << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH + 3);
81+
+ << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH)
82+
+ << std::setprecision(COL_PRECISION) << algbw << std::setw(COL_WIDTH)
83+
+ << std::setprecision(COL_PRECISION) << busbw << std::setw(COL_WIDTH + 3);
84+
85+
if (show_extened_info(options.show_additional_info)) {
86+
ss << std::right << std::fixed << std::setprecision(COL_PRECISION) << wait_avg_time;
87+
@@ -497,7 +527,9 @@ void print_timings(const ccl::communicator& comm,
88+
max_time,
89+
total_avg_time,
90+
stddev,
91+
- wait_avg_time);
92+
+ wait_avg_time,
93+
+ algbw,
94+
+ busbw);
95+
}
96+
}
97+
98+
diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp
99+
index d90fb9b..78957f2 100644
100+
--- a/examples/benchmark/src/benchmark.cpp
101+
+++ b/examples/benchmark/src/benchmark.cpp
102+
@@ -105,7 +105,8 @@ void run(ccl::communicator& service_comm,
103+
<< "#elem_count" << std::setw(COL_WIDTH) << "#repetitions"
104+
<< std::setw(COL_WIDTH) << "t_min[usec]" << std::setw(COL_WIDTH) << "t_max[usec]"
105+
<< std::setw(COL_WIDTH) << "t_avg[usec]" << std::setw(COL_WIDTH - 3)
106+
- << "stddev[%]";
107+
+ << "stddev[%]" << std::setw(COL_WIDTH) << "algbw[GB/s]" << std::setw(COL_WIDTH)
108+
+ << "busbw[GB/s]";
109+
110+
if (show_extened_info(options.show_additional_info)) {
111+
ss << std::right << std::setw(COL_WIDTH + 3) << "wait_t_avg[usec]";
112+
@@ -435,7 +436,9 @@ int main(int argc, char* argv[]) {
113+
<< "t_max[usec],"
114+
<< "t_avg[usec],"
115+
<< "stddev[%],"
116+
- << "wait_t_avg[usec]" << std::endl;
117+
+ << "wait_t_avg[usec],"
118+
+ << "algbw[GB/s],"
119+
+ << "busbw[GB/s]" << std::endl;
120+
csvf.close();
121+
}
122+
123+
--
124+
2.25.1
125+

0 commit comments

Comments
 (0)