
Commit 2f62bc2

feat: initial commit with Dockerfile, patches, and tools
- Add Dockerfile for containerized environment setup
- Include initial patches directory for source modifications
- Add tools for build, test, or deployment workflows
1 parent 49d609c commit 2f62bc2

6 files changed, +6286 -0 lines changed

vllm/docker/Dockerfile

Lines changed: 107 additions & 0 deletions
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# ======== Base Stage ========
FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu24.04 AS vllm-base

ARG https_proxy
ARG http_proxy

# Add Intel oneAPI repo and PPA for GPU support
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
    add-apt-repository -y ppa:kobuk-team/intel-graphics-testing

# Install dependencies and Python 3.10
RUN apt-get update -y && \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update -y && \
    apt-get install -y python3.10 python3.10-distutils && \
    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
    apt-get install -y --no-install-recommends --fix-missing \
        curl \
        ffmpeg \
        git \
        libsndfile1 \
        libsm6 \
        libxext6 \
        libgl1 \
        lsb-release \
        numactl \
        wget \
        vim \
        linux-libc-dev && \
    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
    # Install Intel GPU runtime packages
    apt-get update -y && \
    apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

WORKDIR /llm
COPY ./patches/vllm_for_multi_arc.patch /tmp/
COPY ./patches/0001-oneccl-align-global-V0.1.1.patch /tmp/

# Set environment variables early
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
ENV VLLM_TARGET_DEVICE=xpu
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

# Clone and patch vLLM
RUN git clone -b v0.8.3 https://github.com/vllm-project/vllm.git && \
    cd vllm && \
    git apply /tmp/vllm_for_multi_arc.patch && \
    pip install --no-cache-dir -r requirements/xpu.txt && \
    python3 setup.py install

# ======== Add oneCCL Build ========
RUN apt-get update && apt-get install -y \
        cmake \
        g++ \
    && rm -rf /var/lib/apt/lists/*

# Build oneCCL
RUN git clone https://github.com/oneapi-src/oneCCL.git && \
    cd oneCCL && \
    git checkout def870543749186b6f38cdc865b44d52174c7492 && \
    git apply /tmp/0001-oneccl-align-global-V0.1.1.patch && \
    mkdir build && cd build && \
    export IGC_VISAOptions=-activeThreadsOnlyBarrier && \
    /usr/bin/cmake .. \
        -DCMAKE_INSTALL_PREFIX=_install \
        -DCMAKE_C_COMPILER=icx \
        -DCMAKE_CXX_COMPILER=icpx \
        -DCOMPUTE_BACKEND=dpcpp \
        -DCCL_ENABLE_ARCB=1 && \
    make -j && make install && \
    mv _install /opt/intel/oneapi/ccl/2021.15.3 && \
    cd /opt/intel/oneapi/ccl/ && \
    ln -snf 2021.15.3 latest && \
    source /opt/intel/oneapi/setvars.sh --force

WORKDIR /llm/vllm

# Clean up patch files
RUN rm -rf /tmp/*

CMD ["/bin/bash"]


# ======== OpenAI Serving Stage ========
FROM vllm-base AS vllm-openai

ARG http_proxy
ARG https_proxy

# Install additional dependencies for the OpenAI API server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer 'modelscope!=1.15.0'

# Set additional environment variables for production usage
ENV VLLM_USAGE_SOURCE=production-docker-image
ENV TRITON_XPU_PROFILE=1

# Install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
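
For reference, a minimal build-and-run sketch for this Dockerfile. The image tag, the model placeholder, and the port mapping are illustrative assumptions, not part of this commit; the trailing arguments are forwarded to vllm.entrypoints.openai.api_server by the ENTRYPOINT above.

# Build the OpenAI serving stage; the proxy build args are optional.
docker build \
    --build-arg http_proxy=$http_proxy \
    --build-arg https_proxy=$https_proxy \
    --target vllm-openai \
    -t vllm-xpu-openai \
    vllm/docker

# Run the server; --device /dev/dri exposes the Intel GPUs to the container.
docker run --rm -it \
    --device /dev/dri \
    -p 8000:8000 \
    vllm-xpu-openai \
    --model <model-id> --host 0.0.0.0 --port 8000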
vllm/docker/patches/0001-oneccl-align-global-V0.1.1.patch

Lines changed: 125 additions & 0 deletions
From 7f7a3d65541828d9889bfdec799bc23339e8e520 Mon Sep 17 00:00:00 2001
From: YongZhuIntel <yong.zhu@intel.com>
Date: Wed, 21 May 2025 09:37:06 +0800
Subject: [PATCH] oneccl align global V0.1.1

base on public branch release/ccl_2021.15.3-arc(def870543749186b6f38cdc865b44d52174c7492)

Build:
1. mkdir build; cd build
2. source /opt/intel/oneapi/setvars.sh
3. export IGC_VISAOptions=-activeThreadsOnlyBarrier
4. cmake .. -DCMAKE_INSTALL_PREFIX=_install -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp -DCCL_ENABLE_ARCB=1 && make -j && make install

print bandwidth in benchmark
---
 examples/benchmark/include/benchmark.hpp | 40 +++++++++++++++++++++---
 examples/benchmark/src/benchmark.cpp     |  7 +++--
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
index 08a3625..bff6275 100644
--- a/examples/benchmark/include/benchmark.hpp
+++ b/examples/benchmark/include/benchmark.hpp
@@ -377,7 +377,9 @@ void store_to_csv(const user_options_t& options,
                   double max_time,
                   double avg_time,
                   double stddev,
-                  double wait_avg_time) {
+                  double wait_avg_time,
+                  double algbw,
+                  double busbw) {
     std::ofstream csvf;
     csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::app);

@@ -396,7 +398,7 @@ void store_to_csv(const user_options_t& options,
              << "," << ccl::get_datatype_size(dtype) << "," << elem_count << ","
              << ccl::get_datatype_size(dtype) * elem_count << "," << buf_count << ","
              << iter_count << "," << min_time << "," << max_time << "," << avg_time << ","
-             << stddev << "," << wait_avg_time << std::endl;
+             << stddev << "," << wait_avg_time << "," << algbw << "," << busbw << std::endl;
     }
     csvf.close();
 }
@@ -472,13 +474,41 @@ void print_timings(const ccl::communicator& comm,
     max_time /= iter_count;

     size_t bytes = elem_count * ccl::get_datatype_size(dtype) * buf_count;
+
+    double algbw = bytes*1000/total_avg_time/1024/1024;
+
+    if (ncolls == 1) {
+        if (options.coll_names.front() == "allgather" ||
+            options.coll_names.front() == "allgatherv" ||
+            options.coll_names.front() == "reducescatter" ||
+            options.coll_names.front() == "alltoall" ||
+            options.coll_names.front() == "alltoallv") {
+            algbw = algbw * nranks;
+        }
+    }
+
+    double busbw = algbw;
+    if (ncolls == 1) {
+        if (options.coll_names.front() == "allreduce") {
+            busbw = algbw * 2 * (nranks -1) / nranks;
+        } else if (options.coll_names.front() == "allgather" ||
+                   options.coll_names.front() == "allgatherv" ||
+                   options.coll_names.front() == "reducescatter" ||
+                   options.coll_names.front() == "alltoall" ||
+                   options.coll_names.front() == "alltoallv") {
+            busbw = algbw * (nranks -1) / nranks;
+        }
+    }
+
     std::stringstream ss;
     ss << std::right << std::fixed << std::setw(COL_WIDTH) << bytes << std::setw(COL_WIDTH)
        << elem_count * buf_count << std::setw(COL_WIDTH) << iter_count << std::setw(COL_WIDTH)
        << std::setprecision(COL_PRECISION) << min_time << std::setw(COL_WIDTH)
        << std::setprecision(COL_PRECISION) << max_time << std::setw(COL_WIDTH)
        << std::setprecision(COL_PRECISION) << total_avg_time << std::setw(COL_WIDTH - 3)
-       << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH + 3);
+       << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH)
+       << std::setprecision(COL_PRECISION) << algbw << std::setw(COL_WIDTH)
+       << std::setprecision(COL_PRECISION) << busbw << std::setw(COL_WIDTH + 3);

     if (show_extened_info(options.show_additional_info)) {
         ss << std::right << std::fixed << std::setprecision(COL_PRECISION) << wait_avg_time;
@@ -497,7 +527,9 @@ void print_timings(const ccl::communicator& comm,
                      max_time,
                      total_avg_time,
                      stddev,
-                     wait_avg_time);
+                     wait_avg_time,
+                     algbw,
+                     busbw);
     }
 }

diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp
index d90fb9b..78957f2 100644
--- a/examples/benchmark/src/benchmark.cpp
+++ b/examples/benchmark/src/benchmark.cpp
@@ -105,7 +105,8 @@ void run(ccl::communicator& service_comm,
            << "#elem_count" << std::setw(COL_WIDTH) << "#repetitions"
            << std::setw(COL_WIDTH) << "t_min[usec]" << std::setw(COL_WIDTH) << "t_max[usec]"
            << std::setw(COL_WIDTH) << "t_avg[usec]" << std::setw(COL_WIDTH - 3)
-           << "stddev[%]";
+           << "stddev[%]" << std::setw(COL_WIDTH) << "algbw[GB/s]" << std::setw(COL_WIDTH)
+           << "busbw[GB/s]";

     if (show_extened_info(options.show_additional_info)) {
         ss << std::right << std::setw(COL_WIDTH + 3) << "wait_t_avg[usec]";
@@ -435,7 +436,9 @@ int main(int argc, char* argv[]) {
              << "t_max[usec],"
              << "t_avg[usec],"
              << "stddev[%],"
-             << "wait_t_avg[usec]" << std::endl;
+             << "wait_t_avg[usec],"
+             << "algbw[GB/s],"
+             << "busbw[GB/s]" << std::endl;
         csvf.close();
     }

--
2.25.1
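
The algbw/busbw columns this patch adds follow the NCCL-style bandwidth convention, which the new code in print_timings implements directly: with the average time in microseconds,

\[
\mathrm{algbw} = \frac{\mathrm{bytes} \times 1000}{t_{\mathrm{avg}} \cdot 1024 \cdot 1024}\ \text{GB/s},
\qquad
\mathrm{busbw} = \mathrm{algbw} \cdot
\begin{cases}
\dfrac{2(n-1)}{n} & \text{allreduce} \\[6pt]
\dfrac{n-1}{n} & \text{allgather(v), reducescatter, alltoall(v)}
\end{cases}
\]

where n is nranks; for the allgather/alltoall family, algbw is first multiplied by n because every rank contributes its own buffer. As a worked check (my arithmetic, not output from the patch): an allreduce across n = 8 ranks reports busbw = 2(8-1)/8 · algbw = 1.75 · algbw.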
Lines changed: 118 additions & 0 deletions
From 592d83da1e8d00b4436b51ccf17e7d0bc9564e24 Mon Sep 17 00:00:00 2001
From: jilongW <109333127+jilongW@users.noreply.github.com>
Date: Fri, 6 Jun 2025 14:27:43 +0800
Subject: [PATCH] add compare (#1)

* add comparision

* update compare

* remove extra file
---
 .../oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp | 66 ++++++++++++++++---
 1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
index 427505b7..989c00ad 100644
--- a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
+++ b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
@@ -41,17 +41,55 @@ bool test(queue &Q, int M, int N, int K)
     constexpr int rd_size = 1048576;
     std::vector<T> host_vector(rd_size);
     auto host_data = host_vector.data();
-
+    std::vector<T> correct_host_vector(rd_size);
+    auto correct_host_data = correct_host_vector.data();
     /* Measure time for a given number of GEMM calls */
-    auto time_gemms = [=, &Q](int runs) -> double {
+    bool verify = false;
+    auto time_gemms = [=, &Q, &host_data](int runs, bool verify=false) -> std::tuple<double, int> {
         using namespace oneapi::mkl;
         using namespace std::chrono;
         auto start = steady_clock::now();
-        for (int i = 0; i < runs; i++)
+        int ok = 0;
+        if (verify == false){
+            for (int i = 0; i < runs; i++)
+                blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc);
+            Q.wait_and_throw();
+            auto end = steady_clock::now();
+            return std::make_tuple(duration<double>(end - start).count(), ok);
+        }
+        else{
+            size_t elems = std::min(ldc * N, rd_size);
+
             blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc);
-        Q.wait_and_throw();
-        auto end = steady_clock::now();
-        return duration<double>(end - start).count();
+            Q.wait_and_throw();
+            Q.copy(C, correct_host_data, elems).wait();
+            auto end = steady_clock::now();
+            auto used_time = duration<double>(end - start).count();
+
+            // correct_host_data[0] += 1.0;
+            for (int i = 1; i < runs; i++){
+                start = steady_clock::now();
+                blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc);
+                Q.wait_and_throw();
+                end = steady_clock::now();
+                used_time += duration<double>(end - start).count();
+                Q.copy(C, host_data, elems).wait();
+                int linear_id = 0;
+                for (size_t j = 0; j < N; j++) {
+                    for (size_t k = 0; k < M; k++) {
+                        linear_id = j*ldc + k;
+                        if (linear_id >= elems) break;
+                        if (host_data[linear_id] != correct_host_data[linear_id]) {
+                            ok = i;
+                            return std::make_tuple(duration<double>(end - start).count(), ok);
+                        }
+                    }
+                    if (linear_id >= elems) break;
+                }
+
+            }
+            return std::make_tuple(used_time, ok);
+        }
     };

     /* Fill A/B with all ones to verify correctness */
@@ -91,13 +129,15 @@ bool test(queue &Q, int M, int N, int K)

     /* Time one GEMM call, and estimate how many calls will be required to keep the
      * GPU busy for 1s. */
-    auto tare = time_gemms(1);
+    auto [tare, _] = time_gemms(1, true);
     int ncalls = std::max(4, std::min(1000, int(1. / tare)));

     /* Time that many GEMMs, subtracting the first call time to remove host overhead.
      * This gives a better idea of device performance. */
     std::cout << " -> Timing...\n";
-    auto time = time_gemms(ncalls + 1) - tare;
+    auto [time, result] = time_gemms(ncalls + 1, true);
+    time -= tare;
+
     auto avg = time / ncalls;

     /* Calculate and display performance */
@@ -114,8 +154,14 @@ bool test(queue &Q, int M, int N, int K)
         flops *= 1e-3;
         unit = 'P';
     }
-
-    std::cout << "\nAverage performance: " << flops << unit << 'F' << "\n";
+    if (result != 0){
+        std::cout << "gemm FAILS" << " for type: " << type_string<T>() << " on " << result <<" times run!"<< "\n";
+    }
+    else{
+        std::cout << "gemm Passes" << " for type: " << type_string<T>() << "!\n";
+        std::cout << "\nAverage performance: " << flops << unit << 'F' << "\n";
+    }
+

     /* Free data */
     free(C, Q);
--
2.34.1
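
This patch turns the sample's timing loop into a verification loop: the first GEMM's device output is copied back as the reference (correct_host_data), every subsequent run is compared element-wise against it, and a mismatch reports the failing iteration instead of the perf line. A hedged sketch of exercising that path; the compile command is an assumption about the oneAPI-samples layout (the sample also ships its own build files), not something this commit specifies:

# Assumes an initialized oneAPI environment with icpx and oneMKL available.
source /opt/intel/oneapi/setvars.sh
cd Libraries/oneMKL/matrix_mul_mkl
icpx -fsycl -qmkl matrix_mul_mkl.cpp -o matrix_mul_mkl    # -qmkl links against oneMKL
./matrix_mul_mkl    # with the patch, prints "gemm Passes"/"gemm FAILS" per type alongside the perf numbers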
