
Commit 2f62bc2

feat: initial commit with Dockerfile, patches, and tools
- Add Dockerfile for containerized environment setup
- Include initial patches directory for source modifications
- Add tools for build, test, or deployment workflows
1 parent 49d609c commit 2f62bc2

6 files changed, +6286 -0 lines changed

vllm/docker/Dockerfile

Lines changed: 107 additions & 0 deletions
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# ======== Base Stage ========
FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu24.04 AS vllm-base

ARG https_proxy
ARG http_proxy

# Add Intel oneAPI repo and PPA for GPU support
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
    add-apt-repository -y ppa:kobuk-team/intel-graphics-testing

# Install dependencies and Python 3.10
RUN apt-get update -y && \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update -y && \
    apt-get install -y python3.10 python3.10-distutils && \
    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
    apt-get install -y --no-install-recommends --fix-missing \
        curl \
        ffmpeg \
        git \
        libsndfile1 \
        libsm6 \
        libxext6 \
        libgl1 \
        lsb-release \
        numactl \
        wget \
        vim \
        linux-libc-dev && \
    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
    # Install Intel GPU runtime packages
    apt-get update -y && \
    apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

WORKDIR /llm
COPY ./patches/vllm_for_multi_arc.patch /tmp/
COPY ./patches/0001-oneccl-align-global-V0.1.1.patch /tmp/

# Set environment variables early
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
ENV VLLM_TARGET_DEVICE=xpu
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

# Clone and patch vLLM
RUN git clone -b v0.8.3 https://github.com/vllm-project/vllm.git && \
    cd vllm && \
    git apply /tmp/vllm_for_multi_arc.patch && \
    pip install --no-cache-dir -r requirements/xpu.txt && \
    python3 setup.py install

# ======== Add oneCCL Build ========
RUN apt-get update && apt-get install -y \
        cmake \
        g++ \
    && rm -rf /var/lib/apt/lists/*

# Build oneCCL
RUN git clone https://github.com/oneapi-src/oneCCL.git && \
    cd oneCCL && \
    git checkout def870543749186b6f38cdc865b44d52174c7492 && \
    git apply /tmp/0001-oneccl-align-global-V0.1.1.patch && \
    mkdir build && cd build && \
    export IGC_VISAOptions=-activeThreadsOnlyBarrier && \
    /usr/bin/cmake .. \
        -DCMAKE_INSTALL_PREFIX=_install \
        -DCMAKE_C_COMPILER=icx \
        -DCMAKE_CXX_COMPILER=icpx \
        -DCOMPUTE_BACKEND=dpcpp \
        -DCCL_ENABLE_ARCB=1 && \
    make -j && make install && \
    mv _install /opt/intel/oneapi/ccl/2021.15.3 && \
    cd /opt/intel/oneapi/ccl/ && \
    ln -snf 2021.15.3 latest && \
    source /opt/intel/oneapi/setvars.sh --force

WORKDIR /llm/vllm

# Clean up patch files
RUN rm -rf /tmp/*

CMD ["/bin/bash"]


# ======== OpenAI Serving Stage ========
FROM vllm-base AS vllm-openai

ARG http_proxy
ARG https_proxy

# Install additional dependencies for the OpenAI API server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer 'modelscope!=1.15.0'

# Set additional environment variables for production usage
ENV VLLM_USAGE_SOURCE=production-docker-image
ENV TRITON_XPU_PROFILE=1

# Install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
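
For reference, a minimal build-and-run sketch for this Dockerfile. The image tag, the model placeholder, and the port mapping are illustrative assumptions, not part of this commit; the trailing arguments are forwarded to vllm.entrypoints.openai.api_server by the ENTRYPOINT above.

# Build the OpenAI serving stage; the proxy build args are optional.
docker build \
    --build-arg http_proxy=$http_proxy \
    --build-arg https_proxy=$https_proxy \
    --target vllm-openai \
    -t vllm-xpu-openai \
    vllm/docker

# Run the server; --device /dev/dri exposes the Intel GPUs to the container.
docker run --rm -it \
    --device /dev/dri \
    -p 8000:8000 \
    vllm-xpu-openai \
    --model <model-id> --host 0.0.0.0 --port 8000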
vllm/docker/patches/0001-oneccl-align-global-V0.1.1.patch

Lines changed: 125 additions & 0 deletions
From 7f7a3d65541828d9889bfdec799bc23339e8e520 Mon Sep 17 00:00:00 2001
From: YongZhuIntel <yong.zhu@intel.com>
Date: Wed, 21 May 2025 09:37:06 +0800
Subject: [PATCH] oneccl align global V0.1.1

base on public branch release/ccl_2021.15.3-arc(def870543749186b6f38cdc865b44d52174c7492)

Build:
1. mkdir build; cd build
2. source /opt/intel/oneapi/setvars.sh
3. export IGC_VISAOptions=-activeThreadsOnlyBarrier
4. cmake .. -DCMAKE_INSTALL_PREFIX=_install -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp -DCCL_ENABLE_ARCB=1 && make -j && make install

print bandwidth in benchmark
---
 examples/benchmark/include/benchmark.hpp | 40 +++++++++++++++++++++---
 examples/benchmark/src/benchmark.cpp     |  7 +++--
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
index 08a3625..bff6275 100644
--- a/examples/benchmark/include/benchmark.hpp
+++ b/examples/benchmark/include/benchmark.hpp
@@ -377,7 +377,9 @@ void store_to_csv(const user_options_t& options,
                   double max_time,
                   double avg_time,
                   double stddev,
-                  double wait_avg_time) {
+                  double wait_avg_time,
+                  double algbw,
+                  double busbw) {
     std::ofstream csvf;
     csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::app);

@@ -396,7 +398,7 @@ void store_to_csv(const user_options_t& options,
              << "," << ccl::get_datatype_size(dtype) << "," << elem_count << ","
              << ccl::get_datatype_size(dtype) * elem_count << "," << buf_count << ","
              << iter_count << "," << min_time << "," << max_time << "," << avg_time << ","
-             << stddev << "," << wait_avg_time << std::endl;
+             << stddev << "," << wait_avg_time << "," << algbw << "," << busbw << std::endl;
     }
     csvf.close();
 }
@@ -472,13 +474,41 @@ void print_timings(const ccl::communicator& comm,
     max_time /= iter_count;

     size_t bytes = elem_count * ccl::get_datatype_size(dtype) * buf_count;
+
+    double algbw = bytes*1000/total_avg_time/1024/1024;
+
+    if (ncolls == 1) {
+        if (options.coll_names.front() == "allgather" ||
+            options.coll_names.front() == "allgatherv" ||
+            options.coll_names.front() == "reducescatter" ||
+            options.coll_names.front() == "alltoall" ||
+            options.coll_names.front() == "alltoallv") {
+            algbw = algbw * nranks;
+        }
+    }
+
+    double busbw = algbw;
+    if (ncolls == 1) {
+        if (options.coll_names.front() == "allreduce") {
+            busbw = algbw * 2 * (nranks -1) / nranks;
+        } else if (options.coll_names.front() == "allgather" ||
+                   options.coll_names.front() == "allgatherv" ||
+                   options.coll_names.front() == "reducescatter" ||
+                   options.coll_names.front() == "alltoall" ||
+                   options.coll_names.front() == "alltoallv") {
+            busbw = algbw * (nranks -1) / nranks;
+        }
+    }
+
     std::stringstream ss;
     ss << std::right << std::fixed << std::setw(COL_WIDTH) << bytes << std::setw(COL_WIDTH)
        << elem_count * buf_count << std::setw(COL_WIDTH) << iter_count << std::setw(COL_WIDTH)
        << std::setprecision(COL_PRECISION) << min_time << std::setw(COL_WIDTH)
        << std::setprecision(COL_PRECISION) << max_time << std::setw(COL_WIDTH)
        << std::setprecision(COL_PRECISION) << total_avg_time << std::setw(COL_WIDTH - 3)
-       << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH + 3);
+       << std::setprecision(COL_PRECISION) << stddev << std::setw(COL_WIDTH)
+       << std::setprecision(COL_PRECISION) << algbw << std::setw(COL_WIDTH)
+       << std::setprecision(COL_PRECISION) << busbw << std::setw(COL_WIDTH + 3);

     if (show_extened_info(options.show_additional_info)) {
         ss << std::right << std::fixed << std::setprecision(COL_PRECISION) << wait_avg_time;
@@ -497,7 +527,9 @@ void print_timings(const ccl::communicator& comm,
                      max_time,
                      total_avg_time,
                      stddev,
-                     wait_avg_time);
+                     wait_avg_time,
+                     algbw,
+                     busbw);
     }
 }

diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp
index d90fb9b..78957f2 100644
--- a/examples/benchmark/src/benchmark.cpp
+++ b/examples/benchmark/src/benchmark.cpp
@@ -105,7 +105,8 @@ void run(ccl::communicator& service_comm,
            << "#elem_count" << std::setw(COL_WIDTH) << "#repetitions"
            << std::setw(COL_WIDTH) << "t_min[usec]" << std::setw(COL_WIDTH) << "t_max[usec]"
            << std::setw(COL_WIDTH) << "t_avg[usec]" << std::setw(COL_WIDTH - 3)
-           << "stddev[%]";
+           << "stddev[%]" << std::setw(COL_WIDTH) << "algbw[GB/s]" << std::setw(COL_WIDTH)
+           << "busbw[GB/s]";

     if (show_extened_info(options.show_additional_info)) {
         ss << std::right << std::setw(COL_WIDTH + 3) << "wait_t_avg[usec]";
@@ -435,7 +436,9 @@ int main(int argc, char* argv[]) {
              << "t_max[usec],"
              << "t_avg[usec],"
              << "stddev[%],"
-             << "wait_t_avg[usec]" << std::endl;
+             << "wait_t_avg[usec],"
+             << "algbw[GB/s],"
+             << "busbw[GB/s]" << std::endl;
         csvf.close();
     }

--
2.25.1
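
The algbw/busbw columns this patch adds follow the NCCL-style bandwidth convention, which the new code in print_timings implements directly: with the average time in microseconds,

\[
\mathrm{algbw} = \frac{\mathrm{bytes} \times 1000}{t_{\mathrm{avg}} \cdot 1024 \cdot 1024}\ \text{GB/s},
\qquad
\mathrm{busbw} = \mathrm{algbw} \cdot
\begin{cases}
\dfrac{2(n-1)}{n} & \text{allreduce} \\[6pt]
\dfrac{n-1}{n} & \text{allgather(v), reducescatter, alltoall(v)}
\end{cases}
\]

where n is nranks; for the allgather/alltoall family, algbw is first multiplied by n because every rank contributes its own buffer. As a worked check (my arithmetic, not output from the patch): an allreduce across n = 8 ranks reports busbw = 2(8-1)/8 · algbw = 1.75 · algbw.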
Lines changed: 118 additions & 0 deletions
From 592d83da1e8d00b4436b51ccf17e7d0bc9564e24 Mon Sep 17 00:00:00 2001
From: jilongW <109333127+jilongW@users.noreply.github.com>
Date: Fri, 6 Jun 2025 14:27:43 +0800
Subject: [PATCH] add compare (#1)

* add comparision

* update compare

* remove extra file
---
 .../oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp | 66 ++++++++++++++++---
 1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
index 427505b7..989c00ad 100644
--- a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
+++ b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
@@ -41,17 +41,55 @@ bool test(queue &Q, int M, int N, int K)
     constexpr int rd_size = 1048576;
     std::vector<T> host_vector(rd_size);
     auto host_data = host_vector.data();
-
+    std::vector<T> correct_host_vector(rd_size);
+    auto correct_host_data = correct_host_vector.data();
     /* Measure time for a given number of GEMM calls */
-    auto time_gemms = [=, &Q](int runs) -> double {
+    bool verify = false;
+    auto time_gemms = [=, &Q, &host_data](int runs, bool verify=false) -> std::tuple<double, int> {
         using namespace oneapi::mkl;
         using namespace std::chrono;
         auto start = steady_clock::now();
-        for (int i = 0; i < runs; i++)
+        int ok = 0;
+        if (verify == false){
+            for (int i = 0; i < runs; i++)
+                blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc);
+            Q.wait_and_throw();
+            auto end = steady_clock::now();
+            return std::make_tuple(duration<double>(end - start).count(), ok);
+        }
+        else{
+            size_t elems = std::min(ldc * N, rd_size);
+
             blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc);
-        Q.wait_and_throw();
-        auto end = steady_clock::now();
-        return duration<double>(end - start).count();
+            Q.wait_and_throw();
+            Q.copy(C, correct_host_data, elems).wait();
+            auto end = steady_clock::now();
+            auto used_time = duration<double>(end - start).count();
+
+            // correct_host_data[0] += 1.0;
+            for (int i = 1; i < runs; i++){
+                start = steady_clock::now();
+                blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc);
+                Q.wait_and_throw();
+                end = steady_clock::now();
+                used_time += duration<double>(end - start).count();
+                Q.copy(C, host_data, elems).wait();
+                int linear_id = 0;
+                for (size_t j = 0; j < N; j++) {
+                    for (size_t k = 0; k < M; k++) {
+                        linear_id = j*ldc + k;
+                        if (linear_id >= elems) break;
+                        if (host_data[linear_id] != correct_host_data[linear_id]) {
+                            ok = i;
+                            return std::make_tuple(duration<double>(end - start).count(), ok);
+                        }
+                    }
+                    if (linear_id >= elems) break;
+                }
+
+            }
+            return std::make_tuple(used_time, ok);
+        }
     };

     /* Fill A/B with all ones to verify correctness */
@@ -91,13 +129,15 @@ bool test(queue &Q, int M, int N, int K)

     /* Time one GEMM call, and estimate how many calls will be required to keep the
      * GPU busy for 1s. */
-    auto tare = time_gemms(1);
+    auto [tare, _] = time_gemms(1, true);
     int ncalls = std::max(4, std::min(1000, int(1. / tare)));

     /* Time that many GEMMs, subtracting the first call time to remove host overhead.
      * This gives a better idea of device performance. */
     std::cout << " -> Timing...\n";
-    auto time = time_gemms(ncalls + 1) - tare;
+    auto [time, result] = time_gemms(ncalls + 1, true);
+    time -= tare;
+
     auto avg = time / ncalls;

     /* Calculate and display performance */
@@ -114,8 +154,14 @@ bool test(queue &Q, int M, int N, int K)
         flops *= 1e-3;
         unit = 'P';
     }
-
-    std::cout << "\nAverage performance: " << flops << unit << 'F' << "\n";
+    if (result != 0){
+        std::cout << "gemm FAILS" << " for type: " << type_string<T>() << " on " << result <<" times run!"<< "\n";
+    }
+    else{
+        std::cout << "gemm Passes" << " for type: " << type_string<T>() << "!\n";
+        std::cout << "\nAverage performance: " << flops << unit << 'F' << "\n";
+    }
+

     /* Free data */
     free(C, Q);
--
2.34.1
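
This patch turns the sample's timing loop into a verification loop: the first GEMM's device output is copied back as the reference (correct_host_data), every subsequent run is compared element-wise against it, and a mismatch reports the failing iteration instead of the perf line. A hedged sketch of exercising that path; the compile command is an assumption about the oneAPI-samples layout (the sample also ships its own build files), not something this commit specifies:

# Assumes an initialized oneAPI environment with icpx and oneMKL available.
source /opt/intel/oneapi/setvars.sh
cd Libraries/oneMKL/matrix_mul_mkl
icpx -fsycl -qmkl matrix_mul_mkl.cpp -o matrix_mul_mkl    # -qmkl links against oneMKL
./matrix_mul_mkl    # with the patch, prints "gemm Passes"/"gemm FAILS" per type alongside the perf numbers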
