diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 4a5b62a..3bab505 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -12,8 +12,48 @@ concurrency:
   group: build-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 jobs:
-  build-dependencies:
-    name: Build the FlexFlow dependencies
+  # Runs download_nccl.sh, then packages one tarball per downloaded CUDA version folder
+  download-nccl:
+    name: Download and Package NCCL
+    strategy:
+      matrix:
+        os: [ubuntu-18.04, ubuntu-20.04]
+      fail-fast: false
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout Git Repository
+        uses: actions/checkout@v3
+
+      - name: Free additional space on runner
+        run: .github/workflows/helpers/free_space_on_runner.sh
+
+      - name: Download NCCL
+        run: .github/workflows/helpers/download_nccl.sh
+
+      - name: Prepare library files
+        working-directory: nccl_downloads
+        run: |
+          for folder in *; do
+            if [ -d "$folder" ]; then
+              cd "$folder"
+              export NCCL_TARBALL="nccl_${{ matrix.os }}_${folder}.tar.gz"
+              echo "Creating archive $NCCL_TARBALL"
+              tar -zcvf "$NCCL_TARBALL" nccl
+              echo "Checking the size of the NCCL tarball..."
+              du -h "$NCCL_TARBALL"
+              mv "$NCCL_TARBALL" ../
+              cd ..
+            fi
+          done
+
+      - name: Archive compiled NCCL libraries
+        uses: actions/upload-artifact@v3
+        with:
+          name: nccl_${{ matrix.os }}
+          path: nccl_downloads/*.tar.gz
+
+  build-legion:
+    name: Build Legion
     strategy:
       matrix:
         os: [ubuntu-18.04, ubuntu-20.04]
@@ -23,17 +63,15 @@ jobs:
           "10.2.89",
           "11.0.3",
           "11.1.1",
-          "11.2.2",
-          "11.3.1",
-          "11.4.3",
-          "11.5.2",
-          "11.6.2",
+          "11.2.0",
+          "11.3.0",
+          "11.4.0",
+          "11.5.0",
+          "11.6.0",
           "11.7.0",
         ]
         gpu_backend: [cuda, hip_rocm]
-        # uncomment the line below (and related ones) to build nccl, legion in parallel. Because
-        # git only supports up to 20 jobs in parallel, building in parallel is currently not needed.
-        #dependency: ["nccl", "legion"]
+        python_version: ["3.7", "3.8", "3.9", "3.10"]
         exclude:
           - os: ubuntu-20.04
             cuda_version: "10.1.243"
@@ -48,15 +86,15 @@ jobs:
             gpu_backend: "hip_rocm"
           - cuda_version: "11.0.3"
             gpu_backend: "hip_rocm"
-          - cuda_version: "11.2.2"
+          - cuda_version: "11.2.0"
             gpu_backend: "hip_rocm"
-          - cuda_version: "11.3.1"
+          - cuda_version: "11.3.0"
             gpu_backend: "hip_rocm"
-          - cuda_version: "11.4.3"
+          - cuda_version: "11.4.0"
             gpu_backend: "hip_rocm"
-          - cuda_version: "11.5.2"
+          - cuda_version: "11.5.0"
             gpu_backend: "hip_rocm"
-          - cuda_version: "11.6.2"
+          - cuda_version: "11.6.0"
             gpu_backend: "hip_rocm"
           - cuda_version: "11.7.0"
             gpu_backend: "hip_rocm"
@@ -82,11 +120,11 @@ jobs:
         env:
           CUDA_VERSION: ${{ matrix.cuda_version }}
           FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
+          PY_VERSION: ${{ matrix.python_version }}
         run: .github/workflows/helpers/install_dependencies.sh
 
-      - name: Build NCCL/Legion
+      - name: Build Legion
         env:
-          #DEPENDENCY: ${{ matrix.dependency }}
           CUDA_VERSION: ${{ matrix.cuda_version }}
           FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
         run: |
@@ -95,9 +133,7 @@ jobs:
           export CUDA_DIR=/usr/local/cuda
           export FF_BUILD_LEGION=ON
 
-          if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
-            export FF_USE_NCCL=ON
-          fi
+          export FF_USE_NCCL=OFF
 
           cores_available=$(nproc --all)
           n_build_cores=$(( cores_available -1 ))
@@ -112,55 +148,44 @@ jobs:
       - name: Prepare library files
         env:
           FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
+          CUDA_VERSION: ${{ matrix.cuda_version }}
         run: |
-          # Remove unnecessary files
-          echo "Removing unnecessary files..."
-          rm -rf build/deps/nccl/obj build/deps/nccl/src build/deps/nccl/tmp
-          rm -f build/export/legion/lib/libflexflow.so
+          # Despite the name this keeps major.minor: the first 4 characters, e.g. "11.7.0" -> "11.7"
+          export CUDA_VERSION_MAJOR="${CUDA_VERSION:0:4}"
+          echo "CUDA_VERSION_MAJOR=${CUDA_VERSION:0:4}" >> "$GITHUB_ENV"
 
           if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
-            export NCCL_TARBALL="nccl_${{ matrix.os }}_${{ matrix.cuda_version }}.tar.gz"
-            export LEGION_TARBALL="legion_${{ matrix.os }}_${{ matrix.cuda_version }}.tar.gz"
-
-            # Only build NCCL tarball for CUDA backends
-            echo "Creating archive $NCCL_TARBALL"
-            tar -zcvf $NCCL_TARBALL build/deps/nccl/
-            echo "Checking the size of the NCCL tarball..."
-            du -h $NCCL_TARBALL
+            export LEGION_TARBALL="legion_${{ matrix.os }}_cuda-${CUDA_VERSION_MAJOR}_python${{ matrix.python_version }}.tar.gz"
           else
-            export LEGION_TARBALL="legion_${{ matrix.os }}_${{ matrix.gpu_backend }}.tar.gz"
+            export LEGION_TARBALL="legion_${{ matrix.os }}_${{ matrix.gpu_backend }}_python${{ matrix.python_version }}.tar.gz"
           fi
 
           echo "Creating archive $LEGION_TARBALL"
-          tar -zcvf $LEGION_TARBALL build/export/legion/
+          cd build/export
+          tar -zcvf "$LEGION_TARBALL" legion
           echo "Checking the size of the Legion tarball..."
           du -h $LEGION_TARBALL
+          mv "$LEGION_TARBALL" ../../
+          cd ../../
 
       - name: Archive compiled Legion library (CUDA)
         uses: actions/upload-artifact@v3
         if: ${{ matrix.gpu_backend == 'cuda' }}
         with:
-          name: legion_${{ matrix.os }}_${{ matrix.cuda_version }}
-          path: legion_${{ matrix.os }}_${{ matrix.cuda_version }}.tar.gz
+          name: legion_${{ matrix.os }}_cuda-${{ env.CUDA_VERSION_MAJOR }}_python${{ matrix.python_version }}
+          path: legion_${{ matrix.os }}_cuda-${{ env.CUDA_VERSION_MAJOR }}_python${{ matrix.python_version }}.tar.gz
 
       - name: Archive compiled Legion library (HIP)
         uses: actions/upload-artifact@v3
         if: ${{ matrix.gpu_backend != 'cuda' }}
         with:
-          name: legion_${{ matrix.os }}_${{ matrix.gpu_backend }}
-          path: legion_${{ matrix.os }}_${{ matrix.gpu_backend }}.tar.gz
-
-      - name: Archive compiled NCCL library (CUDA)
-        uses: actions/upload-artifact@v3
-        if: ${{ matrix.gpu_backend == 'cuda' }}
-        with:
-          name: nccl_${{ matrix.os }}_${{ matrix.cuda_version }}
-          path: nccl_${{ matrix.os }}_${{ matrix.cuda_version }}.tar.gz
+          name: legion_${{ matrix.os }}_${{ matrix.gpu_backend }}_python${{ matrix.python_version }}
+          path: legion_${{ matrix.os }}_${{ matrix.gpu_backend }}_python${{ matrix.python_version }}.tar.gz
 
   notify-slack:
     name: Notify Slack in case of failure
     runs-on: ubuntu-20.04
-    needs: build-dependencies
+    needs: [download-nccl, build-legion]
     if: ${{ failure() && github.event_name == 'schedule' }}
     steps:
       - name: Send Slack message
@@ -172,7 +197,7 @@ jobs:
   create-release:
     name: Create new release
     runs-on: ubuntu-20.04
-    needs: build-dependencies
+    needs: [download-nccl, build-legion]
     steps:
       - name: Checkout Git Repository
         uses: actions/checkout@v3
diff --git a/.github/workflows/helpers/download_nccl.sh b/.github/workflows/helpers/download_nccl.sh
new file mode 100755
index 0000000..f83d63b
--- /dev/null
+++ b/.github/workflows/helpers/download_nccl.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Download the prebuilt NCCL .deb packages pinned below for the runner's
+# Ubuntu release and unpack each into nccl_downloads/cuda-<version>/nccl/.
+set -euo pipefail
+set -x
+
+mkdir -p nccl_downloads
+cd nccl_downloads
+
+# lsb_release -rs prints e.g. "20.04"; the repository URL uses "2004"
+ubuntu_version=$(lsb_release -rs)
+ubuntu_version=${ubuntu_version//./}
+wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"
+sudo dpkg -i cuda-keyring_1.0-1_all.deb
+sudo apt-get update -y
+rm -f cuda-keyring_1.0-1_all.deb
+
+if [[ "$ubuntu_version" == "2004" ]]; then
+  sudo apt download libnccl2=2.15.5-1+cuda11.0 libnccl-dev=2.15.5-1+cuda11.0
+  sudo apt download libnccl2=2.8.4-1+cuda11.1 libnccl-dev=2.8.4-1+cuda11.1
+  sudo apt download libnccl2=2.8.4-1+cuda11.2 libnccl-dev=2.8.4-1+cuda11.2
+  sudo apt download libnccl2=2.9.9-1+cuda11.3 libnccl-dev=2.9.9-1+cuda11.3
+  sudo apt download libnccl2=2.11.4-1+cuda11.4 libnccl-dev=2.11.4-1+cuda11.4
+  sudo apt download libnccl2=2.11.4-1+cuda11.5 libnccl-dev=2.11.4-1+cuda11.5
+  sudo apt download libnccl2=2.12.12-1+cuda11.6 libnccl-dev=2.12.12-1+cuda11.6
+  sudo apt download libnccl2=2.14.3-1+cuda11.7 libnccl-dev=2.14.3-1+cuda11.7
+elif [[ "$ubuntu_version" == "1804" ]]; then
+  # Additional key required to download the CUDA 10.1 version
+  sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+  wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb
+  sudo dpkg -i nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb
+  sudo apt-get update -y
+  rm -f nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb
+  sudo apt download libnccl2=2.8.3-1+cuda10.1 libnccl-dev=2.8.3-1+cuda10.1
+  sudo apt download libnccl2=2.15.5-1+cuda10.2 libnccl-dev=2.15.5-1+cuda10.2
+  sudo apt download libnccl2=2.15.5-1+cuda11.0 libnccl-dev=2.15.5-1+cuda11.0
+  sudo apt download libnccl2=2.8.4-1+cuda11.1 libnccl-dev=2.8.4-1+cuda11.1
+  sudo apt download libnccl2=2.8.4-1+cuda11.2 libnccl-dev=2.8.4-1+cuda11.2
+  sudo apt download libnccl2=2.9.9-1+cuda11.3 libnccl-dev=2.9.9-1+cuda11.3
+  sudo apt download libnccl2=2.11.4-1+cuda11.4 libnccl-dev=2.11.4-1+cuda11.4
+  sudo apt download libnccl2=2.11.4-1+cuda11.5 libnccl-dev=2.11.4-1+cuda11.5
+  sudo apt download libnccl2=2.12.12-1+cuda11.6 libnccl-dev=2.12.12-1+cuda11.6
+  sudo apt download libnccl2=2.14.3-1+cuda11.7 libnccl-dev=2.14.3-1+cuda11.7
+fi
+
+for debfile in *.deb; do
+  # Strip through the first '+' and from the last '_', then drop the
+  # leading "cuda": e.g. "...+cuda11.0_amd64.deb" -> "11.0"
+  temp_str=${debfile#*+}
+  temp_str=${temp_str%_*}
+  cuda_version=${temp_str:4}
+  mkdir -p "cuda-$cuda_version/nccl"
+  dpkg-deb -xv "$debfile" "./cuda-$cuda_version/nccl"
+  cd "cuda-$cuda_version/nccl"
+  [ -d ./usr/include ] && mv ./usr/include ./
+  mkdir -p lib
+  files_to_move=(./usr/lib/x86_64-linux-gnu/*.a)
+  [ -f "${files_to_move[0]}" ] && mv ./usr/lib/x86_64-linux-gnu/*.a ./lib/
+  files_to_move=(./usr/lib/x86_64-linux-gnu/*.so)
+  [ -f "${files_to_move[0]}" ] && mv ./usr/lib/x86_64-linux-gnu/*.so ./lib/
+  files_to_move=(./usr/lib/x86_64-linux-gnu/*.so.*)
+  [ -f "${files_to_move[0]}" ] && mv ./usr/lib/x86_64-linux-gnu/*.so.* ./lib/
+  symlinks_to_move="$(find ./usr/lib/x86_64-linux-gnu/ -type l )"
+  for s in $symlinks_to_move; do
+    fname="$(basename "$s")"
+    ln -s "$(readlink "$s" )" "./lib/$fname"
+  done
+  rm -rf usr
+  cd ../../
+done
+
+rm -rf ./*.deb
diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh
index 866eaf8..4f1e6fc 100755
--- a/.github/workflows/helpers/install_cudnn.sh
+++ b/.github/workflows/helpers/install_cudnn.sh
@@ -46,9 +46,9 @@ wget -c -q $CUDNN_LINK
 if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" ]]; then
     tar -xf $CUDNN_TARBALL_NAME -C ./
     CUDNN_EXTRACTED_TARBALL_NAME="${CUDNN_TARBALL_NAME::-7}"
-    sudo cp -r $CUDNN_EXTRACTED_TARBALL_NAME/include/* /usr/local/include
-    sudo cp -r $CUDNN_EXTRACTED_TARBALL_NAME/lib/* /usr/local/lib
-    rm -rf $CUDNN_EXTRACTED_TARBALL_NAME
+    sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* "/usr/local/include"
+    sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* "/usr/local/lib"
+    rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME"
 else
     sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local
 fi
diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh
index 561f583..3894fbf 100755
--- a/.github/workflows/helpers/install_dependencies.sh
+++ b/.github/workflows/helpers/install_dependencies.sh
@@ -14,12 +14,35 @@ sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binu
 CUDA_VERSION=${CUDA_VERSION:-11.1.1}
 ./install_cudnn.sh "${CUDA_VERSION}"
 
 # Install Miniconda
 echo "Installing Miniconda..."
-wget -c -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-    chmod +x ./Miniconda3-latest-Linux-x86_64.sh && \
-    bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
-    rm ./Miniconda3-latest-Linux-x86_64.sh && \
+# PY_VERSION picks the Miniconda installer below; when unset it defaults to "latest"
+PY_VERSION=${PY_VERSION:-latest}
+MINICONDA_BASE_URL="https://repo.continuum.io/miniconda/"
+if [[ "$PY_VERSION" == "latest" ]]; then
+    echo "Installing latest Python version"
+    MINICONDA_INSTALLER="Miniconda3-latest-Linux-x86_64.sh"
+elif [[ "$PY_VERSION" == "3.10" ]]; then
+    echo "Installing Python version ${PY_VERSION}"
+    MINICONDA_INSTALLER="Miniconda3-py310_22.11.1-1-Linux-x86_64.sh"
+elif [[ "$PY_VERSION" == "3.9" ]]; then
+    echo "Installing Python version ${PY_VERSION}"
+    MINICONDA_INSTALLER="Miniconda3-py39_22.11.1-1-Linux-x86_64.sh"
+elif [[ "$PY_VERSION" == "3.8" ]]; then
+    echo "Installing Python version ${PY_VERSION}"
+    MINICONDA_INSTALLER="Miniconda3-py38_22.11.1-1-Linux-x86_64.sh"
+elif [[ "$PY_VERSION" == "3.7" ]]; then
+    echo "Installing Python version ${PY_VERSION}"
+    MINICONDA_INSTALLER="Miniconda3-py37_22.11.1-1-Linux-x86_64.sh"
+else
+    echo "Requested Python version (${PY_VERSION}) not supported" >&2
+    exit 1
+fi
+MINICONDA_URL="${MINICONDA_BASE_URL}${MINICONDA_INSTALLER}"
+wget -c -q "$MINICONDA_URL" && \
+    chmod +x "$MINICONDA_INSTALLER" && \
+    bash "$MINICONDA_INSTALLER" -b -p /opt/conda && \
+    rm "$MINICONDA_INSTALLER" && \
     /opt/conda/bin/conda upgrade --all && \
     /opt/conda/bin/conda install conda-build conda-verify && \
     /opt/conda/bin/conda clean -ya
diff --git a/.github/workflows/shell-check.yml b/.github/workflows/shell-check.yml
new file mode 100644
index 0000000..a51803e
--- /dev/null
+++ b/.github/workflows/shell-check.yml
@@ -0,0 +1,11 @@
+name: Shell Check
+on: [push, pull_request, workflow_dispatch]
+jobs:
+  shellcheck:
+    name: Shellcheck
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Run ShellCheck
+        # Pinned to a release tag so changes on the action's master branch cannot alter CI unannounced
+        uses: ludeeus/action-shellcheck@2.0.0
diff --git a/deps/legion b/deps/legion
index 15b23cf..7f8df4e 160000
--- a/deps/legion
+++ b/deps/legion
@@ -1 +1 @@
-Subproject commit 15b23cf0d95f186297f05c76611ddd3e2cbbe9f9
+Subproject commit 7f8df4ee66896acf1c1f5ac8f43808596046f54b