diff --git a/.github/actions/setup-python-tools/action.yml b/.github/actions/setup-python-tools/action.yml index 9a7572a95..36e1b914c 100644 --- a/.github/actions/setup-python-tools/action.yml +++ b/.github/actions/setup-python-tools/action.yml @@ -37,28 +37,16 @@ runs: using: "composite" # needs: comment-link-to-workflow # Ensure that a comment is posted with workflow id steps: - # Step 1: Set up Python environment (Python 3.9.13). - - name: Set up Python - uses: actions/setup-python@v4 + # Step 1: Set up Python environment + - name: "Set up Python" + uses: actions/setup-python@v6 with: - # Available versions: https://raw.githubusercontent.com/actions/python-versions/main/versions-manifest.json - # Ensure to use a version that has support for arm64-darwin so we can build for Apple Silicon (macOS 14). - python-version: '3.9.13' + python-version-file: ".python-version" - # Step 2: Install pip-tools, which is used to generate hashed requirements. - # Note_1: pip 25.1 has a bug that causes pip-tools to fail with the following error: - # File ".../python3.9/site-packages/piptools/repositories/pypi.py", line 452, in allow_all_wheels - # self.finder.find_all_candidates.cache_clear() - # AttributeError: 'function' object has no attribute 'cache_clear' - # Note_2: Even though some wheels are guarded behind conditionals i.e. only use this if platform = linux; - # pip-tools 7.5.0 fails with the following error: - # pip._internal.exceptions.UnsupportedWheel: pyg_lib-0.4....linux_x86_64.whl is not a supported wheel on this platform. - # Thus, we fix the pip version to 25.0.1 and pip-tools version to 7.4.1. - - name: Install pip-tools - shell: bash - run: | - python -m pip install "pip==25.0.1" - python -m pip install "pip-tools==7.4.1" + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.9.5" # Matches the version in install_py_deps.sh # Step 3: Set up Gcloud AUTH using Workload Identity Federation # See following for context: https://cloud.google.com/blog/products/identity-security/enabling-keyless-authentication-from-github-actions diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index 63ea9be26..91135a88e 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -3,7 +3,7 @@ substitutions: options: logging: CLOUD_LOGGING_ONLY steps: - - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2 + - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1 entrypoint: /bin/bash args: - -c @@ -18,15 +18,18 @@ steps: echo "Setting up environment..." # gcloud runner will run as a non-root user, but all paths/profiles, etc are set up for root + mkdir -p /builder/home/.local/bin + cp -r /root/.local/bin/ /builder/home/.local/ echo "source /root/.bashrc" >> ~/.bashrc echo "source /root/.profile" >> ~/.profile source ~/.profile + docker version docker buildx create --driver=docker-container --use docker run --rm --privileged multiarch/qemu-user-static --reset -p yes gcloud auth configure-docker us-central1-docker.pkg.dev # Install GiGL - pip install -e ./python/ + uv pip install -e . # The builder operates in its own user dir, usually /workspace, # so we need to copy the gigl tools dir to the current cloud_builder's user dir. # See: containers/Dockerfile.builder. 
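For context on the switch above, a minimal local sketch of what the pinned-uv setup amounts to, assuming uv 0.9.5 and the `.python-version` file at the repo root that the action now reads:

```bash
# Sketch of the equivalent local setup; not the CI definition itself.
curl -LsSf https://astral.sh/uv/0.9.5/install.sh | sh   # pinned uv release, matching install_py_deps.sh
export PATH="$HOME/.local/bin:$PATH"                    # uv installs into ~/.local/bin (copied for the builder user above)
uv python install          # installs the interpreter pinned in .python-version
uv venv                    # create .venv (install_py_deps.sh handles this in CI)
uv pip install -e .        # editable install of GiGL, as in the cloud-builder step
```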
diff --git a/.github/scripts/update_docker_image_refs.sh b/.github/scripts/update_docker_image_refs.sh new file mode 100644 index 000000000..815b1eee2 --- /dev/null +++ b/.github/scripts/update_docker_image_refs.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Script to update dep_vars.env and cloud builder config with new Docker image references + +set -e + +echo "Writing new image names to dep_vars.env:" +echo " DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}" +echo " DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}" +echo " DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}" +echo " DOCKER_LATEST_BUILDER_IMAGE_NAME_WITH_TAG=${GIGL_BUILDER_IMAGE}" + +sed -i "s|^DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}|" dep_vars.env +sed -i "s|^DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}|" dep_vars.env +sed -i "s|^DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}|" dep_vars.env +sed -i "s|name: us-central1-docker\.pkg\.dev.*|name: ${GIGL_BUILDER_IMAGE}|" .github/cloud_builder/run_command_on_active_checkout.yaml diff --git a/.github/workflows/build-base-docker-images.yml b/.github/workflows/build-base-docker-images.yml index 9d848ecad..faec56e8e 100644 --- a/.github/workflows/build-base-docker-images.yml +++ b/.github/workflows/build-base-docker-images.yml @@ -6,6 +6,7 @@ on: pr_number: description: 'PR to run the workflow on' required: true + env: DOCKER_BUILDKIT: 1 GIGL_BASE_CUDA_IMAGE: us-central1-docker.pkg.dev/${{ vars.GCP_PROJECT_ID }}/public-gigl/gigl-cuda-base:${{ github.sha }}.${{ github.run_number }}.${{ github.run_attempt }} @@ -16,6 +17,7 @@ env: jobs: comment-workflow-started: + runs-on: ubuntu-latest steps: - name: Comment on PR @@ -29,7 +31,7 @@ jobs: Once done, the workflow will update the `dep_vars.env` file with the new image names. 
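The new script above is driven entirely by environment variables, so it can be exercised locally before the workflow runs it; a sketch with placeholder image values (the real ones come from the workflow's `env:` block):

```bash
# Run from the repo root; values below are placeholders, not real tags.
export GIGL_BASE_CUDA_IMAGE="us-central1-docker.pkg.dev/<project>/public-gigl/gigl-cuda-base:<sha>.<run>.<attempt>"
export GIGL_BASE_CPU_IMAGE="us-central1-docker.pkg.dev/<project>/public-gigl/gigl-cpu-base:<sha>.<run>.<attempt>"
export GIGL_BASE_DATAFLOW_IMAGE="us-central1-docker.pkg.dev/<project>/public-gigl/gigl-dataflow-base:<sha>.<run>.<attempt>"
export GIGL_BUILDER_IMAGE="us-central1-docker.pkg.dev/<project>/gigl-base-images/gigl-builder:<sha>.<run>.<attempt>"

# Rewrites dep_vars.env in place and repoints the cloud-builder config at the new builder image.
bash .github/scripts/update_docker_image_refs.sh
```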
build-cuda-base-image: - runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 4 cores, 16GB RAM, 150 GB SSD + runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 8-cores, 32GB RAM, 300 GB SSD permissions: # Needed for gcloud auth: https://github.com/google-github-actions/auth contents: 'read' @@ -41,7 +43,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} - name: Setup Machine for building Docker images - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" try_cleaning_disk_space: "true" @@ -56,8 +58,8 @@ jobs: docker push ${GIGL_BASE_CUDA_IMAGE} echo "Pushed CUDA base image to ${GIGL_BASE_CUDA_IMAGE}" - build-cpu-base-images: - runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 4 cores, 16GB RAM, 150 GB SSD + build-cpu-base-image: + runs-on: ubuntu-latest permissions: # Needed for gcloud auth: https://github.com/google-github-actions/auth contents: 'read' @@ -69,14 +71,13 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} - name: Setup Machine for building Docker images - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" try_cleaning_disk_space: "true" gcp_project_id: ${{ vars.GCP_PROJECT_ID }} workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} - - name: Build and Push CPU Base Image and Docker CPU Image run: | gcloud auth configure-docker us-central1-docker.pkg.dev @@ -85,8 +86,30 @@ jobs: docker push ${GIGL_BASE_CPU_IMAGE} echo "Pushed CPU base image to ${GIGL_BASE_CPU_IMAGE}" - echo "Will use CPU image ${GIGL_BASE_CPU_IMAGE} as base image for Dataflow image." - docker build -f ./containers/Dockerfile.dataflow.base --build-arg BASE_IMAGE=${GIGL_BASE_CPU_IMAGE} -t ${GIGL_BASE_DATAFLOW_IMAGE} . + build-dataflow-base-image: + runs-on: ubuntu-latest + permissions: + # Needed for gcloud auth: https://github.com/google-github-actions/auth + contents: 'read' + id-token: 'write' + steps: + - name: Checkout PR Branch + uses: snapchat/gigl/.github/actions/checkout-pr-branch@main + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + pr_number: ${{ inputs.pr_number }} + - name: Setup Machine for building Docker images + uses: ./.github/actions/setup-python-tools + with: + setup_gcloud: "true" + try_cleaning_disk_space: "true" + gcp_project_id: ${{ vars.GCP_PROJECT_ID }} + workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} + gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} + - name: Build and Push Dataflow Base Image + run: | + gcloud auth configure-docker us-central1-docker.pkg.dev + docker build -f ./containers/Dockerfile.dataflow.base -t ${GIGL_BASE_DATAFLOW_IMAGE} . 
docker push ${GIGL_BASE_DATAFLOW_IMAGE} echo "Pushed Dataflow base image to ${GIGL_BASE_DATAFLOW_IMAGE}" @@ -103,7 +126,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} - name: Setup Machine for building Docker images - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" try_cleaning_disk_space: "true" @@ -124,7 +147,8 @@ jobs: build-and-commit-base-images: needs: - build-cuda-base-image - - build-cpu-base-images + - build-cpu-base-image + - build-dataflow-base-image - build-builder-image runs-on: ubuntu-latest steps: @@ -134,23 +158,12 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} should_leave_progress_comments: "false" - command: | - echo "Writing new image names to dep_vars.env:" - echo " DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}" - echo " DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}" - echo " DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}" - echo " DOCKER_LATEST_BUILDER_IMAGE_NAME_WITH_TAG=${GIGL_BUILDER_IMAGE}" - sed -i "s|^DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}|" dep_vars.env - sed -i "s|^DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}|" dep_vars.env - sed -i "s|^DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}|" dep_vars.env - sed -i "s|name: us-central1-docker\.pkg\.dev.*|name: ${GIGL_BUILDER_IMAGE}|" .github/cloud_builder/run_command_on_active_checkout.yaml - + command: bash .github/scripts/update_docker_image_refs.sh - name: Commit and Push Dep Vars uses: snapchat/gigl/.github/actions/commit-and-push@main with: commit_message: "[AUTOMATED] Update dep.vars, and other relevant files with new image names" github_token: ${{ secrets.GITHUB_TOKEN }} - - uses: snapchat/gigl/.github/actions/comment-on-pr@main with: pr_number: ${{ inputs.pr_number }} diff --git a/.github/workflows/on-pr-comment.yml b/.github/workflows/on-pr-comment.yml index 4d6032747..cc57aed64 100644 --- a/.github/workflows/on-pr-comment.yml +++ b/.github/workflows/on-pr-comment.yml @@ -24,7 +24,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version-file: ".python-version" - name: Install PyYAML run: pip install PyYAML @@ -164,6 +164,7 @@ jobs: workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} command: | + # sourcing .profile is important to resolve paths for java, sbt, et al. + # It is setup in the setup-python-tools action. source ~/.profile - make check_format - make assert_yaml_configs_parse + make lint_test diff --git a/.github/workflows/on-pr-merge.yml b/.github/workflows/on-pr-merge.yml index 75bfccd9d..0e1f9ddd0 100644 --- a/.github/workflows/on-pr-merge.yml +++ b/.github/workflows/on-pr-merge.yml @@ -26,7 +26,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup development environment - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" gcp_project_id: ${{ vars.GCP_PROJECT_ID }} @@ -37,7 +37,7 @@ jobs: # using GFile library (a.k.a anything that does IO w/ Tensorflow). 
GFile does not understand # how to leverage Workload Identity Federation to read assets from GCS, et al. See: # https://github.com/tensorflow/tensorflow/issues/57104 - uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main + uses: ./.github/actions/run-cloud-run-command-on-active-checkout with: cmd: "make unit_test_py" service_account: ${{ secrets.gcp_service_account_email }} @@ -53,14 +53,18 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup development environment - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" gcp_project_id: ${{ vars.GCP_PROJECT_ID }} workload_identity_provider: ${{ secrets.workload_identity_provider }} gcp_service_account_email: ${{ secrets.gcp_service_account_email }} - name: Run Scala Unit Tests - uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main + # We use cloud run here instead of using github hosted runners because of limitation of tests + # using GFile library (a.k.a anything that does IO w/ Tensorflow). GFile does not understand + # how to leverage Workload Identity Federation to read assets from GCS, et al. See: + # https://github.com/tensorflow/tensorflow/issues/57104 + uses: ./.github/actions/run-cloud-run-command-on-active-checkout with: cmd: "make unit_test_scala" service_account: ${{ secrets.gcp_service_account_email }} @@ -72,14 +76,14 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup development environment - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" gcp_project_id: ${{ vars.GCP_PROJECT_ID }} workload_identity_provider: ${{ secrets.workload_identity_provider }} gcp_service_account_email: ${{ secrets.gcp_service_account_email }} - name: Run Integration Tests - uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main + uses: ./.github/actions/run-cloud-run-command-on-active-checkout with: cmd: "make integration_test" service_account: ${{ secrets.gcp_service_account_email }} @@ -91,14 +95,14 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup development environment - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" gcp_project_id: ${{ vars.GCP_PROJECT_ID }} workload_identity_provider: ${{ secrets.workload_identity_provider }} gcp_service_account_email: ${{ secrets.gcp_service_account_email }} - name: Run E2E Tests - uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main + uses: ./.github/actions/run-cloud-run-command-on-active-checkout with: cmd: "make run_all_e2e_tests" service_account: ${{ secrets.gcp_service_account_email }} @@ -111,14 +115,14 @@ jobs: # steps: # - uses: actions/checkout@v4 # - name: Setup development environment - # uses: snapchat/gigl/.github/actions/setup-python-tools@main + # uses: ./.github/actions/setup-python-tools # with: # setup_gcloud: "true" # gcp_project_id: ${{ vars.GCP_PROJECT_ID }} # workload_identity_provider: ${{ secrets.workload_identity_provider }} # gcp_service_account_email: ${{ secrets.gcp_service_account_email }} # - name: Run Example Notebook E2E Tests - # uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main + # uses: ./.github/actions/run-cloud-run-command-on-active-checkout # with: # cmd: "make notebooks_test" # service_account: ${{ secrets.gcp_service_account_email }} @@ -130,7 +134,7 @@ jobs: 
steps: - uses: actions/checkout@v4 - name: Setup development environment - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: install_dev_deps: "true" setup_gcloud: "true" @@ -138,7 +142,9 @@ jobs: workload_identity_provider: ${{ secrets.workload_identity_provider }} gcp_service_account_email: ${{ secrets.gcp_service_account_email }} - name: Run Lint Tests + shell: bash run: | + # sourcing .profile is important to resolve paths for java, sbt, et al. + # It is setup in the setup-python-tools action. source ~/.profile - make check_format - make assert_yaml_configs_parse + make lint_test diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml index 5e9345b61..bc25d47ba 100644 --- a/.github/workflows/release-documentation.yml +++ b/.github/workflows/release-documentation.yml @@ -44,7 +44,7 @@ jobs: # We also make gigl available w/ editable install `-e` so that autodoc can find it. - name: Install necessary doc dependencies run: | - pip install -e "./python[docs]" + uv sync --extra docs - name: Sphinx build run: | make build_docs diff --git a/Makefile b/Makefile index 31abeb2ab..8d11a4e50 100644 --- a/Makefile +++ b/Makefile @@ -1,22 +1,12 @@ include dep_vars.env SHELL := /bin/bash -CONDA_ENV_NAME=gnn -# Ensure that python, pip, and pip-tools versions are consistent with the ones in: -# .github/actions/setup-python-tools/action.yml - -# TODO(svij): Remove python version as it is now managed by uv. Subsequently update all references in -# Makfile to work w/ uv. -PYTHON_VERSION=3.9 -PIP_VERSION=25.0.1 -PIP_TOOLS_VERSION=7.4.1 DATE:=$(shell /bin/date "+%Y%m%d_%H%M") # GIT HASH, or empty string if not in a git repo. GIT_HASH?=$(shell git rev-parse HEAD 2>/dev/null || "") PWD=$(shell pwd) - # You can override GIGL_PROJECT by setting it in your environment i.e. # adding `export GIGL_PROJECT=your_project` to your shell config (~/.bashrc, ~/.zshrc, etc.) GIGL_PROJECT?=external-snap-ci-github-gigl @@ -41,117 +31,45 @@ GIGL_E2E_TEST_COMPILED_PIPELINE_PATH:=/tmp/gigl/pipeline_${DATE}_${GIT_HASH}.yam GIT_BRANCH:=$(shell git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "") -# If we're in a git repo, then find only the ".md" files in our repo to format, else we format everything ".". -# We do this because some of our dependencies (Spark) include md files, -# but since we don't push those dependenices (or their documentation) to git, -# then when we *check* the format of those files, we will fail. -# Thus, we only want to format the Markdown files that we explicitly include in our repo. -MD_FILES:=$(shell if [ ! ${GIT_BRANCH} ]; then echo "."; else git ls-tree --name-only -r ${GIT_BRANCH} . | grep "\.md$$"; fi;) +# Find all markdown files in the repo except for those in .venv or tools directories. +MD_FILES := $(shell find . -type f -name "*.md" ! -path "*/.venv/*" ! 
-path "*/tools/*") GIGL_ALERT_EMAILS?="" - get_ver_hash: # Fetches the git commit hash and stores it in `$GIT_COMMIT` git diff --quiet || { echo Branch is dirty, please commit changes and ensure branch is clean; exit 1; } $(eval GIT_COMMIT=$(shell git log -1 --pretty=format:"%H")) -initialize_environment: - conda create -y --override-channels --channel conda-forge --name ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pip=${PIP_VERSION} pip-tools=${PIP_TOOLS_VERSION} - @echo "If conda environment was successfully installed, ensure to activate it and run \`make install_dev_deps\` or \`make install_deps\` to complete setup" - -clean_environment: - if [ "${CONDA_DEFAULT_ENV}" == "${CONDA_ENV_NAME}" ]; then \ - pip uninstall -y -r <(pip freeze); \ - else \ - echo Change your local env to dev first.; \ - fi - -reset_environment: generate_cpu_hashed_requirements clean_environment install_deps - -rebuild_dev_environment: - conda deactivate - conda remove --name ${CONDA_ENV_NAME} --all -y - make initialize_environment - conda activate ${CONDA_ENV_NAME} - make install_dev_deps - check_if_valid_env: - #@command -v docker >/dev/null 2>&1 || { echo >&2 "docker is required but it's not installed. Aborting."; exit 1; } + @command -v docker >/dev/null 2>&1 || { echo >&2 "docker is required but it's not installed. Aborting."; exit 1; } @command -v gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but it's not installed. Aborting."; exit 1; } - @python --version | grep -q "Python ${PYTHON_VERSION}" || (echo "Python version is not 3.9" && exit 1) - # if developing, you need to install dev deps instead install_dev_deps: check_if_valid_env gcloud auth configure-docker us-central1-docker.pkg.dev bash ./requirements/install_py_deps.sh --dev bash ./requirements/install_scala_deps.sh - pip install -e ./python/ - pre-commit install --hook-type pre-commit --hook-type pre-push - + uv pip install -e . + uv run pre-commit install --hook-type pre-commit --hook-type pre-push # Production environments, if you are developing use `make install_dev_deps` instead install_deps: gcloud auth configure-docker us-central1-docker.pkg.dev bash ./requirements/install_py_deps.sh bash ./requirements/install_scala_deps.sh - pip install -e ./python/ - -# Can only be run on an arm64 mac, otherwise generated hashed req file will be wrong -generate_mac_arm64_cpu_hashed_requirements: - pip-compile -v --allow-unsafe --generate-hashes --no-emit-index-url --resolver=backtracking \ - --output-file=requirements/darwin_arm64_requirements_unified.txt \ - --extra torch25-cpu --extra transform --extra experimental \ - ./python/pyproject.toml - -# Can only be run on an arm64 mac, otherwise generated hashed req file will be wrong. -generate_dev_mac_arm64_cpu_hashed_requirements: - pip-compile -v --allow-unsafe --generate-hashes --no-emit-index-url --resolver=backtracking \ - --output-file=requirements/dev_darwin_arm64_requirements_unified.txt \ - --extra torch25-cpu --extra transform --extra dev --extra experimental \ - ./python/pyproject.toml - -# Can only be run on linux, otherwise generated hashed req file will be wrong. -generate_linux_cpu_hashed_requirements: - pip-compile -v --allow-unsafe --generate-hashes --no-emit-index-url --resolver=backtracking \ - --output-file=requirements/linux_cpu_requirements_unified.txt \ - --extra torch25-cpu --extra transform --extra experimental \ - ./python/pyproject.toml - -# Can only be run on linux, otherwise generated hashed req file will be wrong. 
-generate_dev_linux_cpu_hashed_requirements: - pip-compile -v --allow-unsafe --generate-hashes --no-emit-index-url --resolver=backtracking \ - --output-file=requirements/dev_linux_cpu_requirements_unified.txt \ - --extra torch25-cpu --extra transform --extra dev --extra experimental \ - ./python/pyproject.toml - -# Can only be run on linux, otherwise generated hashed req file will be wrong. -generate_linux_cuda_hashed_requirements: - pip-compile -v --allow-unsafe --generate-hashes --no-emit-index-url --resolver=backtracking \ - --output-file=requirements/linux_cuda_requirements_unified.txt \ - --extra torch25-cuda-121 --extra transform --extra experimental \ - ./python/pyproject.toml - -# Can only be run on linux, otherwise generated hashed req file will be wrong. -generate_dev_linux_cuda_hashed_requirements: - pip-compile -v --allow-unsafe --generate-hashes --no-emit-index-url --resolver=backtracking \ - --output-file=requirements/dev_linux_cuda_requirements_unified.txt \ - --extra torch25-cuda-121 --extra transform --extra dev --extra experimental \ - ./python/pyproject.toml + uv pip install -e . # These are a collection of tests that are run before anything is installed using tools available on host. # May include tests that check the sanity of the repo state i.e. ones that may even cause the failure of # installation scripts precondition_tests: - python testing/dep_vars_check.py - + uv run python testing/dep_vars_check.py run_api_test: cd testing/api_test && make run_api_test - assert_yaml_configs_parse: - python testing/assert_yaml_configs_parse.py -d . + uv run python testing/assert_yaml_configs_parse.py -d . # Set PY_TEST_FILES= to test a specifc file. # Ex. `make unit_test_py PY_TEST_FILES="eval_metrics_test.py"` @@ -159,7 +77,7 @@ assert_yaml_configs_parse: # See the help text for "--test_file_pattern" in python/tests/test_args.py for more details. unit_test_py: clean_build_files_py type_check ( cd python ; \ - python -m tests.unit.main \ + uv run python -m tests.unit.main \ --env=test \ --resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \ --test_file_pattern=$(PY_TEST_FILES) \ @@ -180,9 +98,9 @@ unit_test_scala: clean_build_files_scala unit_test: precondition_tests unit_test_py unit_test_scala check_format_py: - autoflake --check --config python/pyproject.toml ${PYTHON_DIRS} - isort --check-only --settings-path=python/pyproject.toml ${PYTHON_DIRS} - black --check --config=python/pyproject.toml ${PYTHON_DIRS} + uv run autoflake --check --config pyproject.toml ${PYTHON_DIRS} + uv run isort --check-only --settings-path=pyproject.toml ${PYTHON_DIRS} + uv run black --check --config=pyproject.toml ${PYTHON_DIRS} check_format_scala: ( cd scala; sbt "scalafmtCheckAll; scalafixAll --check"; ) @@ -190,12 +108,10 @@ check_format_scala: check_format_md: @echo "Checking markdown files..." - mdformat --check ${MD_FILES} + uv run mdformat --check ${MD_FILES} check_format: check_format_py check_format_scala check_format_md - - # Set PY_TEST_FILES= to test a specifc file. # Ex. `make integration_test PY_TEST_FILES="dataflow_test.py"` # By default, runs all tests under python/tests/integration. 
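The Makefile changes follow one pattern: every bare `python`/tool call becomes `uv run ...`, which resolves the command inside the project's `.venv` (creating or syncing it as needed) so no environment has to be activated first. A small sketch of the behaviour being relied on, with an illustrative source directory:

```bash
# Runs inside the uv-managed project environment, not whatever python is first on PATH.
uv run python -c "import sys; print(sys.executable)"   # expected to point at <repo>/.venv/bin/python
uv run mypy python/gigl --check-untyped-defs           # illustrative stand-in for the type_check target's ${PYTHON_DIRS}
```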
@@ -203,7 +119,7 @@ check_format: check_format_py check_format_scala check_format_md integration_test: ( \ cd python ;\ - python -m tests.integration.main \ + uv run python -m tests.integration.main \ --env=test \ --resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \ --test_file_pattern=$(PY_TEST_FILES) \ @@ -213,12 +129,12 @@ notebooks_test: RESOURCE_CONFIG_PATH=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} python -m testing.notebooks_test mock_assets: - ( cd python ; python -m gigl.src.mocking.dataset_asset_mocking_suite --resource_config_uri="deployment/configs/e2e_cicd_resource_config.yaml" --env test) + ( cd python ; uv run python -m gigl.src.mocking.dataset_asset_mocking_suite --resource_config_uri="deployment/configs/e2e_cicd_resource_config.yaml" --env test) format_py: - autoflake --config python/pyproject.toml ${PYTHON_DIRS} - isort --settings-path=python/pyproject.toml ${PYTHON_DIRS} - black --config=python/pyproject.toml ${PYTHON_DIRS} + uv run autoflake --config pyproject.toml ${PYTHON_DIRS} + uv run isort --settings-path=pyproject.toml ${PYTHON_DIRS} + uv run black --config=pyproject.toml ${PYTHON_DIRS} format_scala: # We run "clean" before the formatting because otherwise some "scalafix.sbt.ScalafixFailed: NoFilesError" may get thrown after switching branches... @@ -228,18 +144,20 @@ format_scala: format_md: @echo "Formatting markdown files..." - mdformat ${MD_FILES} + uv run mdformat ${MD_FILES} format: format_py format_scala format_md - type_check: - mypy ${PYTHON_DIRS} --check-untyped-defs + uv run mypy ${PYTHON_DIRS} --check-untyped-defs + +lint_test: check_format assert_yaml_configs_parse + @echo "Lint checks pass!" # compiles current working state of scala projects to local jars compile_jars: @echo "Compiling jars..." - @python -m scripts.scala_packager + @uv run python -m scripts.scala_packager # Removes local jar files from python/deps directory remove_jars: @@ -247,13 +165,13 @@ remove_jars: rm -rf python/deps/scala/subgraph_sampler/jars/* push_cpu_docker_image: - @python -m scripts.build_and_push_docker_image --predefined_type cpu --image_name ${DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG} + @uv run python -m scripts.build_and_push_docker_image --predefined_type cpu --image_name ${DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG} push_cuda_docker_image: - @python -m scripts.build_and_push_docker_image --predefined_type cuda --image_name ${DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG} + @uv run python -m scripts.build_and_push_docker_image --predefined_type cuda --image_name ${DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG} push_dataflow_docker_image: - @python -m scripts.build_and_push_docker_image --predefined_type dataflow --image_name ${DOCKER_IMAGE_DATAFLOW_RUNTIME_NAME_WITH_TAG} + @uv run python -m scripts.build_and_push_docker_image --predefined_type dataflow --image_name ${DOCKER_IMAGE_DATAFLOW_RUNTIME_NAME_WITH_TAG} push_new_docker_images: push_cuda_docker_image push_cpu_docker_image push_dataflow_docker_image # Dockerize the src code and push it to gcr. 
@@ -267,15 +185,14 @@ push_new_docker_images: push_cuda_docker_image push_cpu_docker_image push_datafl @echo "All Docker images compiled and pushed" push_dev_workbench_docker_image: compile_jars - @python -m scripts.build_and_push_docker_image --predefined_type=dev_workbench --image_name=${DEFAULT_GIGL_RELEASE_DEV_WORKBENCH_IMAGE} - + @uv run python -m scripts.build_and_push_docker_image --predefined_type=dev_workbench --image_name=${DEFAULT_GIGL_RELEASE_DEV_WORKBENCH_IMAGE} # Set compiled_pipeline path so compile_gigl_kubeflow_pipeline knows where to save the pipeline to so # that the e2e test can use it. run_cora_nalp_e2e_test: compiled_pipeline_path:=${GIGL_E2E_TEST_COMPILED_PIPELINE_PATH} run_cora_nalp_e2e_test: compile_gigl_kubeflow_pipeline run_cora_nalp_e2e_test: - python testing/e2e_tests/e2e_test.py \ + uv run python testing/e2e_tests/e2e_test.py \ --compiled_pipeline_path=$(compiled_pipeline_path) \ --test_spec_uri="testing/e2e_tests/e2e_tests.yaml" \ --test_names="cora_nalp_test" @@ -283,7 +200,7 @@ run_cora_nalp_e2e_test: run_cora_snc_e2e_test: compiled_pipeline_path:=${GIGL_E2E_TEST_COMPILED_PIPELINE_PATH} run_cora_snc_e2e_test: compile_gigl_kubeflow_pipeline run_cora_snc_e2e_test: - python testing/e2e_tests/e2e_test.py \ + uv run python testing/e2e_tests/e2e_test.py \ --compiled_pipeline_path=$(compiled_pipeline_path) \ --test_spec_uri="testing/e2e_tests/e2e_tests.yaml" \ --test_names="cora_snc_test" @@ -291,7 +208,7 @@ run_cora_snc_e2e_test: run_cora_udl_e2e_test: compiled_pipeline_path:=${GIGL_E2E_TEST_COMPILED_PIPELINE_PATH} run_cora_udl_e2e_test: compile_gigl_kubeflow_pipeline run_cora_udl_e2e_test: - python testing/e2e_tests/e2e_test.py \ + uv run python testing/e2e_tests/e2e_test.py \ --compiled_pipeline_path=$(compiled_pipeline_path) \ --test_spec_uri="testing/e2e_tests/e2e_tests.yaml" \ --test_names="cora_udl_test" @@ -299,7 +216,7 @@ run_cora_udl_e2e_test: run_dblp_nalp_e2e_test: compiled_pipeline_path:=${GIGL_E2E_TEST_COMPILED_PIPELINE_PATH} run_dblp_nalp_e2e_test: compile_gigl_kubeflow_pipeline run_dblp_nalp_e2e_test: - python testing/e2e_tests/e2e_test.py \ + uv run python testing/e2e_tests/e2e_test.py \ --compiled_pipeline_path=$(compiled_pipeline_path) \ --test_spec_uri="testing/e2e_tests/e2e_tests.yaml" \ --test_names="dblp_nalp_test" @@ -307,7 +224,7 @@ run_dblp_nalp_e2e_test: run_hom_cora_sup_e2e_test: compiled_pipeline_path:=${GIGL_E2E_TEST_COMPILED_PIPELINE_PATH} run_hom_cora_sup_e2e_test: compile_gigl_kubeflow_pipeline run_hom_cora_sup_e2e_test: - python testing/e2e_tests/e2e_test.py \ + uv run python testing/e2e_tests/e2e_test.py \ --compiled_pipeline_path=$(compiled_pipeline_path) \ --test_spec_uri="testing/e2e_tests/e2e_tests.yaml" \ --test_names="hom_cora_sup_test" @@ -315,7 +232,7 @@ run_hom_cora_sup_e2e_test: run_het_dblp_sup_e2e_test: compiled_pipeline_path:=${GIGL_E2E_TEST_COMPILED_PIPELINE_PATH} run_het_dblp_sup_e2e_test: compile_gigl_kubeflow_pipeline run_het_dblp_sup_e2e_test: - python testing/e2e_tests/e2e_test.py \ + uv run python testing/e2e_tests/e2e_test.py \ --compiled_pipeline_path=$(compiled_pipeline_path) \ --test_spec_uri="testing/e2e_tests/e2e_tests.yaml" \ --test_names="het_dblp_sup_test" @@ -323,18 +240,17 @@ run_het_dblp_sup_e2e_test: run_all_e2e_tests: compiled_pipeline_path:=${GIGL_E2E_TEST_COMPILED_PIPELINE_PATH} run_all_e2e_tests: compile_gigl_kubeflow_pipeline run_all_e2e_tests: - python testing/e2e_tests/e2e_test.py \ + uv run python testing/e2e_tests/e2e_test.py \ --compiled_pipeline_path=$(compiled_pipeline_path) \ 
--test_spec_uri="testing/e2e_tests/e2e_tests.yaml" - # Compile an instance of a kfp pipeline # If you want to compile a pipeline and save it to a specific path, set compiled_pipeline_path # Example: # `make compiled_pipeline_path="/tmp/gigl/my_pipeline.yaml" compile_gigl_kubeflow_pipeline` # Can be a GCS URI as well compile_gigl_kubeflow_pipeline: compile_jars push_new_docker_images - python -m gigl.orchestration.kubeflow.runner \ + uv run python -m gigl.orchestration.kubeflow.runner \ --action=compile \ --container_image_cuda=${DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG} \ --container_image_cpu=${DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG} \ @@ -360,7 +276,7 @@ _skip_build_deps: # compiled_pipeline_path="/tmp/gigl/my_pipeline.yaml" \ # run_dev_gnn_kubeflow_pipeline run_dev_gnn_kubeflow_pipeline: $(if $(compiled_pipeline_path), _skip_build_deps, compile_jars push_new_docker_images) - python -m gigl.orchestration.kubeflow.runner \ + uv run python -m gigl.orchestration.kubeflow.runner \ $(if $(compiled_pipeline_path),,--container_image_cuda=${DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG}) \ $(if $(compiled_pipeline_path),,--container_image_cpu=${DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG}) \ $(if $(compiled_pipeline_path),,--container_image_dataflow=${DOCKER_IMAGE_DATAFLOW_RUNTIME_NAME_WITH_TAG}) \ @@ -422,5 +338,5 @@ stop_toaster: docker buildx prune build_docs: - sphinx-build -M clean . gh_pages_build - sphinx-build -M html . gh_pages_build + uv run sphinx-build -M clean . gh_pages_build + uv run sphinx-build -M html . gh_pages_build diff --git a/containers/Dockerfile.builder b/containers/Dockerfile.builder index 9c1b08176..90820fa6f 100644 --- a/containers/Dockerfile.builder +++ b/containers/Dockerfile.builder @@ -3,7 +3,7 @@ # This dockerfile is contains all Dev dependencies, and is used by gcloud # builders for running tests, et al. -FROM condaforge/miniforge3:25.3.0-1 +FROM ubuntu:noble-20251001 SHELL ["/bin/bash", "-c"] @@ -21,12 +21,20 @@ RUN apt-get update && apt-get install && apt-get install -y \ cmake \ sudo \ build-essential \ + curl \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* - +# Dec 1, 2025 (svij-sc): +# GCP Cloud build agents run an older version of docker deamon +# with max Docker API version support of 1.41. https://docs.cloud.google.com/build/docs/overview#docker +# At the time of writing Docker Client > v28 has deprecated support for < v1.44. +# https://docs.docker.com/engine/release-notes/29/#breaking-changes +# Thus we use v28.5.2, and also manually set the API version to 1.41 to ensure compatibility. 
+ENV DOCKER_CLIENT_VERSION=28.5.2 +ENV DOCKER_API_VERSION=1.41 RUN curl -fsSL https://get.docker.com -o get-docker.sh && \ - sh get-docker.sh && \ + sh get-docker.sh --version ${DOCKER_CLIENT_VERSION} && \ rm get-docker.sh # Install Google Cloud CLI @@ -39,26 +47,34 @@ RUN mkdir -p /tools && \ ENV PATH="/tools/google-cloud-sdk/bin:/usr/lib/jvm/java-1.11.0-openjdk-amd64/bin:$PATH" ENV JAVA_HOME="/usr/lib/jvm/java-1.11.0-openjdk-amd64" -# Create the environment: -# TODO: (svij) Build env using single entrypoint `make initialize_environment` for better maintainability -RUN conda create -y --override-channels --channel conda-forge --name gigl python=3.9 pip - -# Update path so any call for python executables in the built image defaults to using the gnn conda environment -ENV PATH=/opt/conda/envs/gigl/bin:$PATH -# For debugging purposes, we also initialize respective conda env in bashrc -RUN conda init bash -RUN echo "conda activate gigl" >> ~/.bashrc - +WORKDIR /gigl_deps # We copy the tools directory from the host machine to the container # to avoid re-downloading the dependencies as some of them require GCP credentials. # and, mounting GCP credentials to build time can be a pain and more prone to # accidental leaking of credentials. -COPY tools gigl_deps/tools -COPY dep_vars.env gigl_deps/dep_vars.env -COPY requirements gigl_deps/requirements -COPY python/gigl/scripts gigl_deps/python/gigl/scripts -RUN pip install --upgrade pip -RUN cd gigl_deps && bash ./requirements/install_py_deps.sh --no-pip-cache --dev -RUN cd gigl_deps && bash ./requirements/install_scala_deps.sh +COPY tools tools +COPY pyproject.toml pyproject.toml +COPY uv.lock uv.lock +COPY dep_vars.env dep_vars.env +COPY requirements requirements +# Needed to install GLT +COPY python/gigl/scripts python/gigl/scripts + + +COPY .python-version tmp/.python-version +RUN bash ./requirements/install_py_deps.sh --dev + +# The UV_PROJECT_ENVIRONMENT environment variable can be used to configure the project virtual environment path +# Since the above command should have created the .venv, we activate by default for any future uv commands. +# We also need to set VIRTUAL_ENV so pip envocations can find the virtual environment. +ENV UV_PROJECT_ENVIRONMENT=/gigl_deps/.venv +ENV VIRTUAL_ENV="${UV_PROJECT_ENVIRONMENT}" +# We just created a virtual environment, lets add the bin to the path +ENV PATH="${UV_PROJECT_ENVIRONMENT}/bin:${PATH}" +# We also need to make UV detectable by the system +ENV PATH="/root/.local/bin:${PATH}" +RUN bash ./requirements/install_scala_deps.sh + +WORKDIR / CMD [ "/bin/bash" ] diff --git a/containers/Dockerfile.cpu.base b/containers/Dockerfile.cpu.base index 8f798b3c9..d548fd69b 100644 --- a/containers/Dockerfile.cpu.base +++ b/containers/Dockerfile.cpu.base @@ -1,9 +1,11 @@ # syntax=docker/dockerfile:1 -FROM condaforge/miniforge3:25.3.0-1 +FROM ubuntu:noble-20251001 SHELL ["/bin/bash", "-c"] +ENV DEBIAN_FRONTEND=noninteractive + # TODO(mkolodner-sc): iputils-ping temporarily needed to setup inter-job VAI communication for GLT Inference. # Once VAI natively supports this communication, we can remove this requirement. 
RUN apt-get update && apt-get install -y \ @@ -12,23 +14,31 @@ RUN apt-get update && apt-get install -y \ wget \ cmake \ iputils-ping \ + curl \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Create the environment: -# TODO: (svij) Build env using single entrypoint `make initialize_environment` for better maintainability -RUN conda create -y --override-channels --channel conda-forge --name gnn python=3.9 pip -# Update path so any call for python executables in the built image defaults to using the gnn conda environment -ENV PATH=/opt/conda/envs/gnn/bin:$PATH +WORKDIR /gigl_deps + +COPY pyproject.toml pyproject.toml +COPY uv.lock uv.lock +COPY requirements requirements +COPY python/gigl/scripts python/gigl/scripts +COPY .python-version .python-version + +RUN bash ./requirements/install_py_deps.sh -# For debugging purposes, we also initialize respective conda env in bashrc -RUN conda init bash -RUN echo "conda activate gnn" >> ~/.bashrc +# The UV_PROJECT_ENVIRONMENT environment variable can be used to configure the project virtual environment path +# Since the above command should have created the .venv, we activate by default for any future uv commands. +# We also need to set VIRTUAL_ENV so pip envocations can find the virtual environment. +ENV UV_PROJECT_ENVIRONMENT=/gigl_deps/.venv +ENV VIRTUAL_ENV="${UV_PROJECT_ENVIRONMENT}" +# We just created a virtual environment, lets add the bin to the path +ENV PATH="${UV_PROJECT_ENVIRONMENT}/bin:${PATH}" +# We also need to make UV detectable by the system +ENV PATH="/root/.local/bin:${PATH}" -COPY requirements tmp/requirements -COPY python/gigl/scripts tmp/python/gigl/scripts -RUN pip install --upgrade pip -RUN cd tmp && bash ./requirements/install_py_deps.sh --no-pip-cache +WORKDIR / CMD [ "/bin/bash" ] diff --git a/containers/Dockerfile.cuda.base b/containers/Dockerfile.cuda.base index 2e66148a7..df09e88bf 100644 --- a/containers/Dockerfile.cuda.base +++ b/containers/Dockerfile.cuda.base @@ -1,39 +1,42 @@ # syntax=docker/dockerfile:1 -# Used to generate hashed requirements.txt -FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 + +FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel SHELL ["/bin/bash", "-c"] +ENV DEBIAN_FRONTEND=noninteractive + +# Already has python 3.11 installed - no need to install it again. +# We use system python since it has packages pre-installed for us. +ENV UV_SYSTEM_PYTHON=true +ENV UV_PROJECT_ENVIRONMENT=/opt/conda/ # Install basic dependencies # TODO(mkolodner-sc): iputils-ping temporarily needed to setup inter-job VAI communication for GLT Inference. # Once VAI natively supports this communication, we can remove this requirement. 
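With `UV_SYSTEM_PYTHON=true` and `UV_PROJECT_ENVIRONMENT=/opt/conda/` as set above, uv installs into the interpreter that ships with the `pytorch/pytorch` base image rather than creating a `.venv`. A hedged sketch of what that looks like inside the image:

```bash
# Inside the CUDA base image (sketch): no virtual environment is created or activated.
command -v python                                                        # the bundled /opt/conda interpreter
python -c "import torch; print(torch.__version__, torch.version.cuda)"   # torch comes preinstalled with the base image
uv pip install -e .                                                      # targets that same system interpreter
```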
-RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get install -y build-essential git wget cmake iputils-ping \ +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + wget \ + cmake \ + iputils-ping \ + curl \ + unzip \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Install Miniconda -ENV CONDA_DIR=/opt/conda -RUN wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" && bash Miniforge3.sh -b -p ${CONDA_DIR} -ENV PATH=${CONDA_DIR}/bin:$PATH +WORKDIR /gigl_deps -# Create the conda env environment: -# TODO: (svij) Build env using single entrypoint `make initialize_environment` for better maintainability -RUN conda create -y --override-channels --channel conda-forge --name gnn python=3.9 pip +COPY pyproject.toml pyproject.toml +COPY uv.lock uv.lock +COPY requirements requirements +COPY python/gigl/scripts python/gigl/scripts -# Update path so any call for python executables in the built image defaults to using the gnn conda environment -ENV PATH=${CONDA_DIR}/envs/gnn/bin:$PATH +RUN bash ./requirements/install_py_deps.sh -# The default bashrc exists early and is mainly for local dev niceties - we delete so we can -# Setup Conda for debugging purposes -RUN rm ~/.bashrc && touch ~/.bashrc && conda init bash -RUN echo "conda activate gnn" >> ~/.bashrc +# We also need to make UV detectable by the system +ENV PATH="/root/.local/bin:${PATH}" -COPY requirements tmp/requirements -COPY python/gigl/scripts tmp/python/gigl/scripts -RUN pip install --upgrade pip -RUN cd tmp && bash ./requirements/install_py_deps.sh --no-pip-cache +WORKDIR / CMD [ "/bin/bash" ] diff --git a/containers/Dockerfile.dataflow.base b/containers/Dockerfile.dataflow.base index 5e135092e..434985331 100644 --- a/containers/Dockerfile.dataflow.base +++ b/containers/Dockerfile.dataflow.base @@ -1,5 +1,34 @@ -# Use the main Dockerfile.cpu.base as the base -ARG BASE_IMAGE -FROM $BASE_IMAGE +FROM apache/beam_python3.11_sdk:2.56.0 -COPY --from=apache/beam_python3.9_sdk:2.53.0 /opt/apache/beam /opt/apache/beam +ENV DEBIAN_FRONTEND=noninteractive + +# We use system python for dataflow images since it has python and apache beam pre-installed. +ENV UV_SYSTEM_PYTHON=true +ENV UV_PROJECT_ENVIRONMENT=/usr/local + +# TODO(mkolodner-sc): iputils-ping temporarily needed to setup inter-job VAI communication for GLT Inference. +# Once VAI natively supports this communication, we can remove this requirement. +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + wget \ + cmake \ + iputils-ping \ + curl \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + + +WORKDIR /gigl_deps + +COPY pyproject.toml pyproject.toml +COPY uv.lock uv.lock +COPY requirements requirements +COPY python/gigl/scripts python/gigl/scripts + +RUN bash ./requirements/install_py_deps.sh --skip-glt-post-install + +# We also need to make UV detectable by the system +ENV PATH="/root/.local/bin:${PATH}" + +WORKDIR / diff --git a/containers/Dockerfile.dataflow.src b/containers/Dockerfile.dataflow.src index b4d39e706..78fb6213e 100644 --- a/containers/Dockerfile.dataflow.src +++ b/containers/Dockerfile.dataflow.src @@ -2,30 +2,17 @@ ARG BASE_IMAGE FROM $BASE_IMAGE -# Ensure same as deployment/containers/Dockerfile.dataflow.src ================================================== # Copy the source WORKDIR /gigl -RUN touch __init__.py - -# Note: main package files must live in root of the repo for the python package to be built correctly for Dataflow workers. 
-# See https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/#create-reproducible-environments. COPY MANIFEST.in MANIFEST.in -COPY python/setup.py setup.py COPY pyproject.toml pyproject.toml +COPY uv.lock uv.lock COPY dep_vars.env dep_vars.env COPY deployment deployment -COPY python/snapchat snapchat +COPY python python COPY examples examples -COPY python/gigl gigl - -# enables usage of tcm as the memory allocator instead of default C memory allocators. Mainly, advantageous for CPU training jobs -# Either boosts performance or does not make any improvement compared to default settings. -# PyTorch recommendation: https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#switch-memory-allocator -# Replace `libtcmalloc` with `libjemalloc` if you want to try jem memory allocator -ENV LD_PRELOAD /opt/conda/envs/gnn/lib/libtcmalloc.so:$LD_PRELOAD -# ================================================================================================================= +RUN uv pip install -e . -# Set the entrypoint to Apache Beam SDK launcher. -ENTRYPOINT ["/opt/apache/beam/boot"] +WORKDIR / diff --git a/containers/Dockerfile.src b/containers/Dockerfile.src index 089a58996..911c3d6b7 100644 --- a/containers/Dockerfile.src +++ b/containers/Dockerfile.src @@ -4,22 +4,17 @@ FROM $BASE_IMAGE # Copy the source WORKDIR /gigl -RUN touch __init__.py # Note: main package files must live in root of the repo for the python package to be built correctly for Dataflow workers. -# See https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/#create-reproducible-environments. +# See https://beam.apache.org/documentation/sdks/python-pipxeline-dependencies/#create-reproducible-environments. +WORKDIR /gigl + COPY MANIFEST.in MANIFEST.in -COPY python/setup.py setup.py COPY pyproject.toml pyproject.toml +COPY uv.lock uv.lock COPY dep_vars.env dep_vars.env COPY deployment deployment -COPY python/snapchat snapchat +COPY python python COPY examples examples -COPY python/gigl gigl -# enables usage of tcm as the memory allocator instead of default C memory allocators. Mainly, advantageous for CPU training jobs -# Either boosts performance or does not make any improvement compared to default settings. -# PyTorch recommendation: https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#switch-memory-allocator -# Replace `libtcmalloc` with `libjemalloc` if you want to try jem memory allocator -# nshah: This creates huge memory overhead with graphlearn-torch usage. Disabling this memory allocator. -# ENV LD_PRELOAD /opt/conda/envs/gnn/lib/libtcmalloc.so:$LD_PRELOAD +RUN uv pip install -e . diff --git a/dep_vars.env b/dep_vars.env index 09af0820d..784ed142f 100644 --- a/dep_vars.env +++ b/dep_vars.env @@ -1,7 +1,7 @@ # Note this file only supports static key value pairs so it can be loaded by make, bash, python, and sbt without any additional parsing. 
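Because the file is flat `KEY=value` pairs, each consumer loads it natively: the Makefile already does `include dep_vars.env`, and a bash sketch is simply:

```bash
# dep_vars.env contains only full-line comments and KEY=value pairs, so it can be sourced directly.
set -a                 # auto-export everything that gets assigned
source dep_vars.env
set +a
echo "${DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG}"
```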
-DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2 -DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2 -DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2 +DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1 +DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1 +DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cuda:0.0.11 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cpu:0.0.11 diff --git a/deployment/configs/e2e_cicd_resource_config.yaml b/deployment/configs/e2e_cicd_resource_config.yaml index ec6fa9540..21fe6ab0a 100644 --- a/deployment/configs/e2e_cicd_resource_config.yaml +++ b/deployment/configs/e2e_cicd_resource_config.yaml @@ -44,7 +44,7 @@ split_generator_config: trainer_config: vertex_ai_trainer_config: machine_type: "n1-highmem-8" # set to `ACCELERATOR_TYPE_UNSPECIFIED` for cpu training - gpu_type: nvidia-tesla-p100 + gpu_type: NVIDIA_TESLA_T4 gpu_limit: 1 # set to 0 for cpu training num_replicas: 2 inferencer_config: diff --git a/deployment/configs/unittest_resource_config.yaml b/deployment/configs/unittest_resource_config.yaml index 2b6666ec3..c9335d81d 100644 --- a/deployment/configs/unittest_resource_config.yaml +++ b/deployment/configs/unittest_resource_config.yaml @@ -46,7 +46,7 @@ split_generator_config: trainer_config: vertex_ai_trainer_config: machine_type: "n1-highmem-8" - gpu_type: nvidia-tesla-p100 # set to `ACCELERATOR_TYPE_UNSPECIFIED` for cpu training + gpu_type: NVIDIA_TESLA_T4 # set to `ACCELERATOR_TYPE_UNSPECIFIED` for cpu training gpu_limit: 1 # set to 0 for cpu training num_replicas: 2 inferencer_config: diff --git a/docs/examples/configs/template_resource_config.yaml b/docs/examples/configs/template_resource_config.yaml index f8335b501..f87d315f8 100644 --- a/docs/examples/configs/template_resource_config.yaml +++ b/docs/examples/configs/template_resource_config.yaml @@ -31,11 +31,11 @@ split_generator_config: # Dataproc config trainer_config: vertex_ai_trainer_config: # or local_trainer_config machine_type: "n1-highmem-8" - gpu_type: nvidia-tesla-p100 + gpu_type: NVIDIA_TESLA_T4 gpu_limit: 1 num_replicas: 2 inferencer_config: num_workers: 1 max_num_workers: 256 machine_type: "c3-standard-22" - disk_size_gb: 100 \ No newline at end of file + disk_size_gb: 100 diff --git a/docs/user_guide/config_guides/resource_config_guide.md b/docs/user_guide/config_guides/resource_config_guide.md index d909650c4..decc23df1 100644 --- a/docs/user_guide/config_guides/resource_config_guide.md +++ b/docs/user_guide/config_guides/resource_config_guide.md @@ -54,7 +54,7 @@ split_generator_config: trainer_config: vertex_ai_trainer_config: machine_type: "" # e.g. 
n1-highmem-16 - gpu_type: "" # e.g. nvidia-tesla-p100 + gpu_type: "" # e.g. NVIDIA_TESLA_T4 gpu_limit: 1 num_replicas: 1 inferencer_config: diff --git a/docs/user_guide/getting_started/installation.md b/docs/user_guide/getting_started/installation.md index 16f0ed69f..1082d274b 100644 --- a/docs/user_guide/getting_started/installation.md +++ b/docs/user_guide/getting_started/installation.md @@ -41,9 +41,7 @@ Below we provide two ways to bootstrap an environment for using and/or developin 1. If on MAC, Install [Homebrew](https://brew.sh/). - 2. Install [Conda](https://github.com/conda-forge/miniforge?tab=readme-ov-file#install): - - 3. Install [Docker](https://docs.docker.com/desktop/) and the relevant `buildx` drivers (if using old versions of docker): + 2. Install [Docker](https://docs.docker.com/desktop/) and the relevant `buildx` drivers (if using old versions of docker): Once installed, ensure you can run multiarch docker builds by running following command: @@ -145,15 +143,6 @@ git clone https://github.com/Snapchat/GiGL.git From the root directory: -```bash -make initialize_environment -conda activate gnn -``` - -This creates a Python 3.9 environment with some basic utilities. Next, to install all user dependencies. Note: The -command below will try its best ot infer your environment and install necessary reqs i.e. if CUDA is available it will -try to install the necessary gpu deps, otherwise it will install cpu deps. - ```bash make install_deps ``` diff --git a/mypy.ini b/mypy.ini index 7259770b1..d488c2a83 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,6 +1,6 @@ # Global options: [mypy] -python_version = 3.9 +python_version = 3.11 # Ignore modules that don't have any existing stubs diff --git a/python/gigl/common/utils/compute/serialization/serialize_protos.py b/python/gigl/common/utils/compute/serialization/serialize_protos.py index dac0ed7d2..97bfb09cb 100644 --- a/python/gigl/common/utils/compute/serialization/serialize_protos.py +++ b/python/gigl/common/utils/compute/serialization/serialize_protos.py @@ -8,10 +8,10 @@ from snapchat.research.gbml import graph_schema_pb2 """ -In dataflow, we use wrapper object as key, value beam DoFn outputs and also for shuffle. We only -need to serialize the proto itself and not the wrapper. The proto objects also do not contain Map, -therefore can be deterministic. Which is specially important when shuffling with proto wrapper -objects as key. +In dataflow, we use wrapper object as key, value beam DoFn outputs and also for shuffle. We only +need to serialize the proto itself and not the wrapper. The proto objects also do not contain Map, +therefore can be deterministic. Which is specially important when shuffling with proto wrapper +objects as key. 
""" diff --git a/python/gigl/distributed/utils/networking.py b/python/gigl/distributed/utils/networking.py index cf733c4e3..7d2ba46b9 100644 --- a/python/gigl/distributed/utils/networking.py +++ b/python/gigl/distributed/utils/networking.py @@ -155,7 +155,7 @@ def get_internal_ip_from_node( # Other nodes will receive the master's IP via broadcast ip_list = [None] - device = "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") torch.distributed.broadcast_object_list(ip_list, src=node_rank, device=device) node_ip = ip_list[0] logger.info(f"Rank {rank} received master node's internal IP: {node_ip}") diff --git a/python/gigl/nn/models.py b/python/gigl/nn/models.py index 9fa29f62e..15adde632 100644 --- a/python/gigl/nn/models.py +++ b/python/gigl/nn/models.py @@ -397,16 +397,16 @@ def _weighted_layer_sum( Returns: torch.Tensor: Weighted sum of all layer embeddings, shape [N, D]. """ - if len(all_layer_embeddings) != len(self._layer_weights): + if len(all_layer_embeddings) != len(self._layer_weights): # type: ignore # https://github.com/Snapchat/GiGL/issues/408 raise ValueError( - f"Got {len(all_layer_embeddings)} layer tensors but {len(self._layer_weights)} weights." + f"Got {len(all_layer_embeddings)} layer tensors but {len(self._layer_weights)} weights." # type: ignore # https://github.com/Snapchat/GiGL/issues/408 ) # Stack all layer embeddings and compute weighted sum # _layer_weights is already a tensor buffer registered in __init__ stacked = torch.stack(all_layer_embeddings, dim=0) # shape [K+1, N, D] w = self._layer_weights.to(stacked.device) # shape [K+1], ensure on same device - out = (stacked * w.view(-1, 1, 1)).sum( + out = (stacked * w.view(-1, 1, 1)).sum( # type: ignore # https://github.com/Snapchat/GiGL/issues/408 dim=0 ) # shape [N, D], w_0*X_0 + w_1*X_1 + ... diff --git a/python/gigl/scripts/install_glt.sh b/python/gigl/scripts/install_glt.sh index 0ef8c4b6a..f044c6d0c 100755 --- a/python/gigl/scripts/install_glt.sh +++ b/python/gigl/scripts/install_glt.sh @@ -52,12 +52,24 @@ then if has_cuda_driver; then echo "Will use CUDA for GLT..." + + # Potential values for TORCH_CUDA_ARCH_LIST: (not all are tested) + # 6.0 = Pascal support i.e. Tesla P100 - CUDA 8 or later + # 6.1 = Pascal support i.e. Tesla P4 - CUDA 8 or later + # 7.0 = Volta support i.e. Tesla V100 - CUDA 9 or later + # 7.5 = Turing support i.e. Tesla T4 - CUDA 10 or later + # 8.0 = Ampere support i.e. A100 - CUDA 11 or later + # 8.9 = Ada Lovelace support i.e. L4 - CUDA 11.8 or later + # 9.0 = Hopper support i.e. H100 , H200 - CUDA 12.0 or later + # 10.0 = Blackwell support i.e. B200 - CUDA 12.6 or later + # 12.0 = Blackwell support i.e. RTX6000 - CUDA 12.8 or later + # List of Nvidia GPUS: https://developer.nvidia.com/cuda-gpus TORCH_CUDA_ARCH_LIST="7.5" WITH_CUDA="ON" python setup.py bdist_wheel else echo "Will use CPU for GLT..." WITH_CUDA="OFF" python setup.py bdist_wheel fi - pip install dist/*.whl \ + uv pip install dist/*.whl \ && cd .. 
\ && rm -rf graphlearn-for-pytorch else diff --git a/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py b/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py index c91532cb2..9cae41eb8 100644 --- a/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py +++ b/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py @@ -174,9 +174,12 @@ def train( early_stop_counter = 0 best_val_loss = float("inf") + assert hasattr(self.model, "graph_backend") + assert isinstance(self.model.graph_backend, GraphBackend) + graph_backend = self.model.graph_backend data_loaders: Dataloaders = self._dataloaders.get_training_dataloaders( gbml_config_pb_wrapper=gbml_config_pb_wrapper, - graph_backend=self.model.graph_backend, + graph_backend=graph_backend, device=device, ) @@ -411,9 +414,12 @@ def eval( logger.info("Start testing...") + assert hasattr(self.model, "graph_backend") + assert isinstance(self.model.graph_backend, GraphBackend) + graph_backend = self.model.graph_backend data_loaders: Dataloaders = self._dataloaders.get_test_dataloaders( gbml_config_pb_wrapper=gbml_config_pb_wrapper, - graph_backend=self.model.graph_backend, + graph_backend=graph_backend, device=device, ) diff --git a/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py b/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py index 5fa98ca95..e8c8c54d2 100644 --- a/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py +++ b/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py @@ -200,7 +200,7 @@ def score( assert root_node_labels is not None results: InferBatchResults = self.infer_batch(batch=batch, device=device) - num_correct_in_batch = int((results.predictions == root_node_labels).sum()) + num_correct_in_batch = int((results.predictions == root_node_labels).sum()) # type: ignore # https://github.com/Snapchat/GiGL/issues/408 num_correct += num_correct_in_batch num_evaluated += len(batch.root_node_labels) diff --git a/python/gigl/src/common/modeling_task_specs/utils/infer.py b/python/gigl/src/common/modeling_task_specs/utils/infer.py index 0222feb28..13804bea8 100644 --- a/python/gigl/src/common/modeling_task_specs/utils/infer.py +++ b/python/gigl/src/common/modeling_task_specs/utils/infer.py @@ -139,8 +139,8 @@ def infer_task_inputs( decoder = model.module.decode batch_result_types = model.module.tasks.result_types else: - decoder = model.decode - batch_result_types = model.tasks.result_types + decoder = model.decode # type: ignore # https://github.com/Snapchat/GiGL/issues/408 + batch_result_types = model.tasks.result_types # type: ignore # https://github.com/Snapchat/GiGL/issues/408 # If we only have losses which only require the input batch, don't forward here and return the # input batch immediately to minimize computation we don't need, such as encoding and decoding. 
diff --git a/python/gigl/src/common/models/layers/feature_interaction.py b/python/gigl/src/common/models/layers/feature_interaction.py index aa7ad737f..afa025365 100644 --- a/python/gigl/src/common/models/layers/feature_interaction.py +++ b/python/gigl/src/common/models/layers/feature_interaction.py @@ -149,7 +149,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def reset_parameters(self): for layer in self._layers: - layer.reset_parameters() + if hasattr(layer, "reset_parameters") and callable(layer.reset_parameters): + layer.reset_parameters() def __repr__(self) -> str: return f"{self.__class__.__name__}(in_dim={self._in_dim}, num_layers={self._num_layers}, projection_dim={self._projection_dim}, diag_scale={self._diag_scale}, use_bias={self._use_bias})" diff --git a/python/gigl/src/common/models/layers/loss.py b/python/gigl/src/common/models/layers/loss.py index 958e0304a..b03c150e5 100644 --- a/python/gigl/src/common/models/layers/loss.py +++ b/python/gigl/src/common/models/layers/loss.py @@ -142,7 +142,7 @@ def _calculate_softmax_loss( ) # shape=[num_pos_nodes] loss = F.cross_entropy( - input=all_scores / self.softmax_temperature, + input=all_scores / self.softmax_temperature, # type: ignore # https://github.com/Snapchat/GiGL/issues/408 target=ys, reduction="sum", ) diff --git a/python/gigl/src/common/models/layers/task.py b/python/gigl/src/common/models/layers/task.py index 35f00aec2..b82dee44b 100644 --- a/python/gigl/src/common/models/layers/task.py +++ b/python/gigl/src/common/models/layers/task.py @@ -709,7 +709,7 @@ def _get_all_tasks( for task in list(self._task_to_weights_map.keys()): fn = self._task_to_fn_map[task] weight = self._task_to_weights_map[task] - tasks_list.append((fn, weight)) + tasks_list.append((fn, weight)) # type: ignore # https://github.com/Snapchat/GiGL/issues/408 return tasks_list def add_task( diff --git a/python/gigl/src/mocking/lib/pyg_datasets_forks.py b/python/gigl/src/mocking/lib/pyg_datasets_forks.py index de026b61d..e83abfe0c 100644 --- a/python/gigl/src/mocking/lib/pyg_datasets_forks.py +++ b/python/gigl/src/mocking/lib/pyg_datasets_forks.py @@ -1,7 +1,7 @@ """ Our mocking logic uses public datasets like Cora and DBLP from PyG. PyG datasets are -downloaded from public sources which may not be available or rate-limit us. We thus -override the dataset classes to download the datasets from GCS buckets to avoid issues. +downloaded from public sources which may not be available or rate-limit us. We thus +override the dataset classes to download the datasets from GCS buckets to avoid issues. 
""" from torch_geometric.data import extract_zip diff --git a/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py b/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py index 1f38ac487..880821876 100644 --- a/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py +++ b/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py @@ -94,7 +94,7 @@ def test_early_stopping( for step_num, value in enumerate(mocked_criteria_values): has_metric_improved, should_early_stop = early_stopper.step(value=value) if model is not None: - model.foo += 1 + model.foo += 1 # type: ignore # https://github.com/Snapchat/GiGL/issues/408 if step_num in improvement_steps: self.assertTrue(has_metric_improved) else: diff --git a/requirements/install_py_deps.sh b/requirements/install_py_deps.sh index 7b881ee56..1a18ece28 100644 --- a/requirements/install_py_deps.sh +++ b/requirements/install_py_deps.sh @@ -3,8 +3,10 @@ set -e set -x DEV=0 # Flag to install dev dependencies. -PIP_ARGS="--no-deps" # We don't want to install dependencies when installing packages from hashed requirements files. -PIP_CREDENTIALS_MOUNTED=0 # When running this script in Docker environments, we may wish to mount pip credentials to install packages from a private repository. +# Flag to skip installing GiGL lib dependencies, i.e. only dev tools will be installed if DEV=1. +SKIP_GIGL_LIB_DEPS_INSTALL=0 +SKIP_GLT_POST_INSTALL=0 # Flag to skip GLT post install. if SKIP_GIGL_LIB_DEPS_INSTALL=1, overrides SKIP_GLT_POST_INSTALL to =1. + for arg in "$@" do @@ -13,32 +15,18 @@ do DEV=1 shift ;; - --no-pip-cache) - PIP_ARGS+=" --no-cache-dir" + --skip-gigl-lib-deps-install) + SKIP_GIGL_LIB_DEPS_INSTALL=1 shift ;; - --mount-pip-credentials) - PIP_CREDENTIALS_MOUNTED=1 + --skip-glt-post-install) + SKIP_GLT_POST_INSTALL=1 shift ;; esac done -REQ_FILE_PREFIX="" -if [[ $DEV -eq 1 ]] -then - echo "Recognized '--dev' flag is set. Will also install dev dependencies." - REQ_FILE_PREFIX="dev_" -fi - -if [[ $PIP_CREDENTIALS_MOUNTED -eq 1 ]] -then - echo "Recognized '--mount-pip-credentials' flag is set. Will use the mounted pip credentials (expected at /root/.pip/pip.conf)." - cp /root/.pip/pip.conf /etc/pip.conf - echo "Contents of /etc/pip.conf:" - cat /etc/pip.conf -fi - +### Helper functions ### has_cuda_driver() { # Use the whereis command to locate the CUDA driver cuda_location=$(whereis cuda) @@ -63,38 +51,30 @@ is_running_on_m1_mac() { return $? } -pip install --upgrade pip - -if is_running_on_mac; -then - echo "Setting up Mac CPU environment" - req_file="requirements/${REQ_FILE_PREFIX}darwin_arm64_requirements_unified.txt" -else - if has_cuda_driver; +### Installation Functions ### +install_uv_if_needed() { + # We use the uv package manager + # Check if uv is already installed + if ! command -v uv &> /dev/null then - echo "Setting up Linux CUDA environment" - req_file="requirements/${REQ_FILE_PREFIX}linux_cuda_requirements_unified.txt" - else - echo "Setting up Linux CPU environment" - req_file="requirements/${REQ_FILE_PREFIX}linux_cpu_requirements_unified.txt" + echo "uv could not be found. Installing uv..." + EXPECTED_SHA256="8402ab80d2ef54d7044a71ea4e4e1e8db3b20c87c7bffbc30bff59f1e80ebbd5" + curl -LsSf -o uv_installer.sh https://astral.sh/uv/0.9.5/install.sh # Matches the version in .github/actions/setup-python-tools/action.yml + + # Verify SHA256 checksum - script will exit if this fails due to set -e + if ! 
echo "$EXPECTED_SHA256 uv_installer.sh" | sha256sum -c -; then + echo "ERROR: SHA256 checksum verification failed for uv installer!" >&2 + rm -f uv_installer.sh + exit 1 + fi + + sh uv_installer.sh + rm -f uv_installer.sh + source $HOME/.local/bin/env fi -fi - -echo "Installing from ${req_file}" -pip install -r $req_file $PIP_ARGS - - -# Taken from https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script -# We do this so if `install_py_deps.sh` is run from a different directory, the script can still find the post_install.py file. -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -python $SCRIPT_DIR/../python/gigl/scripts/post_install.py - -# TODO: (svij) Check if gperftools is still needed -# https://github.com/Snapchat/GiGL/issues/296 -conda install --override-channels --channel conda-forge gperftools # tcmalloc, ref: https://google.github.io/tcmalloc/overview.html +} -if [[ $DEV -eq 1 ]] -then +install_dev_tools() { echo "Setting up required dev tooling" # Install tools needed to run spark/scala code mkdir -p tools/python_protoc @@ -102,7 +82,7 @@ then echo "Installing tooling for python protobuf" # https://github.com/protocolbuffers/protobuf/releases/tag/v3.19.6 # This version should be smaller than the one used by client i.e. the protobuf client version specified in - # common-requirements.txt file should be >= v3.19.6 + # pyproject.toml file should be >= v3.19.6 # TODO (svij-sc): update protoc + protobuff if is_running_on_mac; then @@ -113,13 +93,75 @@ then unzip -o tools/python_protoc/python_protoc_3_19_6.zip -d tools/python_protoc rm tools/python_protoc/python_protoc_3_19_6.zip -fi + echo "Finished setting up required dev tooling" +} + +install_gigl_lib_deps() { + echo "Installing GiGL lib" + extra_deps=("experimental" "transform") + if is_running_on_mac; + then + echo "Setting up Mac CPU environment" + extra_deps+=("pyg27-torch28-cpu") + else + if has_cuda_driver; + then + echo "Setting up Linux CUDA environment" + extra_deps+=("pyg27-torch28-cu128") + else + echo "Setting up Linux CPU environment" + extra_deps+=("pyg27-torch28-cpu") + fi + fi + + extra_deps_clause=() + for dep in "${extra_deps[@]}"; do + extra_deps_clause+=(--extra "$dep") + done + + flag_use_inexact_match="" + if [[ "${UV_SYSTEM_PYTHON}" == "true" ]] + then + echo "Recognized using system python." + echo "Will use inexact match for dependencies so we don't override system packages." + # Syncing is "exact" by default, which means it will remove any packages that are not present in the lockfile. + # To retain extraneous packages, use the --inexact option: + # https://docs.astral.sh/uv/concepts/projects/sync/#retaining-extraneous-packages + # This is useful for example when we might have packages pre-installed i.e. torch, pyg, etc. + flag_use_inexact_match="--inexact" + fi -if [[ $PIP_CREDENTIALS_MOUNTED -eq 1 ]] -then - echo "Removing mounted pip credentials." - rm /etc/pip.conf -fi + if [[ $DEV -eq 1 ]] + then + # https://docs.astral.sh/uv/reference/cli/#uv-sync + uv sync ${extra_deps_clause[@]} --group dev --locked ${flag_use_inexact_match} + else + uv sync ${extra_deps_clause[@]} --locked ${flag_use_inexact_match} + fi + + # Taken from https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script + # We do this so if `install_py_deps.sh` is run from a different directory, the script can still find the post_install.py file. 
+ if [[ "${SKIP_GLT_POST_INSTALL}" -eq 0 ]] + then + SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + uv run python $SCRIPT_DIR/../python/gigl/scripts/post_install.py + fi +} + +### Main Script ### +main() { + install_uv_if_needed + + if [[ $DEV -eq 1 ]] + then + install_dev_tools + fi + + if [[ $SKIP_GIGL_LIB_DEPS_INSTALL -eq 0 ]] + then + install_gigl_lib_deps + fi +} -conda clean -afy +main echo "Finished installation" diff --git a/testing/e2e_tests/e2e_tests.yaml b/testing/e2e_tests/e2e_tests.yaml index b084b9479..44b4445f0 100644 --- a/testing/e2e_tests/e2e_tests.yaml +++ b/testing/e2e_tests/e2e_tests.yaml @@ -2,16 +2,16 @@ # This file contains all the test specifications that can be run via the e2e test script tests: cora_nalp_test: - task_config_uri: "gigl/src/mocking/configs/e2e_node_anchor_based_link_prediction_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/e2e_node_anchor_based_link_prediction_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" cora_snc_test: - task_config_uri: "gigl/src/mocking/configs/e2e_supervised_node_classification_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/e2e_supervised_node_classification_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" cora_udl_test: - task_config_uri: "gigl/src/mocking/configs/e2e_udl_node_anchor_based_link_prediction_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/e2e_udl_node_anchor_based_link_prediction_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" dblp_nalp_test: - task_config_uri: "gigl/src/mocking/configs/dblp_node_anchor_based_link_prediction_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/dblp_node_anchor_based_link_prediction_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" hom_cora_sup_test: task_config_uri: "examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml" diff --git a/uv.lock b/uv.lock index e3df8684b..10f25efd8 100644 --- a/uv.lock +++ b/uv.lock @@ -893,7 +893,7 @@ requires-dist = [ { name = "numpy" }, { name = "omegaconf", specifier = ">=2.3.0,<3.0.0" }, { name = "pandas" }, - { name = "pip" }, + { name = "pip", specifier = "~=25.3" }, { name = "protobuf" }, { name = "pyarrow", marker = "extra == 'transform'", specifier = "==10.0.1" }, { name = "pyg-lib", marker = "sys_platform != 'darwin' and extra == 'pyg27-torch28-cpu'", index = "https://data.pyg.org/whl/torch-2.8.0+cpu.html", conflict = { package = "gigl", extra = "pyg27-torch28-cpu" } },