From 465ba3c685bd924967b5fad868a7f2ce6625f330 Mon Sep 17 00:00:00 2001 From: svij Date: Wed, 26 Nov 2025 05:24:03 +0000 Subject: [PATCH 01/30] update all workflows / actions --- .github/actions/setup-python-tools/action.yml | 28 +++----- .../run_command_on_active_checkout.yaml | 6 +- .github/scripts/update_docker_image_refs.sh | 15 +++++ .../workflows/build-base-docker-images.yml | 53 ++++++++++----- .github/workflows/on-pr-comment.yml | 45 ++++--------- .github/workflows/on-pr-merge.yml | 64 +++++++------------ .github/workflows/release-documentation.yml | 2 +- 7 files changed, 98 insertions(+), 115 deletions(-) create mode 100644 .github/scripts/update_docker_image_refs.sh diff --git a/.github/actions/setup-python-tools/action.yml b/.github/actions/setup-python-tools/action.yml index 9a7572a95..36e1b914c 100644 --- a/.github/actions/setup-python-tools/action.yml +++ b/.github/actions/setup-python-tools/action.yml @@ -37,28 +37,16 @@ runs: using: "composite" # needs: comment-link-to-workflow # Ensure that a comment is posted with workflow id steps: - # Step 1: Set up Python environment (Python 3.9.13). - - name: Set up Python - uses: actions/setup-python@v4 + # Step 1: Set up Python environment + - name: "Set up Python" + uses: actions/setup-python@v6 with: - # Available versions: https://raw.githubusercontent.com/actions/python-versions/main/versions-manifest.json - # Ensure to use a version that has support for arm64-darwin so we can build for Apple Silicon (macOS 14). - python-version: '3.9.13' + python-version-file: ".python-version" - # Step 2: Install pip-tools, which is used to generate hashed requirements. 
- # Note_1: pip 25.1 has a bug that causes pip-tools to fail with the following error: - # File ".../python3.9/site-packages/piptools/repositories/pypi.py", line 452, in allow_all_wheels - # self.finder.find_all_candidates.cache_clear() - # AttributeError: 'function' object has no attribute 'cache_clear' - # Note_2: Even though some wheels are guarded behind conditionals i.e. only use this if platform = linux; - # pip-tools 7.5.0 fails with the following error: - # pip._internal.exceptions.UnsupportedWheel: pyg_lib-0.4....linux_x86_64.whl is not a supported wheel on this platform. - # Thus, we fix the pip version to 25.0.1 and pip-tools version to 7.4.1. - - name: Install pip-tools - shell: bash - run: | - python -m pip install "pip==25.0.1" - python -m pip install "pip-tools==7.4.1" + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.9.5" # Matches the version in install_py_deps.sh # Step 3: Set up Gcloud AUTH using Workload Identity Federation # See following for context: https://cloud.google.com/blog/products/identity-security/enabling-keyless-authentication-from-github-actions diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index 63ea9be26..bfb24936a 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -3,7 +3,7 @@ substitutions: options: logging: CLOUD_LOGGING_ONLY steps: - - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2 + - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:4698dabd13dbf7a4bb66c02186e69311f9583714.70.1 entrypoint: /bin/bash args: - -c @@ -18,6 +18,8 @@ steps: echo "Setting up environment..." 
# gcloud runner will run as a non-root user, but all paths/profiles, etc are set up for root + mkdir -p /builder/home/.local/bin + cp -r /root/.local/bin/ /builder/home/.local/ echo "source /root/.bashrc" >> ~/.bashrc echo "source /root/.profile" >> ~/.profile @@ -26,7 +28,7 @@ steps: docker run --rm --privileged multiarch/qemu-user-static --reset -p yes gcloud auth configure-docker us-central1-docker.pkg.dev # Install GiGL - pip install -e ./python/ + uv pip install -e . # The builder operates in its own user dir, usually /workspace, # so we need to copy the gigl tools dir to the current cloud_builder's user dir. # See: containers/Dockerfile.builder. diff --git a/.github/scripts/update_docker_image_refs.sh b/.github/scripts/update_docker_image_refs.sh new file mode 100644 index 000000000..815b1eee2 --- /dev/null +++ b/.github/scripts/update_docker_image_refs.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Script to update dep_vars.env and cloud builder config with new Docker image references + +set -e + +echo "Writing new image names to dep_vars.env:" +echo " DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}" +echo " DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}" +echo " DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}" +echo " DOCKER_LATEST_BUILDER_IMAGE_NAME_WITH_TAG=${GIGL_BUILDER_IMAGE}" + +sed -i "s|^DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}|" dep_vars.env +sed -i "s|^DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}|" dep_vars.env +sed -i "s|^DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}|" dep_vars.env +sed -i "s|name: us-central1-docker\.pkg\.dev.*|name: ${GIGL_BUILDER_IMAGE}|" .github/cloud_builder/run_command_on_active_checkout.yaml diff --git a/.github/workflows/build-base-docker-images.yml 
b/.github/workflows/build-base-docker-images.yml index 9d848ecad..42efa70e8 100644 --- a/.github/workflows/build-base-docker-images.yml +++ b/.github/workflows/build-base-docker-images.yml @@ -6,6 +6,10 @@ on: pr_number: description: 'PR to run the workflow on' required: true + + # Strictly for testing purposes + # push: + env: DOCKER_BUILDKIT: 1 GIGL_BASE_CUDA_IMAGE: us-central1-docker.pkg.dev/${{ vars.GCP_PROJECT_ID }}/public-gigl/gigl-cuda-base:${{ github.sha }}.${{ github.run_number }}.${{ github.run_attempt }} @@ -16,6 +20,7 @@ env: jobs: comment-workflow-started: + if: ${{ github.event_name == 'workflow_dispatch' }} runs-on: ubuntu-latest steps: - name: Comment on PR @@ -35,13 +40,17 @@ jobs: contents: 'read' id-token: 'write' steps: - - name: Checkout PR Branch + - name: Checkout PR Branch (on-dispatch) + if: ${{ github.event_name == 'workflow_dispatch' }} uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} + - name: Checkout repository (on-push) + if: ${{ github.event_name == 'push' }} + uses: actions/checkout@v4 - name: Setup Machine for building Docker images - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" try_cleaning_disk_space: "true" @@ -63,13 +72,17 @@ jobs: contents: 'read' id-token: 'write' steps: - - name: Checkout PR Branch + - name: Checkout PR Branch (on-dispatch) + if: ${{ github.event_name == 'workflow_dispatch' }} uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} + - name: Checkout repository (on-push) + if: ${{ github.event_name == 'push' }} + uses: actions/checkout@v4 - name: Setup Machine for building Docker images - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" try_cleaning_disk_space: 
"true" @@ -97,13 +110,17 @@ jobs: contents: 'read' id-token: 'write' steps: - - name: Checkout PR Branch + - name: Checkout PR Branch (on-dispatch) + if: ${{ github.event_name == 'workflow_dispatch' }} uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} + - name: Checkout repository (on-push) + if: ${{ github.event_name == 'push' }} + uses: actions/checkout@v4 - name: Setup Machine for building Docker images - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" try_cleaning_disk_space: "true" @@ -128,22 +145,23 @@ jobs: - build-builder-image runs-on: ubuntu-latest steps: - - name: Commit and Push Base Images + + - name: Commit and Push Base Images (on-dispatch) + if: ${{ github.event_name == 'workflow_dispatch' }} uses: snapchat/gigl/.github/actions/run-command-on-pr@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} should_leave_progress_comments: "false" - command: | - echo "Writing new image names to dep_vars.env:" - echo " DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}" - echo " DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}" - echo " DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}" - echo " DOCKER_LATEST_BUILDER_IMAGE_NAME_WITH_TAG=${GIGL_BUILDER_IMAGE}" - sed -i "s|^DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}|" dep_vars.env - sed -i "s|^DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}|" dep_vars.env - sed -i "s|^DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}|" dep_vars.env - sed -i "s|name: us-central1-docker\.pkg\.dev.*|name: ${GIGL_BUILDER_IMAGE}|" 
.github/cloud_builder/run_command_on_active_checkout.yaml + command: bash .github/scripts/update_docker_image_refs.sh + + - name: Checkout repository (on-push) + if: ${{ github.event_name == 'push' }} + uses: actions/checkout@v4 + + - name: Commit and Push Base Images (on-push) + if: ${{ github.event_name == 'push' }} + run: bash .github/scripts/update_docker_image_refs.sh - name: Commit and Push Dep Vars uses: snapchat/gigl/.github/actions/commit-and-push@main @@ -152,6 +170,7 @@ jobs: github_token: ${{ secrets.GITHUB_TOKEN }} - uses: snapchat/gigl/.github/actions/comment-on-pr@main + if: ${{ github.event_name == 'workflow_dispatch' }} with: pr_number: ${{ inputs.pr_number }} message: | diff --git a/.github/workflows/on-pr-comment.yml b/.github/workflows/on-pr-comment.yml index 4d6032747..f693fb8fb 100644 --- a/.github/workflows/on-pr-comment.yml +++ b/.github/workflows/on-pr-comment.yml @@ -24,7 +24,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version-file: ".python-version" - name: Install PyYAML run: pip install PyYAML @@ -39,19 +39,18 @@ jobs: pr_number: ${{ github.event.issue.number }} message: ${{ steps.parse_commands.outputs.help_message }} - unit-test-python: - if: ${{ github.event.issue.pull_request && (contains(github.event.comment.body, '/unit_test_py') || endsWith(github.event.comment.body, '/unit_test')) }} + unit-test: + if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, '/unit_test') }} runs-on: ubuntu-latest - # TODO(kmonte): Reduce this :( - timeout-minutes: 120 + timeout-minutes: 55 steps: - - name: Run Python Unit Tests + - name: Run Unit Tests uses: snapchat/gigl/.github/actions/run-command-on-pr@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ github.event.issue.number }} should_leave_progress_comments: "true" - descriptive_workflow_name: "Python Unit Test" + descriptive_workflow_name: "Unit Test" setup_gcloud: "true" # We use cloud run here 
instead of using github hosted runners because of limitation of tests # using GFile library (a.k.a anything that does IO w/ Tensorflow). GFile does not understand @@ -62,33 +61,11 @@ jobs: workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} command: | - make unit_test_py - - unit-test-scala: - if: ${{ github.event.issue.pull_request && (contains(github.event.comment.body, '/unit_test_scala') || endsWith(github.event.comment.body, '/unit_test')) }} - runs-on: ubuntu-latest - timeout-minutes: 20 - steps: - - name: Run Scala Unit Tests - uses: snapchat/gigl/.github/actions/run-command-on-pr@main - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - pr_number: ${{ github.event.issue.number }} - should_leave_progress_comments: "true" - descriptive_workflow_name: "Scala Unit Test" - setup_gcloud: "true" - use_cloud_run: "true" - gcp_project_id: ${{ vars.GCP_PROJECT_ID }} - workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} - gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} - command: | - make unit_test_scala - + make unit_test integration-test: if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, '/integration_test') }} runs-on: ubuntu-latest - # TODO(kmonte): Reduce this :( - timeout-minutes: 120 + timeout-minutes: 70 # Tests as of 2025-05-16 are taking ~50 mins to complete, 40% buffer steps: - name: Run Integration Tests uses: snapchat/gigl/.github/actions/run-command-on-pr@main @@ -104,7 +81,6 @@ jobs: gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} command: | make integration_test - integration-e2e-test: if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, '/e2e_test') }} runs-on: ubuntu-latest @@ -164,6 +140,7 @@ jobs: workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} command: | + # 
sourcing .profile is important to resolve paths for java, sbt, et al. + # It is setup in the setup-python-tools action. source ~/.profile - make check_format - make assert_yaml_configs_parse + make lint_test diff --git a/.github/workflows/on-pr-merge.yml b/.github/workflows/on-pr-merge.yml index 75bfccd9d..c8ad6af6f 100644 --- a/.github/workflows/on-pr-merge.yml +++ b/.github/workflows/on-pr-merge.yml @@ -7,6 +7,9 @@ on: pull_request: merge_group: + # Strictly for testing purposes + push: + permissions: # Needed for gcloud auth: https://github.com/google-github-actions/auth id-token: 'write' @@ -16,89 +19,66 @@ permissions: issues: 'write' jobs: - ci-unit-test-python: + ci-unit-test: # Because of limitation discussed https://github.com/orgs/community/discussions/46757#discussioncomment-4912738 # We skip when the workflow is triggered by a pull_request event; otherwise we will run the check twice. # Once before it gets into the merge queue and once when it is in the merge queue. # Our tests take a long time to run, so this is not ideal. - if: github.event_name == 'merge_group' + # if: github.event_name == 'merge_group' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Setup development environment - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" gcp_project_id: ${{ vars.GCP_PROJECT_ID }} workload_identity_provider: ${{ secrets.workload_identity_provider }} gcp_service_account_email: ${{ secrets.gcp_service_account_email }} - - name: Run Python Unit Tests + - name: Run Unit Tests # We use cloud run here instead of using github hosted runners because of limitation of tests # using GFile library (a.k.a anything that does IO w/ Tensorflow). GFile does not understand # how to leverage Workload Identity Federation to read assets from GCS, et al. 
See: # https://github.com/tensorflow/tensorflow/issues/57104 - uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main - with: - cmd: "make unit_test_py" - service_account: ${{ secrets.gcp_service_account_email }} - project: ${{ vars.GCP_PROJECT_ID }} - - ci-unit-test-scala: - # Because of limitation discussed https://github.com/orgs/community/discussions/46757#discussioncomment-4912738 - # We skip when the workflow is triggered by a pull_request event; otherwise we will run the check twice. - # Once before it gets into the merge queue and once when it is in the merge queue. - # Our tests take a long time to run, so this is not ideal. - if: github.event_name == 'merge_group' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Setup development environment - uses: snapchat/gigl/.github/actions/setup-python-tools@main - with: - setup_gcloud: "true" - gcp_project_id: ${{ vars.GCP_PROJECT_ID }} - workload_identity_provider: ${{ secrets.workload_identity_provider }} - gcp_service_account_email: ${{ secrets.gcp_service_account_email }} - - name: Run Scala Unit Tests - uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main + uses: ./.github/actions/run-cloud-run-command-on-active-checkout with: - cmd: "make unit_test_scala" + cmd: "make unit_test" service_account: ${{ secrets.gcp_service_account_email }} project: ${{ vars.GCP_PROJECT_ID }} ci-integration-test: - if: github.event_name == 'merge_group' + # if: github.event_name == 'merge_group' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Setup development environment - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" gcp_project_id: ${{ vars.GCP_PROJECT_ID }} workload_identity_provider: ${{ secrets.workload_identity_provider }} gcp_service_account_email: ${{ secrets.gcp_service_account_email }} - name: Run Integration Tests - uses: 
snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main + uses: ./.github/actions/run-cloud-run-command-on-active-checkout with: cmd: "make integration_test" service_account: ${{ secrets.gcp_service_account_email }} project: ${{ vars.GCP_PROJECT_ID }} ci-integration-e2e-test: - if: github.event_name == 'merge_group' + # if: github.event_name == 'merge_group' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Setup development environment - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: setup_gcloud: "true" gcp_project_id: ${{ vars.GCP_PROJECT_ID }} workload_identity_provider: ${{ secrets.workload_identity_provider }} gcp_service_account_email: ${{ secrets.gcp_service_account_email }} - name: Run E2E Tests - uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main + uses: ./.github/actions/run-cloud-run-command-on-active-checkout with: cmd: "make run_all_e2e_tests" service_account: ${{ secrets.gcp_service_account_email }} @@ -111,26 +91,26 @@ jobs: # steps: # - uses: actions/checkout@v4 # - name: Setup development environment - # uses: snapchat/gigl/.github/actions/setup-python-tools@main + # uses: ./.github/actions/setup-python-tools # with: # setup_gcloud: "true" # gcp_project_id: ${{ vars.GCP_PROJECT_ID }} # workload_identity_provider: ${{ secrets.workload_identity_provider }} # gcp_service_account_email: ${{ secrets.gcp_service_account_email }} # - name: Run Example Notebook E2E Tests - # uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main + # uses: ./.github/actions/run-cloud-run-command-on-active-checkout # with: # cmd: "make notebooks_test" # service_account: ${{ secrets.gcp_service_account_email }} # project: ${{ vars.GCP_PROJECT_ID }} ci-lint-test: - if: github.event_name == 'merge_group' + # if: github.event_name == 'merge_group' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Setup 
development environment - uses: snapchat/gigl/.github/actions/setup-python-tools@main + uses: ./.github/actions/setup-python-tools with: install_dev_deps: "true" setup_gcloud: "true" @@ -138,7 +118,9 @@ jobs: workload_identity_provider: ${{ secrets.workload_identity_provider }} gcp_service_account_email: ${{ secrets.gcp_service_account_email }} - name: Run Lint Tests + shell: bash run: | + # sourcing .profile is important to resolve paths for java, sbt, et al. + # It is setup in the setup-python-tools action. source ~/.profile - make check_format - make assert_yaml_configs_parse + make lint_test diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml index 5e9345b61..bc25d47ba 100644 --- a/.github/workflows/release-documentation.yml +++ b/.github/workflows/release-documentation.yml @@ -44,7 +44,7 @@ jobs: # We also make gigl available w/ editable install `-e` so that autodoc can find it. - name: Install necessary doc dependencies run: | - pip install -e "./python[docs]" + uv sync --extra docs - name: Sphinx build run: | make build_docs From c16b9720cda7265caafd0bbcf4df93bd92604f84 Mon Sep 17 00:00:00 2001 From: svij Date: Wed, 26 Nov 2025 05:27:50 +0000 Subject: [PATCH 02/30] build base docker images --- .github/workflows/build-base-docker-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-base-docker-images.yml b/.github/workflows/build-base-docker-images.yml index 42efa70e8..d080fb10a 100644 --- a/.github/workflows/build-base-docker-images.yml +++ b/.github/workflows/build-base-docker-images.yml @@ -8,7 +8,7 @@ on: required: true # Strictly for testing purposes - # push: + push: env: DOCKER_BUILDKIT: 1 From f5947ca8e680ac787e27229cfcbff640501712d3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 26 Nov 2025 06:13:52 +0000 Subject: [PATCH 03/30] [AUTOMATED] Update dep.vars, and other relevant files with new image names --- 
.github/cloud_builder/run_command_on_active_checkout.yaml | 2 +- dep_vars.env | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index bfb24936a..1c0c8c3f8 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -3,7 +3,7 @@ substitutions: options: logging: CLOUD_LOGGING_ONLY steps: - - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:4698dabd13dbf7a4bb66c02186e69311f9583714.70.1 + - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:016001ed5ac56d11f9b3a243ec69d2ce9a8ac38a.73.1 entrypoint: /bin/bash args: - -c diff --git a/dep_vars.env b/dep_vars.env index 09af0820d..10b9c3c41 100644 --- a/dep_vars.env +++ b/dep_vars.env @@ -1,7 +1,7 @@ # Note this file only supports static key value pairs so it can be loaded by make, bash, python, and sbt without any additional parsing. 
-DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2 -DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2 -DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2 +DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:016001ed5ac56d11f9b3a243ec69d2ce9a8ac38a.73.1 +DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:016001ed5ac56d11f9b3a243ec69d2ce9a8ac38a.73.1 +DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:016001ed5ac56d11f9b3a243ec69d2ce9a8ac38a.73.1 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cuda:0.0.11 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cpu:0.0.11 From 9b51d6eb073fb0604d88b6601fd3eb81e1fa1c25 Mon Sep 17 00:00:00 2001 From: svij Date: Wed, 26 Nov 2025 06:28:27 +0000 Subject: [PATCH 04/30] migrate away from p100 --- deployment/configs/e2e_cicd_resource_config.yaml | 2 +- deployment/configs/unittest_resource_config.yaml | 2 +- docs/examples/configs/template_resource_config.yaml | 4 ++-- docs/user_guide/config_guides/resource_config_guide.md | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/deployment/configs/e2e_cicd_resource_config.yaml b/deployment/configs/e2e_cicd_resource_config.yaml index ec6fa9540..21fe6ab0a 100644 --- a/deployment/configs/e2e_cicd_resource_config.yaml +++ 
b/deployment/configs/e2e_cicd_resource_config.yaml @@ -44,7 +44,7 @@ split_generator_config: trainer_config: vertex_ai_trainer_config: machine_type: "n1-highmem-8" # set to `ACCELERATOR_TYPE_UNSPECIFIED` for cpu training - gpu_type: nvidia-tesla-p100 + gpu_type: NVIDIA_TESLA_T4 gpu_limit: 1 # set to 0 for cpu training num_replicas: 2 inferencer_config: diff --git a/deployment/configs/unittest_resource_config.yaml b/deployment/configs/unittest_resource_config.yaml index 2b6666ec3..c9335d81d 100644 --- a/deployment/configs/unittest_resource_config.yaml +++ b/deployment/configs/unittest_resource_config.yaml @@ -46,7 +46,7 @@ split_generator_config: trainer_config: vertex_ai_trainer_config: machine_type: "n1-highmem-8" - gpu_type: nvidia-tesla-p100 # set to `ACCELERATOR_TYPE_UNSPECIFIED` for cpu training + gpu_type: NVIDIA_TESLA_T4 # set to `ACCELERATOR_TYPE_UNSPECIFIED` for cpu training gpu_limit: 1 # set to 0 for cpu training num_replicas: 2 inferencer_config: diff --git a/docs/examples/configs/template_resource_config.yaml b/docs/examples/configs/template_resource_config.yaml index f8335b501..f87d315f8 100644 --- a/docs/examples/configs/template_resource_config.yaml +++ b/docs/examples/configs/template_resource_config.yaml @@ -31,11 +31,11 @@ split_generator_config: # Dataproc config trainer_config: vertex_ai_trainer_config: # or local_trainer_config machine_type: "n1-highmem-8" - gpu_type: nvidia-tesla-p100 + gpu_type: NVIDIA_TESLA_T4 gpu_limit: 1 num_replicas: 2 inferencer_config: num_workers: 1 max_num_workers: 256 machine_type: "c3-standard-22" - disk_size_gb: 100 \ No newline at end of file + disk_size_gb: 100 diff --git a/docs/user_guide/config_guides/resource_config_guide.md b/docs/user_guide/config_guides/resource_config_guide.md index d909650c4..decc23df1 100644 --- a/docs/user_guide/config_guides/resource_config_guide.md +++ b/docs/user_guide/config_guides/resource_config_guide.md @@ -54,7 +54,7 @@ split_generator_config: trainer_config: 
vertex_ai_trainer_config: machine_type: "" # e.g. n1-highmem-16 - gpu_type: "" # e.g. nvidia-tesla-p100 + gpu_type: "" # e.g. NVIDIA_TESLA_T4 gpu_limit: 1 num_replicas: 1 inferencer_config: From 1f83376e7649351b10c54cb7ae6637961f763e25 Mon Sep 17 00:00:00 2001 From: svij Date: Wed, 26 Nov 2025 17:54:43 +0000 Subject: [PATCH 05/30] try 2 --- containers/Dockerfile.cuda.base | 1 + 1 file changed, 1 insertion(+) diff --git a/containers/Dockerfile.cuda.base b/containers/Dockerfile.cuda.base index e8b8f39fe..d5f829408 100644 --- a/containers/Dockerfile.cuda.base +++ b/containers/Dockerfile.cuda.base @@ -22,6 +22,7 @@ RUN apt-get update \ WORKDIR /gigl_deps +# Copy necessary requirements files COPY pyproject.toml pyproject.toml COPY uv.lock uv.lock COPY requirements requirements From a92da09034b3c15cbec95ddca134092d66241780 Mon Sep 17 00:00:00 2001 From: svij Date: Wed, 26 Nov 2025 21:41:56 +0000 Subject: [PATCH 06/30] try 3 --- containers/Dockerfile.cuda.base | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/containers/Dockerfile.cuda.base b/containers/Dockerfile.cuda.base index d5f829408..886341bfc 100644 --- a/containers/Dockerfile.cuda.base +++ b/containers/Dockerfile.cuda.base @@ -20,6 +20,21 @@ RUN apt-get update \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + wget \ + cmake \ + iputils-ping \ + curl \ + gnupg \ + telnet \ + nano \ + net-tools \ + unzip \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + WORKDIR /gigl_deps # Copy necessary requirements files From 801498a38c10ee7dbd56e293ce8ea07b9936c304 Mon Sep 17 00:00:00 2001 From: svij Date: Wed, 26 Nov 2025 21:42:01 +0000 Subject: [PATCH 07/30] try 3 --- containers/Dockerfile.cuda.base | 6 ------ 1 file changed, 6 deletions(-) diff --git a/containers/Dockerfile.cuda.base b/containers/Dockerfile.cuda.base index 886341bfc..bec067d67 100644 --- a/containers/Dockerfile.cuda.base +++ b/containers/Dockerfile.cuda.base 
@@ -14,12 +14,6 @@ ENV UV_PROJECT_ENVIRONMENT=/opt/conda/ # Install basic dependencies # TODO(mkolodner-sc): iputils-ping temporarily needed to setup inter-job VAI communication for GLT Inference. # Once VAI natively supports this communication, we can remove this requirement. -RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get install -y build-essential git wget cmake iputils-ping curl \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - RUN apt-get update && apt-get install -y \ build-essential \ git \ From 13cbbfc6265cda9bafee2c35cf2cf612aabcd34f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 26 Nov 2025 21:55:25 +0000 Subject: [PATCH 08/30] [AUTOMATED] Update dep.vars, and other relevant files with new image names --- .github/cloud_builder/run_command_on_active_checkout.yaml | 2 +- dep_vars.env | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index 1c0c8c3f8..f2cc9f121 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -3,7 +3,7 @@ substitutions: options: logging: CLOUD_LOGGING_ONLY steps: - - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:016001ed5ac56d11f9b3a243ec69d2ce9a8ac38a.73.1 + - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:801498a38c10ee7dbd56e293ce8ea07b9936c304.79.1 entrypoint: /bin/bash args: - -c diff --git a/dep_vars.env b/dep_vars.env index 10b9c3c41..2b1716ead 100644 --- a/dep_vars.env +++ b/dep_vars.env @@ -1,7 +1,7 @@ # Note this file only supports static key value pairs so it can be loaded by make, bash, python, and sbt without any additional parsing. 
-DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:016001ed5ac56d11f9b3a243ec69d2ce9a8ac38a.73.1 -DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:016001ed5ac56d11f9b3a243ec69d2ce9a8ac38a.73.1 -DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:016001ed5ac56d11f9b3a243ec69d2ce9a8ac38a.73.1 +DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:801498a38c10ee7dbd56e293ce8ea07b9936c304.79.1 +DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:801498a38c10ee7dbd56e293ce8ea07b9936c304.79.1 +DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:801498a38c10ee7dbd56e293ce8ea07b9936c304.79.1 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cuda:0.0.11 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cpu:0.0.11 From 1633a80de2cf7deaee4d326ccf3a788de4ade82e Mon Sep 17 00:00:00 2001 From: svij Date: Wed, 26 Nov 2025 21:59:24 +0000 Subject: [PATCH 09/30] try 4 --- .github/cloud_builder/run_command_on_active_checkout.yaml | 1 + containers/Dockerfile.cuda.base | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index f2cc9f121..7e9435e75 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -24,6 +24,7 @@ steps: echo "source /root/.profile" >> ~/.profile source ~/.profile + docker version 
docker buildx create --driver=docker-container --use docker run --rm --privileged multiarch/qemu-user-static --reset -p yes gcloud auth configure-docker us-central1-docker.pkg.dev diff --git a/containers/Dockerfile.cuda.base b/containers/Dockerfile.cuda.base index bec067d67..ba4e0beed 100644 --- a/containers/Dockerfile.cuda.base +++ b/containers/Dockerfile.cuda.base @@ -21,10 +21,6 @@ RUN apt-get update && apt-get install -y \ cmake \ iputils-ping \ curl \ - gnupg \ - telnet \ - nano \ - net-tools \ unzip \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* From 24a755f9dc701ebb56534f61ac52d69ea788d75f Mon Sep 17 00:00:00 2001 From: svij Date: Sat, 29 Nov 2025 05:33:18 +0000 Subject: [PATCH 10/30] add gnupg --- containers/Dockerfile.cuda.base | 1 + 1 file changed, 1 insertion(+) diff --git a/containers/Dockerfile.cuda.base b/containers/Dockerfile.cuda.base index ba4e0beed..6d4fbbb58 100644 --- a/containers/Dockerfile.cuda.base +++ b/containers/Dockerfile.cuda.base @@ -21,6 +21,7 @@ RUN apt-get update && apt-get install -y \ cmake \ iputils-ping \ curl \ + gnupg \ unzip \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* From 23a251db6cfc43f293c50469585b0623dbc5caf1 Mon Sep 17 00:00:00 2001 From: svij Date: Sat, 29 Nov 2025 05:48:19 +0000 Subject: [PATCH 11/30] umm --- containers/Dockerfile.cuda.base | 3 +++ 1 file changed, 3 insertions(+) diff --git a/containers/Dockerfile.cuda.base b/containers/Dockerfile.cuda.base index 6d4fbbb58..bec067d67 100644 --- a/containers/Dockerfile.cuda.base +++ b/containers/Dockerfile.cuda.base @@ -22,6 +22,9 @@ RUN apt-get update && apt-get install -y \ iputils-ping \ curl \ gnupg \ + telnet \ + nano \ + net-tools \ unzip \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* From 578dc2d531164aa86976a4e218e87576e9497f85 Mon Sep 17 00:00:00 2001 From: svij Date: Sat, 29 Nov 2025 06:03:31 +0000 Subject: [PATCH 12/30] try --- containers/Dockerfile.cuda.base | 4 ---- 1 file changed, 4 deletions(-) diff --git a/containers/Dockerfile.cuda.base 
b/containers/Dockerfile.cuda.base index bec067d67..ba4e0beed 100644 --- a/containers/Dockerfile.cuda.base +++ b/containers/Dockerfile.cuda.base @@ -21,10 +21,6 @@ RUN apt-get update && apt-get install -y \ cmake \ iputils-ping \ curl \ - gnupg \ - telnet \ - nano \ - net-tools \ unzip \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* From 8b8d586232619ff3ce01cea2e1ffe08fec948f1b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 29 Nov 2025 23:31:12 +0000 Subject: [PATCH 13/30] [AUTOMATED] Update dep.vars, and other relevant files with new image names --- .github/cloud_builder/run_command_on_active_checkout.yaml | 2 +- dep_vars.env | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index 7e9435e75..908a717d5 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -3,7 +3,7 @@ substitutions: options: logging: CLOUD_LOGGING_ONLY steps: - - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:801498a38c10ee7dbd56e293ce8ea07b9936c304.79.1 + - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:578dc2d531164aa86976a4e218e87576e9497f85.83.2 entrypoint: /bin/bash args: - -c diff --git a/dep_vars.env b/dep_vars.env index 2b1716ead..2d63b7008 100644 --- a/dep_vars.env +++ b/dep_vars.env @@ -1,7 +1,7 @@ # Note this file only supports static key value pairs so it can be loaded by make, bash, python, and sbt without any additional parsing. 
-DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:801498a38c10ee7dbd56e293ce8ea07b9936c304.79.1 -DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:801498a38c10ee7dbd56e293ce8ea07b9936c304.79.1 -DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:801498a38c10ee7dbd56e293ce8ea07b9936c304.79.1 +DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:578dc2d531164aa86976a4e218e87576e9497f85.83.2 +DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:578dc2d531164aa86976a4e218e87576e9497f85.83.2 +DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:578dc2d531164aa86976a4e218e87576e9497f85.83.2 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cuda:0.0.11 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cpu:0.0.11 From 3a2f7193078b2e360e53636bec257e48eb56b9d9 Mon Sep 17 00:00:00 2001 From: svij Date: Sat, 29 Nov 2025 23:39:26 +0000 Subject: [PATCH 14/30] update docker image build ci --- .../workflows/build-base-docker-images.yml | 43 ++++++++++++++++--- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-base-docker-images.yml b/.github/workflows/build-base-docker-images.yml index d080fb10a..16463a45a 100644 --- a/.github/workflows/build-base-docker-images.yml +++ b/.github/workflows/build-base-docker-images.yml @@ -34,7 +34,7 @@ jobs: Once done, the workflow will update the `dep_vars.env` file with the new image names. 
build-cuda-base-image: - runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 4 cores, 16GB RAM, 150 GB SSD + runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 8-cores, 32GB RAM, 300 GB SSD permissions: # Needed for gcloud auth: https://github.com/google-github-actions/auth contents: 'read' @@ -65,8 +65,9 @@ jobs: docker push ${GIGL_BASE_CUDA_IMAGE} echo "Pushed CUDA base image to ${GIGL_BASE_CUDA_IMAGE}" - build-cpu-base-images: - runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 4 cores, 16GB RAM, 150 GB SSD + build-cpu-base-image: + runs-on: ubuntu-latest + # runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 8-cores, 32GB RAM, 300 GB SSD permissions: # Needed for gcloud auth: https://github.com/google-github-actions/auth contents: 'read' @@ -89,7 +90,6 @@ jobs: gcp_project_id: ${{ vars.GCP_PROJECT_ID }} workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} - - name: Build and Push CPU Base Image and Docker CPU Image run: | gcloud auth configure-docker us-central1-docker.pkg.dev @@ -98,8 +98,36 @@ jobs: docker push ${GIGL_BASE_CPU_IMAGE} echo "Pushed CPU base image to ${GIGL_BASE_CPU_IMAGE}" - echo "Will use CPU image ${GIGL_BASE_CPU_IMAGE} as base image for Dataflow image." - docker build -f ./containers/Dockerfile.dataflow.base --build-arg BASE_IMAGE=${GIGL_BASE_CPU_IMAGE} -t ${GIGL_BASE_DATAFLOW_IMAGE} . 
+ build-dataflow-base-image: + runs-on: ubuntu-latest + # runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 8-cores, 32GB RAM, 300 GB SSD + permissions: + # Needed for gcloud auth: https://github.com/google-github-actions/auth + contents: 'read' + id-token: 'write' + steps: + - name: Checkout PR Branch (on-dispatch) + if: ${{ github.event_name == 'workflow_dispatch' }} + uses: snapchat/gigl/.github/actions/checkout-pr-branch@main + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + pr_number: ${{ inputs.pr_number }} + - name: Checkout repository (on-push) + if: ${{ github.event_name == 'push' }} + uses: actions/checkout@v4 + - name: Setup Machine for building Docker images + uses: ./.github/actions/setup-python-tools + with: + setup_gcloud: "true" + try_cleaning_disk_space: "true" + gcp_project_id: ${{ vars.GCP_PROJECT_ID }} + workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} + gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} + - name: Build and Push Dataflow Base Image + run: | + gcloud auth configure-docker us-central1-docker.pkg.dev + + docker build -f ./containers/Dockerfile.dataflow.base -t ${GIGL_BASE_DATAFLOW_IMAGE} . 
docker push ${GIGL_BASE_DATAFLOW_IMAGE} echo "Pushed Dataflow base image to ${GIGL_BASE_DATAFLOW_IMAGE}" @@ -141,7 +169,8 @@ jobs: build-and-commit-base-images: needs: - build-cuda-base-image - - build-cpu-base-images + - build-cpu-base-image + - build-dataflow-base-image - build-builder-image runs-on: ubuntu-latest steps: From f2fb1e15803fc2c4a332bf6d7fb16d75df6b6059 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 29 Nov 2025 23:48:49 +0000 Subject: [PATCH 15/30] [AUTOMATED] Update dep.vars, and other relevant files with new image names --- .github/cloud_builder/run_command_on_active_checkout.yaml | 2 +- dep_vars.env | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index 908a717d5..93ff80011 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -3,7 +3,7 @@ substitutions: options: logging: CLOUD_LOGGING_ONLY steps: - - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:578dc2d531164aa86976a4e218e87576e9497f85.83.2 + - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:3a2f7193078b2e360e53636bec257e48eb56b9d9.84.1 entrypoint: /bin/bash args: - -c diff --git a/dep_vars.env b/dep_vars.env index 2d63b7008..3a1e42e9b 100644 --- a/dep_vars.env +++ b/dep_vars.env @@ -1,7 +1,7 @@ # Note this file only supports static key value pairs so it can be loaded by make, bash, python, and sbt without any additional parsing. 
-DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:578dc2d531164aa86976a4e218e87576e9497f85.83.2 -DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:578dc2d531164aa86976a4e218e87576e9497f85.83.2 -DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:578dc2d531164aa86976a4e218e87576e9497f85.83.2 +DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:3a2f7193078b2e360e53636bec257e48eb56b9d9.84.1 +DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:3a2f7193078b2e360e53636bec257e48eb56b9d9.84.1 +DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:3a2f7193078b2e360e53636bec257e48eb56b9d9.84.1 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cuda:0.0.11 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cpu:0.0.11 From 12ff5e365e7263b983aa51422346ab65f7fda97f Mon Sep 17 00:00:00 2001 From: svij Date: Mon, 1 Dec 2025 17:20:09 +0000 Subject: [PATCH 16/30] reverting changes --- .github/workflows/on-pr-comment.yml | 37 +++++++++++++++++++++++------ .github/workflows/on-pr-merge.yml | 32 ++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 10 deletions(-) diff --git a/.github/workflows/on-pr-comment.yml b/.github/workflows/on-pr-comment.yml index f693fb8fb..9061fec7d 100644 --- a/.github/workflows/on-pr-comment.yml +++ b/.github/workflows/on-pr-comment.yml @@ -39,18 +39,19 @@ jobs: pr_number: ${{ github.event.issue.number }} message: ${{ steps.parse_commands.outputs.help_message }} - unit-test: - if: 
${{ github.event.issue.pull_request && contains(github.event.comment.body, '/unit_test') }} + unit-test-python: + if: ${{ github.event.issue.pull_request && (contains(github.event.comment.body, '/unit_test_py') || endsWith(github.event.comment.body, '/unit_test')) }} runs-on: ubuntu-latest - timeout-minutes: 55 + # TODO(kmonte): Reduce this :( + timeout-minutes: 120 steps: - - name: Run Unit Tests + - name: Run Python Unit Tests uses: snapchat/gigl/.github/actions/run-command-on-pr@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ github.event.issue.number }} should_leave_progress_comments: "true" - descriptive_workflow_name: "Unit Test" + descriptive_workflow_name: "Python Unit Test" setup_gcloud: "true" # We use cloud run here instead of using github hosted runners because of limitation of tests # using GFile library (a.k.a anything that does IO w/ Tensorflow). GFile does not understand @@ -61,11 +62,33 @@ jobs: workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} command: | - make unit_test + make unit_test_py + + unit-test-scala: + if: ${{ github.event.issue.pull_request && (contains(github.event.comment.body, '/unit_test_scala') || endsWith(github.event.comment.body, '/unit_test')) }} + runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - name: Run Scala Unit Tests + uses: snapchat/gigl/.github/actions/run-command-on-pr@main + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + pr_number: ${{ github.event.issue.number }} + should_leave_progress_comments: "true" + descriptive_workflow_name: "Scala Unit Test" + setup_gcloud: "true" + use_cloud_run: "true" + gcp_project_id: ${{ vars.GCP_PROJECT_ID }} + workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} + gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} + command: | + make unit_test_scala + integration-test: if: ${{ github.event.issue.pull_request && 
contains(github.event.comment.body, '/integration_test') }} runs-on: ubuntu-latest - timeout-minutes: 70 # Tests as of 2025-05-16 are taking ~50 mins to complete, 40% buffer + # TODO(kmonte): Reduce this :( + timeout-minutes: 120 steps: - name: Run Integration Tests uses: snapchat/gigl/.github/actions/run-command-on-pr@main diff --git a/.github/workflows/on-pr-merge.yml b/.github/workflows/on-pr-merge.yml index c8ad6af6f..85f3c79e8 100644 --- a/.github/workflows/on-pr-merge.yml +++ b/.github/workflows/on-pr-merge.yml @@ -19,7 +19,7 @@ permissions: issues: 'write' jobs: - ci-unit-test: + ci-unit-test-python: # Because of limitation discussed https://github.com/orgs/community/discussions/46757#discussioncomment-4912738 # We skip when the workflow is triggered by a pull_request event; otherwise we will run the check twice. # Once before it gets into the merge queue and once when it is in the merge queue. @@ -35,14 +35,40 @@ jobs: gcp_project_id: ${{ vars.GCP_PROJECT_ID }} workload_identity_provider: ${{ secrets.workload_identity_provider }} gcp_service_account_email: ${{ secrets.gcp_service_account_email }} - - name: Run Unit Tests + - name: Run Python Unit Tests # We use cloud run here instead of using github hosted runners because of limitation of tests # using GFile library (a.k.a anything that does IO w/ Tensorflow). GFile does not understand # how to leverage Workload Identity Federation to read assets from GCS, et al. See: # https://github.com/tensorflow/tensorflow/issues/57104 uses: ./.github/actions/run-cloud-run-command-on-active-checkout with: - cmd: "make unit_test" + cmd: "make unit_test_py" + service_account: ${{ secrets.gcp_service_account_email }} + project: ${{ vars.GCP_PROJECT_ID }} + ci-unit-test-scala: + # Because of limitation discussed https://github.com/orgs/community/discussions/46757#discussioncomment-4912738 + # We skip when the workflow is triggered by a pull_request event; otherwise we will run the check twice. 
+ # Once before it gets into the merge queue and once when it is in the merge queue. + # Our tests take a long time to run, so this is not ideal. + # if: github.event_name == 'merge_group' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup development environment + uses: ./.github/actions/setup-python-tools + with: + setup_gcloud: "true" + gcp_project_id: ${{ vars.GCP_PROJECT_ID }} + workload_identity_provider: ${{ secrets.workload_identity_provider }} + gcp_service_account_email: ${{ secrets.gcp_service_account_email }} + - name: Run Scala Unit Tests + # We use cloud run here instead of using github hosted runners because of limitation of tests + # using GFile library (a.k.a anything that does IO w/ Tensorflow). GFile does not understand + # how to leverage Workload Identity Federation to read assets from GCS, et al. See: + # https://github.com/tensorflow/tensorflow/issues/57104 + uses: ./.github/actions/run-cloud-run-command-on-active-checkout + with: + cmd: "make unit_test_scala" service_account: ${{ secrets.gcp_service_account_email }} project: ${{ vars.GCP_PROJECT_ID }} From dd2356afe93dee4abb937c2b70d7d4a271c39aba Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 1 Dec 2025 17:31:24 +0000 Subject: [PATCH 17/30] [AUTOMATED] Update dep.vars, and other relevant files with new image names --- .github/cloud_builder/run_command_on_active_checkout.yaml | 2 +- dep_vars.env | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index 93ff80011..53d7cd64a 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -3,7 +3,7 @@ substitutions: options: logging: CLOUD_LOGGING_ONLY steps: - - name: 
us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:3a2f7193078b2e360e53636bec257e48eb56b9d9.84.1 + - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:12ff5e365e7263b983aa51422346ab65f7fda97f.85.1 entrypoint: /bin/bash args: - -c diff --git a/dep_vars.env b/dep_vars.env index 3a1e42e9b..78132c231 100644 --- a/dep_vars.env +++ b/dep_vars.env @@ -1,7 +1,7 @@ # Note this file only supports static key value pairs so it can be loaded by make, bash, python, and sbt without any additional parsing. -DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:3a2f7193078b2e360e53636bec257e48eb56b9d9.84.1 -DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:3a2f7193078b2e360e53636bec257e48eb56b9d9.84.1 -DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:3a2f7193078b2e360e53636bec257e48eb56b9d9.84.1 +DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:12ff5e365e7263b983aa51422346ab65f7fda97f.85.1 +DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:12ff5e365e7263b983aa51422346ab65f7fda97f.85.1 +DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:12ff5e365e7263b983aa51422346ab65f7fda97f.85.1 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cuda:0.0.11 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cpu:0.0.11 From 2af743fdff22851b3212b7f1cb5ebbc541d8bf54 Mon Sep 17 00:00:00 2001 From: svij Date: Mon, 1 Dec 2025 19:38:23 +0000 Subject: 
[PATCH 18/30] rebuild builder image --- .github/workflows/build-base-docker-images.yml | 2 +- containers/Dockerfile.builder | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-base-docker-images.yml b/.github/workflows/build-base-docker-images.yml index 16463a45a..427dd47e0 100644 --- a/.github/workflows/build-base-docker-images.yml +++ b/.github/workflows/build-base-docker-images.yml @@ -7,7 +7,7 @@ on: description: 'PR to run the workflow on' required: true - # Strictly for testing purposes + # Uncomment strictly for testing purposes push: env: diff --git a/containers/Dockerfile.builder b/containers/Dockerfile.builder index 2e27af703..45f137553 100644 --- a/containers/Dockerfile.builder +++ b/containers/Dockerfile.builder @@ -25,8 +25,16 @@ RUN apt-get update && apt-get install && apt-get install -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* +# As of Dec 1, 2025: +# GCP Cloud build agents run an older version of docker daemon +# with max Docker API version support of 1.41. https://docs.cloud.google.com/build/docs/overview#docker +# At the time of writing Docker Client > v28 has deprecated support for < v1.44. +# https://docs.docker.com/engine/release-notes/29/#breaking-changes +# Thus we use v28.5.2, and also manually set the API version to 1.41 to ensure compatibility. 
+ENV DOCKER_CLIENT_VERSION=28.5.2 +ENV DOCKER_API_VERSION=1.41 RUN curl -fsSL https://get.docker.com -o get-docker.sh && \ - sh get-docker.sh && \ + sh get-docker.sh --version ${DOCKER_CLIENT_VERSION} && \ rm get-docker.sh # Install Google Cloud CLI From 2b5accbf224638fdad8c9415e9bb88333ce1fe6e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 1 Dec 2025 19:48:05 +0000 Subject: [PATCH 19/30] [AUTOMATED] Update dep.vars, and other relevant files with new image names --- .github/cloud_builder/run_command_on_active_checkout.yaml | 2 +- dep_vars.env | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index 53d7cd64a..e417363fe 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -3,7 +3,7 @@ substitutions: options: logging: CLOUD_LOGGING_ONLY steps: - - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:12ff5e365e7263b983aa51422346ab65f7fda97f.85.1 + - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:2af743fdff22851b3212b7f1cb5ebbc541d8bf54.86.1 entrypoint: /bin/bash args: - -c diff --git a/dep_vars.env b/dep_vars.env index 78132c231..a8b1dd918 100644 --- a/dep_vars.env +++ b/dep_vars.env @@ -1,7 +1,7 @@ # Note this file only supports static key value pairs so it can be loaded by make, bash, python, and sbt without any additional parsing. 
-DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:12ff5e365e7263b983aa51422346ab65f7fda97f.85.1 -DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:12ff5e365e7263b983aa51422346ab65f7fda97f.85.1 -DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:12ff5e365e7263b983aa51422346ab65f7fda97f.85.1 +DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:2af743fdff22851b3212b7f1cb5ebbc541d8bf54.86.1 +DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:2af743fdff22851b3212b7f1cb5ebbc541d8bf54.86.1 +DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:2af743fdff22851b3212b7f1cb5ebbc541d8bf54.86.1 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cuda:0.0.11 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cpu:0.0.11 From ea58f5b7551e319d606be4f3d2e19238fb0bfc32 Mon Sep 17 00:00:00 2001 From: svij Date: Mon, 1 Dec 2025 20:05:43 +0000 Subject: [PATCH 20/30] push lock --- .github/workflows/build-base-docker-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-base-docker-images.yml b/.github/workflows/build-base-docker-images.yml index 427dd47e0..6114f4ced 100644 --- a/.github/workflows/build-base-docker-images.yml +++ b/.github/workflows/build-base-docker-images.yml @@ -8,7 +8,7 @@ on: required: true # Uncomment strictly for testing purposes - push: + # push: env: DOCKER_BUILDKIT: 1 From fbe84b17fce632621f54e8fbcafa2d621794e199 Mon Sep 17 00:00:00 2001 From: svij 
Date: Mon, 1 Dec 2025 21:22:37 +0000 Subject: [PATCH 21/30] remove init --- containers/Dockerfile.dataflow.src | 2 -- containers/Dockerfile.src | 1 - 2 files changed, 3 deletions(-) diff --git a/containers/Dockerfile.dataflow.src b/containers/Dockerfile.dataflow.src index f1c2e7a74..78fb6213e 100644 --- a/containers/Dockerfile.dataflow.src +++ b/containers/Dockerfile.dataflow.src @@ -5,8 +5,6 @@ FROM $BASE_IMAGE # Copy the source WORKDIR /gigl -RUN touch __init__.py - COPY MANIFEST.in MANIFEST.in COPY pyproject.toml pyproject.toml COPY uv.lock uv.lock diff --git a/containers/Dockerfile.src b/containers/Dockerfile.src index ebab68c77..48480392c 100644 --- a/containers/Dockerfile.src +++ b/containers/Dockerfile.src @@ -8,7 +8,6 @@ WORKDIR /gigl # Note: main package files must live in root of the repo for the python package to be built correctly for Dataflow workers. # See https://beam.apache.org/documentation/sdks/python-pipxeline-dependencies/#create-reproducible-environments. WORKDIR /gigl -RUN touch __init__.py COPY MANIFEST.in MANIFEST.in COPY pyproject.toml pyproject.toml From 116b899e4e78c014b45d1876455237d7b8b3bbe3 Mon Sep 17 00:00:00 2001 From: svij Date: Mon, 1 Dec 2025 23:28:52 +0000 Subject: [PATCH 22/30] test --- Makefile | 2 +- containers/Dockerfile.src | 2 -- testing/e2e_tests/e2e_tests.yaml | 8 ++++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index aba4bf01a..6c0c75fde 100644 --- a/Makefile +++ b/Makefile @@ -75,7 +75,7 @@ assert_yaml_configs_parse: # Ex. `make unit_test_py PY_TEST_FILES="eval_metrics_test.py"` # By default, runs all tests under python/tests/unit. # See the help text for "--test_file_pattern" in python/tests/test_args.py for more details. 
-unit_test_py: clean_build_files_py type_check +unit_test_py: clean_build_files_py # type_check # TODO (svij) Uncomment prior to merging ( cd python ; \ uv run python -m tests.unit.main \ --env=test \ diff --git a/containers/Dockerfile.src b/containers/Dockerfile.src index 48480392c..911c3d6b7 100644 --- a/containers/Dockerfile.src +++ b/containers/Dockerfile.src @@ -18,5 +18,3 @@ COPY python python COPY examples examples RUN uv pip install -e . - -WORKDIR / diff --git a/testing/e2e_tests/e2e_tests.yaml b/testing/e2e_tests/e2e_tests.yaml index b084b9479..44b4445f0 100644 --- a/testing/e2e_tests/e2e_tests.yaml +++ b/testing/e2e_tests/e2e_tests.yaml @@ -2,16 +2,16 @@ # This file contains all the test specifications that can be run via the e2e test script tests: cora_nalp_test: - task_config_uri: "gigl/src/mocking/configs/e2e_node_anchor_based_link_prediction_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/e2e_node_anchor_based_link_prediction_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" cora_snc_test: - task_config_uri: "gigl/src/mocking/configs/e2e_supervised_node_classification_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/e2e_supervised_node_classification_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" cora_udl_test: - task_config_uri: "gigl/src/mocking/configs/e2e_udl_node_anchor_based_link_prediction_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/e2e_udl_node_anchor_based_link_prediction_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" dblp_nalp_test: - task_config_uri: "gigl/src/mocking/configs/dblp_node_anchor_based_link_prediction_template_gbml_config.yaml" + task_config_uri: 
"python/gigl/src/mocking/configs/dblp_node_anchor_based_link_prediction_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" hom_cora_sup_test: task_config_uri: "examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml" From 8d1eb64658b39acb445b76f00ea1b5b919fe4497 Mon Sep 17 00:00:00 2001 From: svij Date: Wed, 3 Dec 2025 00:16:54 +0000 Subject: [PATCH 23/30] comment --- Makefile | 2 +- containers/Dockerfile.cuda.base | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index fe3f4af75..ca0c56fb0 100644 --- a/Makefile +++ b/Makefile @@ -75,7 +75,7 @@ assert_yaml_configs_parse: # Ex. `make unit_test_py PY_TEST_FILES="eval_metrics_test.py"` # By default, runs all tests under python/tests/unit. # See the help text for "--test_file_pattern" in python/tests/test_args.py for more details. -unit_test_py: clean_build_files_py # type_check # TODO (svij) Uncomment prior to merging +unit_test_py: clean_build_files_py # type_check # TODO (svij-sc) Fixed in https://github.com/Snapchat/GiGL/pull/401 ( cd python ; \ uv run python -m tests.unit.main \ --env=test \ diff --git a/containers/Dockerfile.cuda.base b/containers/Dockerfile.cuda.base index ba4e0beed..df09e88bf 100644 --- a/containers/Dockerfile.cuda.base +++ b/containers/Dockerfile.cuda.base @@ -27,7 +27,6 @@ RUN apt-get update && apt-get install -y \ WORKDIR /gigl_deps -# Copy necessary requirements files COPY pyproject.toml pyproject.toml COPY uv.lock uv.lock COPY requirements requirements From d4f0a0e7ad347a3bfcd7f24d667e3fe4645aa345 Mon Sep 17 00:00:00 2001 From: svij Date: Fri, 5 Dec 2025 02:55:03 +0000 Subject: [PATCH 24/30] push new images --- .github/workflows/build-base-docker-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-base-docker-images.yml b/.github/workflows/build-base-docker-images.yml index 6114f4ced..427dd47e0 100644 
--- a/.github/workflows/build-base-docker-images.yml +++ b/.github/workflows/build-base-docker-images.yml @@ -8,7 +8,7 @@ on: required: true # Uncomment strictly for testing purposes - # push: + push: env: DOCKER_BUILDKIT: 1 From 51af343c1c298ab465a96ecffd4e50ea6dffacb7 Mon Sep 17 00:00:00 2001 From: svij Date: Fri, 5 Dec 2025 03:10:31 +0000 Subject: [PATCH 25/30] lock pip --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index e3df8684b..10f25efd8 100644 --- a/uv.lock +++ b/uv.lock @@ -893,7 +893,7 @@ requires-dist = [ { name = "numpy" }, { name = "omegaconf", specifier = ">=2.3.0,<3.0.0" }, { name = "pandas" }, - { name = "pip" }, + { name = "pip", specifier = "~=25.3" }, { name = "protobuf" }, { name = "pyarrow", marker = "extra == 'transform'", specifier = "==10.0.1" }, { name = "pyg-lib", marker = "sys_platform != 'darwin' and extra == 'pyg27-torch28-cpu'", index = "https://data.pyg.org/whl/torch-2.8.0+cpu.html", conflict = { package = "gigl", extra = "pyg27-torch28-cpu" } }, From 15f1eb92af998e64419d0f5df55a1aa16e54886c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 5 Dec 2025 03:19:41 +0000 Subject: [PATCH 26/30] [AUTOMATED] Update dep.vars, and other relevant files with new image names --- .github/cloud_builder/run_command_on_active_checkout.yaml | 2 +- dep_vars.env | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index e417363fe..91135a88e 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -3,7 +3,7 @@ substitutions: options: logging: CLOUD_LOGGING_ONLY steps: - - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:2af743fdff22851b3212b7f1cb5ebbc541d8bf54.86.1 + - name: 
us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1 entrypoint: /bin/bash args: - -c diff --git a/dep_vars.env b/dep_vars.env index a8b1dd918..784ed142f 100644 --- a/dep_vars.env +++ b/dep_vars.env @@ -1,7 +1,7 @@ # Note this file only supports static key value pairs so it can be loaded by make, bash, python, and sbt without any additional parsing. -DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:2af743fdff22851b3212b7f1cb5ebbc541d8bf54.86.1 -DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:2af743fdff22851b3212b7f1cb5ebbc541d8bf54.86.1 -DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:2af743fdff22851b3212b7f1cb5ebbc541d8bf54.86.1 +DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1 +DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1 +DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cuda:0.0.11 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cpu:0.0.11 From 84fe7f6f2f61c51981bc94ce37c350d5ce551eb9 Mon Sep 17 00:00:00 2001 From: svij Date: Mon, 8 Dec 2025 21:41:56 +0000 Subject: [PATCH 27/30] revert on push --- .../workflows/build-base-docker-images.yml | 34 +++---------------- .github/workflows/on-pr-merge.yml | 13 
+++---- 2 files changed, 10 insertions(+), 37 deletions(-) diff --git a/.github/workflows/build-base-docker-images.yml b/.github/workflows/build-base-docker-images.yml index 427dd47e0..c799a896d 100644 --- a/.github/workflows/build-base-docker-images.yml +++ b/.github/workflows/build-base-docker-images.yml @@ -7,9 +7,6 @@ on: description: 'PR to run the workflow on' required: true - # Uncomment strictly for testing purposes - push: - env: DOCKER_BUILDKIT: 1 GIGL_BASE_CUDA_IMAGE: us-central1-docker.pkg.dev/${{ vars.GCP_PROJECT_ID }}/public-gigl/gigl-cuda-base:${{ github.sha }}.${{ github.run_number }}.${{ github.run_attempt }} @@ -40,15 +37,12 @@ jobs: contents: 'read' id-token: 'write' steps: - - name: Checkout PR Branch (on-dispatch) + - name: Checkout PR Branch if: ${{ github.event_name == 'workflow_dispatch' }} uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} - - name: Checkout repository (on-push) - if: ${{ github.event_name == 'push' }} - uses: actions/checkout@v4 - name: Setup Machine for building Docker images uses: ./.github/actions/setup-python-tools with: @@ -67,21 +61,17 @@ jobs: build-cpu-base-image: runs-on: ubuntu-latest - # runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 8-cores, 32GB RAM, 300 GB SSD permissions: # Needed for gcloud auth: https://github.com/google-github-actions/auth contents: 'read' id-token: 'write' steps: - - name: Checkout PR Branch (on-dispatch) + - name: Checkout PR Branch if: ${{ github.event_name == 'workflow_dispatch' }} uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} - - name: Checkout repository (on-push) - if: ${{ github.event_name == 'push' }} - uses: actions/checkout@v4 - name: Setup Machine for building Docker images uses: ./.github/actions/setup-python-tools with: @@ -100,21 +90,17 @@ jobs: build-dataflow-base-image: runs-on: 
ubuntu-latest - # runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 8-cores, 32GB RAM, 300 GB SSD permissions: # Needed for gcloud auth: https://github.com/google-github-actions/auth contents: 'read' id-token: 'write' steps: - - name: Checkout PR Branch (on-dispatch) + - name: Checkout PR Branch if: ${{ github.event_name == 'workflow_dispatch' }} uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} - - name: Checkout repository (on-push) - if: ${{ github.event_name == 'push' }} - uses: actions/checkout@v4 - name: Setup Machine for building Docker images uses: ./.github/actions/setup-python-tools with: @@ -138,15 +124,12 @@ jobs: contents: 'read' id-token: 'write' steps: - - name: Checkout PR Branch (on-dispatch) + - name: Checkout PR Branch if: ${{ github.event_name == 'workflow_dispatch' }} uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} - - name: Checkout repository (on-push) - if: ${{ github.event_name == 'push' }} - uses: actions/checkout@v4 - name: Setup Machine for building Docker images uses: ./.github/actions/setup-python-tools with: @@ -175,7 +158,7 @@ jobs: runs-on: ubuntu-latest steps: - - name: Commit and Push Base Images (on-dispatch) + - name: Commit and Push Base Images if: ${{ github.event_name == 'workflow_dispatch' }} uses: snapchat/gigl/.github/actions/run-command-on-pr@main with: @@ -184,13 +167,6 @@ jobs: should_leave_progress_comments: "false" command: bash .github/scripts/update_docker_image_refs.sh - - name: Checkout repository (on-push) - if: ${{ github.event_name == 'push' }} - uses: actions/checkout@v4 - - - name: Commit and Push Base Images (on-push) - if: ${{ github.event_name == 'push' }} - run: bash .github/scripts/update_docker_image_refs.sh - name: Commit and Push Dep Vars uses: snapchat/gigl/.github/actions/commit-and-push@main diff --git 
a/.github/workflows/on-pr-merge.yml b/.github/workflows/on-pr-merge.yml index 85f3c79e8..4ca2fe96d 100644 --- a/.github/workflows/on-pr-merge.yml +++ b/.github/workflows/on-pr-merge.yml @@ -7,9 +7,6 @@ on: pull_request: merge_group: - # Strictly for testing purposes - push: - permissions: # Needed for gcloud auth: https://github.com/google-github-actions/auth id-token: 'write' @@ -24,7 +21,7 @@ jobs: # We skip when the workflow is triggered by a pull_request event; otherwise we will run the check twice. # Once before it gets into the merge queue and once when it is in the merge queue. # Our tests take a long time to run, so this is not ideal. - # if: github.event_name == 'merge_group' + if: github.event_name == 'merge_group' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -50,7 +47,7 @@ jobs: # We skip when the workflow is triggered by a pull_request event; otherwise we will run the check twice. # Once before it gets into the merge queue and once when it is in the merge queue. # Our tests take a long time to run, so this is not ideal. 
- # if: github.event_name == 'merge_group' + if: github.event_name == 'merge_group' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -73,7 +70,7 @@ jobs: project: ${{ vars.GCP_PROJECT_ID }} ci-integration-test: - # if: github.event_name == 'merge_group' + if: github.event_name == 'merge_group' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -92,7 +89,7 @@ jobs: project: ${{ vars.GCP_PROJECT_ID }} ci-integration-e2e-test: - # if: github.event_name == 'merge_group' + if: github.event_name == 'merge_group' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -131,7 +128,7 @@ jobs: # project: ${{ vars.GCP_PROJECT_ID }} ci-lint-test: - # if: github.event_name == 'merge_group' + if: github.event_name == 'merge_group' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 From 2aed4258cdc7d6d7c684ec6070a8e2fda305a137 Mon Sep 17 00:00:00 2001 From: svij Date: Mon, 8 Dec 2025 21:44:25 +0000 Subject: [PATCH 28/30] revert --- .github/workflows/build-base-docker-images.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-base-docker-images.yml b/.github/workflows/build-base-docker-images.yml index c799a896d..2f4a00e94 100644 --- a/.github/workflows/build-base-docker-images.yml +++ b/.github/workflows/build-base-docker-images.yml @@ -17,7 +17,7 @@ env: jobs: comment-workflow-started: - if: ${{ github.event_name == 'workflow_dispatch' }} + runs-on: ubuntu-latest steps: - name: Comment on PR @@ -38,7 +38,7 @@ jobs: id-token: 'write' steps: - name: Checkout PR Branch - if: ${{ github.event_name == 'workflow_dispatch' }} + uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -67,7 +67,7 @@ jobs: id-token: 'write' steps: - name: Checkout PR Branch - if: ${{ github.event_name == 'workflow_dispatch' }} + uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -96,7 +96,7 @@ jobs: 
id-token: 'write' steps: - name: Checkout PR Branch - if: ${{ github.event_name == 'workflow_dispatch' }} + uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -125,7 +125,7 @@ jobs: id-token: 'write' steps: - name: Checkout PR Branch - if: ${{ github.event_name == 'workflow_dispatch' }} + uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -159,7 +159,7 @@ jobs: steps: - name: Commit and Push Base Images - if: ${{ github.event_name == 'workflow_dispatch' }} + uses: snapchat/gigl/.github/actions/run-command-on-pr@main with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -175,7 +175,7 @@ jobs: github_token: ${{ secrets.GITHUB_TOKEN }} - uses: snapchat/gigl/.github/actions/comment-on-pr@main - if: ${{ github.event_name == 'workflow_dispatch' }} + with: pr_number: ${{ inputs.pr_number }} message: | From e406f1eaa5af461310011f6d4cf5120630a8fe05 Mon Sep 17 00:00:00 2001 From: svij Date: Mon, 8 Dec 2025 21:50:43 +0000 Subject: [PATCH 29/30] new lines --- .github/workflows/build-base-docker-images.yml | 11 ----------- .github/workflows/on-pr-comment.yml | 1 + .github/workflows/on-pr-merge.yml | 1 + 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-base-docker-images.yml b/.github/workflows/build-base-docker-images.yml index 2f4a00e94..faec56e8e 100644 --- a/.github/workflows/build-base-docker-images.yml +++ b/.github/workflows/build-base-docker-images.yml @@ -38,7 +38,6 @@ jobs: id-token: 'write' steps: - name: Checkout PR Branch - uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -67,7 +66,6 @@ jobs: id-token: 'write' steps: - name: Checkout PR Branch - uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -96,7 +94,6 @@ jobs: id-token: 'write' steps: - name: Checkout PR Branch - uses: 
snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -112,7 +109,6 @@ jobs: - name: Build and Push Dataflow Base Image run: | gcloud auth configure-docker us-central1-docker.pkg.dev - docker build -f ./containers/Dockerfile.dataflow.base -t ${GIGL_BASE_DATAFLOW_IMAGE} . docker push ${GIGL_BASE_DATAFLOW_IMAGE} echo "Pushed Dataflow base image to ${GIGL_BASE_DATAFLOW_IMAGE}" @@ -125,7 +121,6 @@ jobs: id-token: 'write' steps: - name: Checkout PR Branch - uses: snapchat/gigl/.github/actions/checkout-pr-branch@main with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -157,25 +152,19 @@ jobs: - build-builder-image runs-on: ubuntu-latest steps: - - name: Commit and Push Base Images - uses: snapchat/gigl/.github/actions/run-command-on-pr@main with: github-token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ inputs.pr_number }} should_leave_progress_comments: "false" command: bash .github/scripts/update_docker_image_refs.sh - - - name: Commit and Push Dep Vars uses: snapchat/gigl/.github/actions/commit-and-push@main with: commit_message: "[AUTOMATED] Update dep.vars, and other relevant files with new image names" github_token: ${{ secrets.GITHUB_TOKEN }} - - uses: snapchat/gigl/.github/actions/comment-on-pr@main - with: pr_number: ${{ inputs.pr_number }} message: | diff --git a/.github/workflows/on-pr-comment.yml b/.github/workflows/on-pr-comment.yml index 9061fec7d..cc57aed64 100644 --- a/.github/workflows/on-pr-comment.yml +++ b/.github/workflows/on-pr-comment.yml @@ -104,6 +104,7 @@ jobs: gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} command: | make integration_test + integration-e2e-test: if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, '/e2e_test') }} runs-on: ubuntu-latest diff --git a/.github/workflows/on-pr-merge.yml b/.github/workflows/on-pr-merge.yml index 4ca2fe96d..0e1f9ddd0 100644 --- a/.github/workflows/on-pr-merge.yml +++ b/.github/workflows/on-pr-merge.yml @@ 
-42,6 +42,7 @@ jobs: cmd: "make unit_test_py" service_account: ${{ secrets.gcp_service_account_email }} project: ${{ vars.GCP_PROJECT_ID }} + ci-unit-test-scala: # Because of limitation discussed https://github.com/orgs/community/discussions/46757#discussioncomment-4912738 # We skip when the workflow is triggered by a pull_request event; otherwise we will run the check twice. From 0e77ed502d6f384a7ef3798ee7ab5ced06ff8b78 Mon Sep 17 00:00:00 2001 From: svij-sc Date: Fri, 12 Dec 2025 10:29:57 -0800 Subject: [PATCH 30/30] [6/6 Deps Update] Type check and format (#401) Co-authored-by: github-actions[bot] --- Makefile | 4 ++-- mypy.ini | 2 +- .../utils/compute/serialization/serialize_protos.py | 8 ++++---- python/gigl/distributed/utils/networking.py | 2 +- python/gigl/nn/models.py | 6 +++--- .../graphsage_template_modeling_spec.py | 10 ++++++++-- .../node_classification_modeling_task_spec.py | 2 +- .../gigl/src/common/modeling_task_specs/utils/infer.py | 4 ++-- .../src/common/models/layers/feature_interaction.py | 3 ++- python/gigl/src/common/models/layers/loss.py | 2 +- python/gigl/src/common/models/layers/task.py | 2 +- python/gigl/src/mocking/lib/pyg_datasets_forks.py | 4 ++-- .../common/modeling_task_spec_utils/early_stop_test.py | 2 +- 13 files changed, 29 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index 18922bb7c..8d11a4e50 100644 --- a/Makefile +++ b/Makefile @@ -75,7 +75,7 @@ assert_yaml_configs_parse: # Ex. `make unit_test_py PY_TEST_FILES="eval_metrics_test.py"` # By default, runs all tests under python/tests/unit. # See the help text for "--test_file_pattern" in python/tests/test_args.py for more details. 
-unit_test_py: clean_build_files_py # type_check # TODO (svij-sc) Fixed in https://github.com/Snapchat/GiGL/pull/401 +unit_test_py: clean_build_files_py type_check ( cd python ; \ uv run python -m tests.unit.main \ --env=test \ @@ -151,7 +151,7 @@ format: format_py format_scala format_md type_check: uv run mypy ${PYTHON_DIRS} --check-untyped-defs -lint_test: check_format assert_yaml_config_parse +lint_test: check_format assert_yaml_configs_parse @echo "Lint checks pass!" # compiles current working state of scala projects to local jars diff --git a/mypy.ini b/mypy.ini index 7259770b1..d488c2a83 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,6 +1,6 @@ # Global options: [mypy] -python_version = 3.9 +python_version = 3.11 # Ignore modules that don't have any existing stubs diff --git a/python/gigl/common/utils/compute/serialization/serialize_protos.py b/python/gigl/common/utils/compute/serialization/serialize_protos.py index dac0ed7d2..97bfb09cb 100644 --- a/python/gigl/common/utils/compute/serialization/serialize_protos.py +++ b/python/gigl/common/utils/compute/serialization/serialize_protos.py @@ -8,10 +8,10 @@ from snapchat.research.gbml import graph_schema_pb2 """ -In dataflow, we use wrapper object as key, value beam DoFn outputs and also for shuffle. We only -need to serialize the proto itself and not the wrapper. The proto objects also do not contain Map, -therefore can be deterministic. Which is specially important when shuffling with proto wrapper -objects as key. +In dataflow, we use wrapper object as key, value beam DoFn outputs and also for shuffle. We only +need to serialize the proto itself and not the wrapper. The proto objects also do not contain Map, +therefore can be deterministic. Which is specially important when shuffling with proto wrapper +objects as key. 
""" diff --git a/python/gigl/distributed/utils/networking.py b/python/gigl/distributed/utils/networking.py index cf733c4e3..7d2ba46b9 100644 --- a/python/gigl/distributed/utils/networking.py +++ b/python/gigl/distributed/utils/networking.py @@ -155,7 +155,7 @@ def get_internal_ip_from_node( # Other nodes will receive the master's IP via broadcast ip_list = [None] - device = "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") torch.distributed.broadcast_object_list(ip_list, src=node_rank, device=device) node_ip = ip_list[0] logger.info(f"Rank {rank} received master node's internal IP: {node_ip}") diff --git a/python/gigl/nn/models.py b/python/gigl/nn/models.py index 9fa29f62e..15adde632 100644 --- a/python/gigl/nn/models.py +++ b/python/gigl/nn/models.py @@ -397,16 +397,16 @@ def _weighted_layer_sum( Returns: torch.Tensor: Weighted sum of all layer embeddings, shape [N, D]. """ - if len(all_layer_embeddings) != len(self._layer_weights): + if len(all_layer_embeddings) != len(self._layer_weights): # type: ignore # https://github.com/Snapchat/GiGL/issues/408 raise ValueError( - f"Got {len(all_layer_embeddings)} layer tensors but {len(self._layer_weights)} weights." + f"Got {len(all_layer_embeddings)} layer tensors but {len(self._layer_weights)} weights." # type: ignore # https://github.com/Snapchat/GiGL/issues/408 ) # Stack all layer embeddings and compute weighted sum # _layer_weights is already a tensor buffer registered in __init__ stacked = torch.stack(all_layer_embeddings, dim=0) # shape [K+1, N, D] w = self._layer_weights.to(stacked.device) # shape [K+1], ensure on same device - out = (stacked * w.view(-1, 1, 1)).sum( + out = (stacked * w.view(-1, 1, 1)).sum( # type: ignore # https://github.com/Snapchat/GiGL/issues/408 dim=0 ) # shape [N, D], w_0*X_0 + w_1*X_1 + ... 
diff --git a/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py b/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py index c91532cb2..9cae41eb8 100644 --- a/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py +++ b/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py @@ -174,9 +174,12 @@ def train( early_stop_counter = 0 best_val_loss = float("inf") + assert hasattr(self.model, "graph_backend") + assert isinstance(self.model.graph_backend, GraphBackend) + graph_backend = self.model.graph_backend data_loaders: Dataloaders = self._dataloaders.get_training_dataloaders( gbml_config_pb_wrapper=gbml_config_pb_wrapper, - graph_backend=self.model.graph_backend, + graph_backend=graph_backend, device=device, ) @@ -411,9 +414,12 @@ def eval( logger.info("Start testing...") + assert hasattr(self.model, "graph_backend") + assert isinstance(self.model.graph_backend, GraphBackend) + graph_backend = self.model.graph_backend data_loaders: Dataloaders = self._dataloaders.get_test_dataloaders( gbml_config_pb_wrapper=gbml_config_pb_wrapper, - graph_backend=self.model.graph_backend, + graph_backend=graph_backend, device=device, ) diff --git a/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py b/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py index 5fa98ca95..e8c8c54d2 100644 --- a/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py +++ b/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py @@ -200,7 +200,7 @@ def score( assert root_node_labels is not None results: InferBatchResults = self.infer_batch(batch=batch, device=device) - num_correct_in_batch = int((results.predictions == root_node_labels).sum()) + num_correct_in_batch = int((results.predictions == root_node_labels).sum()) # type: ignore # https://github.com/Snapchat/GiGL/issues/408 
num_correct += num_correct_in_batch num_evaluated += len(batch.root_node_labels) diff --git a/python/gigl/src/common/modeling_task_specs/utils/infer.py b/python/gigl/src/common/modeling_task_specs/utils/infer.py index 0222feb28..13804bea8 100644 --- a/python/gigl/src/common/modeling_task_specs/utils/infer.py +++ b/python/gigl/src/common/modeling_task_specs/utils/infer.py @@ -139,8 +139,8 @@ def infer_task_inputs( decoder = model.module.decode batch_result_types = model.module.tasks.result_types else: - decoder = model.decode - batch_result_types = model.tasks.result_types + decoder = model.decode # type: ignore # https://github.com/Snapchat/GiGL/issues/408 + batch_result_types = model.tasks.result_types # type: ignore # https://github.com/Snapchat/GiGL/issues/408 # If we only have losses which only require the input batch, don't forward here and return the # input batch immediately to minimize computation we don't need, such as encoding and decoding. diff --git a/python/gigl/src/common/models/layers/feature_interaction.py b/python/gigl/src/common/models/layers/feature_interaction.py index aa7ad737f..afa025365 100644 --- a/python/gigl/src/common/models/layers/feature_interaction.py +++ b/python/gigl/src/common/models/layers/feature_interaction.py @@ -149,7 +149,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def reset_parameters(self): for layer in self._layers: - layer.reset_parameters() + if hasattr(layer, "reset_parameters") and callable(layer.reset_parameters): + layer.reset_parameters() def __repr__(self) -> str: return f"{self.__class__.__name__}(in_dim={self._in_dim}, num_layers={self._num_layers}, projection_dim={self._projection_dim}, diag_scale={self._diag_scale}, use_bias={self._use_bias})" diff --git a/python/gigl/src/common/models/layers/loss.py b/python/gigl/src/common/models/layers/loss.py index 958e0304a..b03c150e5 100644 --- a/python/gigl/src/common/models/layers/loss.py +++ b/python/gigl/src/common/models/layers/loss.py @@ -142,7 +142,7 @@ 
def _calculate_softmax_loss( ) # shape=[num_pos_nodes] loss = F.cross_entropy( - input=all_scores / self.softmax_temperature, + input=all_scores / self.softmax_temperature, # type: ignore # https://github.com/Snapchat/GiGL/issues/408 target=ys, reduction="sum", ) diff --git a/python/gigl/src/common/models/layers/task.py b/python/gigl/src/common/models/layers/task.py index 35f00aec2..b82dee44b 100644 --- a/python/gigl/src/common/models/layers/task.py +++ b/python/gigl/src/common/models/layers/task.py @@ -709,7 +709,7 @@ def _get_all_tasks( for task in list(self._task_to_weights_map.keys()): fn = self._task_to_fn_map[task] weight = self._task_to_weights_map[task] - tasks_list.append((fn, weight)) + tasks_list.append((fn, weight)) # type: ignore # https://github.com/Snapchat/GiGL/issues/408 return tasks_list def add_task( diff --git a/python/gigl/src/mocking/lib/pyg_datasets_forks.py b/python/gigl/src/mocking/lib/pyg_datasets_forks.py index de026b61d..e83abfe0c 100644 --- a/python/gigl/src/mocking/lib/pyg_datasets_forks.py +++ b/python/gigl/src/mocking/lib/pyg_datasets_forks.py @@ -1,7 +1,7 @@ """ Our mocking logic uses public datasets like Cora and DBLP from PyG. PyG datasets are -downloaded from public sources which may not be available or rate-limit us. We thus -override the dataset classes to download the datasets from GCS buckets to avoid issues. +downloaded from public sources which may not be available or rate-limit us. We thus +override the dataset classes to download the datasets from GCS buckets to avoid issues. 
""" from torch_geometric.data import extract_zip diff --git a/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py b/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py index 1f38ac487..880821876 100644 --- a/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py +++ b/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py @@ -94,7 +94,7 @@ def test_early_stopping( for step_num, value in enumerate(mocked_criteria_values): has_metric_improved, should_early_stop = early_stopper.step(value=value) if model is not None: - model.foo += 1 + model.foo += 1 # type: ignore # https://github.com/Snapchat/GiGL/issues/408 if step_num in improvement_steps: self.assertTrue(has_metric_improved) else: