diff --git a/.github/actions/setup-python-tools/action.yml b/.github/actions/setup-python-tools/action.yml
index 9a7572a95..36e1b914c 100644
--- a/.github/actions/setup-python-tools/action.yml
+++ b/.github/actions/setup-python-tools/action.yml
@@ -37,28 +37,16 @@ runs:
   using: "composite"
   # needs: comment-link-to-workflow # Ensure that a comment is posted with workflow id
   steps:
-    # Step 1: Set up Python environment (Python 3.9.13).
-    - name: Set up Python
-      uses: actions/setup-python@v4
+    # Step 1: Set up Python environment
+    - name: "Set up Python"
+      uses: actions/setup-python@v6
       with:
-        # Available versions: https://raw.githubusercontent.com/actions/python-versions/main/versions-manifest.json
-        # Ensure to use a version that has support for arm64-darwin so we can build for Apple Silicon (macOS 14).
-        python-version: '3.9.13'
+        python-version-file: ".python-version"

-    # Step 2: Install pip-tools, which is used to generate hashed requirements.
-    # Note_1: pip 25.1 has a bug that causes pip-tools to fail with the following error:
-    #   File ".../python3.9/site-packages/piptools/repositories/pypi.py", line 452, in allow_all_wheels
-    #     self.finder.find_all_candidates.cache_clear()
-    #   AttributeError: 'function' object has no attribute 'cache_clear'
-    # Note_2: Even though some wheels are guarded behind conditionals i.e. only use this if platform = linux;
-    #   pip-tools 7.5.0 fails with the following error:
-    #   pip._internal.exceptions.UnsupportedWheel: pyg_lib-0.4....linux_x86_64.whl is not a supported wheel on this platform.
-    # Thus, we fix the pip version to 25.0.1 and pip-tools version to 7.4.1.
-    - name: Install pip-tools
-      shell: bash
-      run: |
-        python -m pip install "pip==25.0.1"
-        python -m pip install "pip-tools==7.4.1"
+    - name: Install uv
+      uses: astral-sh/setup-uv@v6
+      with:
+        version: "0.9.5" # Matches the version in install_py_deps.sh

   # Step 3: Set up Gcloud AUTH using Workload Identity Federation
   # See following for context: https://cloud.google.com/blog/products/identity-security/enabling-keyless-authentication-from-github-actions
diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml
index 63ea9be26..91135a88e 100644
--- a/.github/cloud_builder/run_command_on_active_checkout.yaml
+++ b/.github/cloud_builder/run_command_on_active_checkout.yaml
@@ -3,7 +3,7 @@ substitutions:
 options:
   logging: CLOUD_LOGGING_ONLY
 steps:
-  - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2
+  - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1
     entrypoint: /bin/bash
     args:
       - -c
@@ -18,15 +18,18 @@ steps:
        echo "Setting up environment..."
        # gcloud runner will run as a non-root user, but all paths/profiles, etc are set up for root
+       mkdir -p /builder/home/.local/bin
+       cp -r /root/.local/bin/ /builder/home/.local/
        echo "source /root/.bashrc" >> ~/.bashrc
        echo "source /root/.profile" >> ~/.profile
        source ~/.profile
+       docker version
        docker buildx create --driver=docker-container --use
        docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
        gcloud auth configure-docker us-central1-docker.pkg.dev

        # Install GiGL
-       pip install -e ./python/
+       uv pip install -e .

        # The builder operates in its own user dir, usually /workspace,
        # so we need to copy the gigl tools dir to the current cloud_builder's user dir.
        # See: containers/Dockerfile.builder.
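Reviewer note: the composite action now resolves Python from the repo-level `.python-version` file instead of a hard-coded `3.9.13`, and pins uv to the version `install_py_deps.sh` uses. For readers unfamiliar with `actions/setup-python`'s `python-version-file` input, it simply reads a single version string from that file. A minimal illustrative sketch of that contract; the sanity check itself is hypothetical and not part of this PR:

```python
import platform
from pathlib import Path

# .python-version holds one version string, e.g. "3.11" or "3.11.9".
pinned = Path(".python-version").read_text().strip()

# Purely illustrative: confirm the running interpreter matches the pinned prefix.
running = platform.python_version()  # e.g. "3.11.9"
print(f"pinned={pinned!r} running={running!r} match={running.startswith(pinned)}")
```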
diff --git a/.github/scripts/update_docker_image_refs.sh b/.github/scripts/update_docker_image_refs.sh
new file mode 100644
index 000000000..815b1eee2
--- /dev/null
+++ b/.github/scripts/update_docker_image_refs.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Script to update dep_vars.env and cloud builder config with new Docker image references
+
+set -e
+
+echo "Writing new image names to dep_vars.env:"
+echo " DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}"
+echo " DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}"
+echo " DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}"
+echo " DOCKER_LATEST_BUILDER_IMAGE_NAME_WITH_TAG=${GIGL_BUILDER_IMAGE}"
+
+sed -i "s|^DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}|" dep_vars.env
+sed -i "s|^DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}|" dep_vars.env
+sed -i "s|^DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}|" dep_vars.env
+sed -i "s|name: us-central1-docker\.pkg\.dev.*|name: ${GIGL_BUILDER_IMAGE}|" .github/cloud_builder/run_command_on_active_checkout.yaml
diff --git a/.github/workflows/build-base-docker-images.yml b/.github/workflows/build-base-docker-images.yml
index 9d848ecad..faec56e8e 100644
--- a/.github/workflows/build-base-docker-images.yml
+++ b/.github/workflows/build-base-docker-images.yml
@@ -6,6 +6,7 @@ on:
       pr_number:
         description: 'PR to run the workflow on'
         required: true
+
 env:
   DOCKER_BUILDKIT: 1
   GIGL_BASE_CUDA_IMAGE: us-central1-docker.pkg.dev/${{ vars.GCP_PROJECT_ID }}/public-gigl/gigl-cuda-base:${{ github.sha }}.${{ github.run_number }}.${{ github.run_attempt }}
@@ -16,6 +17,7 @@ env:

 jobs:
   comment-workflow-started:
+
     runs-on: ubuntu-latest
     steps:
       - name: Comment on PR
@@ -29,7 +31,7 @@ jobs:
             Once done, the workflow will update the `dep_vars.env` file with the new image names.
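Reviewer note: `update_docker_image_refs.sh` assumes the `GIGL_*` variables are exported by the calling workflow's `env:` block. The `sed` commands use `|` as the substitution delimiter so the slashes in Artifact Registry image paths need no escaping, and the `^` anchor limits each rewrite to the matching key. A minimal Python sketch of the same rewrite semantics, with a hypothetical image value, for readers less fluent in sed:

```python
import re

# Hypothetical value; in CI this arrives via the GIGL_BASE_CUDA_IMAGE env var.
new_cuda_image = "us-central1-docker.pkg.dev/some-project/public-gigl/gigl-cuda-base:abc123.1.1"

env_text = (
    "DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=old-image:tag\n"
    "SOME_OTHER_KEY=untouched\n"
)

# Equivalent of: sed -i "s|^DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=.*|...|" dep_vars.env
updated = re.sub(
    r"^DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=.*$",
    f"DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG={new_cuda_image}",
    env_text,
    flags=re.MULTILINE,  # make ^ and $ match per line, like sed's line-oriented model
)
print(updated)
```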
   build-cuda-base-image:
-    runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 4 cores, 16GB RAM, 150 GB SSD
+    runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 8 cores, 32GB RAM, 300 GB SSD
     permissions:
       # Needed for gcloud auth: https://github.com/google-github-actions/auth
       contents: 'read'
@@ -41,7 +43,7 @@ jobs:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           pr_number: ${{ inputs.pr_number }}
       - name: Setup Machine for building Docker images
-        uses: snapchat/gigl/.github/actions/setup-python-tools@main
+        uses: ./.github/actions/setup-python-tools
         with:
           setup_gcloud: "true"
           try_cleaning_disk_space: "true"
@@ -56,8 +58,8 @@ jobs:
           docker push ${GIGL_BASE_CUDA_IMAGE}
           echo "Pushed CUDA base image to ${GIGL_BASE_CUDA_IMAGE}"

-  build-cpu-base-images:
-    runs-on: gigl-large-instances # x64 Ubuntu:latest w/ 4 cores, 16GB RAM, 150 GB SSD
+  build-cpu-base-image:
+    runs-on: ubuntu-latest
     permissions:
       # Needed for gcloud auth: https://github.com/google-github-actions/auth
       contents: 'read'
@@ -69,14 +71,13 @@ jobs:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           pr_number: ${{ inputs.pr_number }}
       - name: Setup Machine for building Docker images
-        uses: snapchat/gigl/.github/actions/setup-python-tools@main
+        uses: ./.github/actions/setup-python-tools
         with:
           setup_gcloud: "true"
           try_cleaning_disk_space: "true"
           gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
           workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}
           gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}
-
       - name: Build and Push CPU Base Image and Docker CPU Image
         run: |
           gcloud auth configure-docker us-central1-docker.pkg.dev
@@ -85,8 +86,30 @@ jobs:
           docker push ${GIGL_BASE_CPU_IMAGE}
           echo "Pushed CPU base image to ${GIGL_BASE_CPU_IMAGE}"

-          echo "Will use CPU image ${GIGL_BASE_CPU_IMAGE} as base image for Dataflow image."
-          docker build -f ./containers/Dockerfile.dataflow.base --build-arg BASE_IMAGE=${GIGL_BASE_CPU_IMAGE} -t ${GIGL_BASE_DATAFLOW_IMAGE} .
+  build-dataflow-base-image:
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed for gcloud auth: https://github.com/google-github-actions/auth
+      contents: 'read'
+      id-token: 'write'
+    steps:
+      - name: Checkout PR Branch
+        uses: snapchat/gigl/.github/actions/checkout-pr-branch@main
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          pr_number: ${{ inputs.pr_number }}
+      - name: Setup Machine for building Docker images
+        uses: ./.github/actions/setup-python-tools
+        with:
+          setup_gcloud: "true"
+          try_cleaning_disk_space: "true"
+          gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
+          workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}
+          gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}
+      - name: Build and Push Dataflow Base Image
+        run: |
+          gcloud auth configure-docker us-central1-docker.pkg.dev
+          docker build -f ./containers/Dockerfile.dataflow.base -t ${GIGL_BASE_DATAFLOW_IMAGE} .
+          docker push ${GIGL_BASE_DATAFLOW_IMAGE}
+          echo "Pushed Dataflow base image to ${GIGL_BASE_DATAFLOW_IMAGE}"
@@ -103,7 +126,7 @@ jobs:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           pr_number: ${{ inputs.pr_number }}
       - name: Setup Machine for building Docker images
-        uses: snapchat/gigl/.github/actions/setup-python-tools@main
+        uses: ./.github/actions/setup-python-tools
         with:
           setup_gcloud: "true"
           try_cleaning_disk_space: "true"
@@ -124,7 +147,8 @@ jobs:
   build-and-commit-base-images:
     needs:
       - build-cuda-base-image
-      - build-cpu-base-images
+      - build-cpu-base-image
+      - build-dataflow-base-image
       - build-builder-image
     runs-on: ubuntu-latest
     steps:
@@ -134,23 +158,12 @@ jobs:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           pr_number: ${{ inputs.pr_number }}
           should_leave_progress_comments: "false"
-          command: |
-            echo "Writing new image names to dep_vars.env:"
-            echo " DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}"
-            echo " DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}"
-            echo " DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}"
-            echo " DOCKER_LATEST_BUILDER_IMAGE_NAME_WITH_TAG=${GIGL_BUILDER_IMAGE}"
-            sed -i "s|^DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CUDA_IMAGE}|" dep_vars.env
-            sed -i "s|^DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=${GIGL_BASE_CPU_IMAGE}|" dep_vars.env
-            sed -i "s|^DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=.*|DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=${GIGL_BASE_DATAFLOW_IMAGE}|" dep_vars.env
-            sed -i "s|name: us-central1-docker\.pkg\.dev.*|name: ${GIGL_BUILDER_IMAGE}|" .github/cloud_builder/run_command_on_active_checkout.yaml
-
+          command: bash .github/scripts/update_docker_image_refs.sh
       - name: Commit and Push Dep Vars
         uses: snapchat/gigl/.github/actions/commit-and-push@main
         with:
           commit_message: "[AUTOMATED] Update dep.vars, and other relevant files with new image names"
           github_token: ${{ secrets.GITHUB_TOKEN }}
-
       - uses: snapchat/gigl/.github/actions/comment-on-pr@main
         with:
           pr_number: ${{ inputs.pr_number }}
diff --git a/.github/workflows/on-pr-comment.yml b/.github/workflows/on-pr-comment.yml
index 4d6032747..cc57aed64 100644
--- a/.github/workflows/on-pr-comment.yml
+++ b/.github/workflows/on-pr-comment.yml
@@ -24,7 +24,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: '3.x'
+          python-version-file: ".python-version"
       - name: Install PyYAML
         run: pip install PyYAML

@@ -164,6 +164,7 @@ jobs:
           workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}
           gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}
           command: |
+            # sourcing .profile is important to resolve paths for java, sbt, et al.
+            # It is set up in the setup-python-tools action.
             source ~/.profile
-            make check_format
-            make assert_yaml_configs_parse
+            make lint_test
diff --git a/.github/workflows/on-pr-merge.yml b/.github/workflows/on-pr-merge.yml
index 75bfccd9d..0e1f9ddd0 100644
--- a/.github/workflows/on-pr-merge.yml
+++ b/.github/workflows/on-pr-merge.yml
@@ -26,7 +26,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Setup development environment
-        uses: snapchat/gigl/.github/actions/setup-python-tools@main
+        uses: ./.github/actions/setup-python-tools
         with:
           setup_gcloud: "true"
           gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
@@ -37,7 +37,7 @@ jobs:
         # using GFile library (a.k.a anything that does IO w/ Tensorflow). GFile does not understand
         # how to leverage Workload Identity Federation to read assets from GCS, et al. See:
         # https://github.com/tensorflow/tensorflow/issues/57104
-        uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main
+        uses: ./.github/actions/run-cloud-run-command-on-active-checkout
        with:
          cmd: "make unit_test_py"
          service_account: ${{ secrets.gcp_service_account_email }}
@@ -53,14 +53,18 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Setup development environment
-        uses: snapchat/gigl/.github/actions/setup-python-tools@main
+        uses: ./.github/actions/setup-python-tools
         with:
           setup_gcloud: "true"
           gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
           workload_identity_provider: ${{ secrets.workload_identity_provider }}
           gcp_service_account_email: ${{ secrets.gcp_service_account_email }}
       - name: Run Scala Unit Tests
-        uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main
+        # We use cloud run here instead of using github hosted runners because of limitation of tests
+        # using GFile library (a.k.a anything that does IO w/ Tensorflow). GFile does not understand
+        # how to leverage Workload Identity Federation to read assets from GCS, et al. See:
+        # https://github.com/tensorflow/tensorflow/issues/57104
+        uses: ./.github/actions/run-cloud-run-command-on-active-checkout
         with:
           cmd: "make unit_test_scala"
           service_account: ${{ secrets.gcp_service_account_email }}
@@ -72,14 +76,14 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Setup development environment
-        uses: snapchat/gigl/.github/actions/setup-python-tools@main
+        uses: ./.github/actions/setup-python-tools
         with:
           setup_gcloud: "true"
           gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
           workload_identity_provider: ${{ secrets.workload_identity_provider }}
           gcp_service_account_email: ${{ secrets.gcp_service_account_email }}
       - name: Run Integration Tests
-        uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main
+        uses: ./.github/actions/run-cloud-run-command-on-active-checkout
         with:
           cmd: "make integration_test"
           service_account: ${{ secrets.gcp_service_account_email }}
@@ -91,14 +95,14 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Setup development environment
-        uses: snapchat/gigl/.github/actions/setup-python-tools@main
+        uses: ./.github/actions/setup-python-tools
         with:
           setup_gcloud: "true"
           gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
           workload_identity_provider: ${{ secrets.workload_identity_provider }}
           gcp_service_account_email: ${{ secrets.gcp_service_account_email }}
       - name: Run E2E Tests
-        uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main
+        uses: ./.github/actions/run-cloud-run-command-on-active-checkout
         with:
           cmd: "make run_all_e2e_tests"
           service_account: ${{ secrets.gcp_service_account_email }}
@@ -111,14 +115,14 @@ jobs:
   #   steps:
   #     - uses: actions/checkout@v4
   #     - name: Setup development environment
-  #       uses: snapchat/gigl/.github/actions/setup-python-tools@main
+  #       uses: ./.github/actions/setup-python-tools
   #       with:
   #         setup_gcloud: "true"
   #         gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
   #         workload_identity_provider: ${{ secrets.workload_identity_provider }}
   #         gcp_service_account_email: ${{ secrets.gcp_service_account_email }}
   #     - name: Run Example Notebook E2E Tests
-  #       uses: snapchat/gigl/.github/actions/run-cloud-run-command-on-active-checkout@main
+  #       uses: ./.github/actions/run-cloud-run-command-on-active-checkout
   #       with:
   #         cmd: "make notebooks_test"
   #         service_account: ${{ secrets.gcp_service_account_email }}
@@ -130,7 +134,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Setup development environment
-        uses: snapchat/gigl/.github/actions/setup-python-tools@main
+        uses: ./.github/actions/setup-python-tools
         with:
           install_dev_deps: "true"
           setup_gcloud: "true"
@@ -138,7 +142,9 @@ jobs:
           workload_identity_provider: ${{ secrets.workload_identity_provider }}
           gcp_service_account_email: ${{ secrets.gcp_service_account_email }}
       - name: Run Lint Tests
+        shell: bash
         run: |
+          # sourcing .profile is important to resolve paths for java, sbt, et al.
+          # It is set up in the setup-python-tools action.
           source ~/.profile
-          make check_format
-          make assert_yaml_configs_parse
+          make lint_test
diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml
index 5e9345b61..bc25d47ba 100644
--- a/.github/workflows/release-documentation.yml
+++ b/.github/workflows/release-documentation.yml
@@ -44,7 +44,7 @@ jobs:
       # We also make gigl available w/ editable install `-e` so that autodoc can find it.
       - name: Install necessary doc dependencies
         run: |
-          pip install -e "./python[docs]"
+          uv sync --extra docs
       - name: Sphinx build
         run: |
           make build_docs
diff --git a/Makefile b/Makefile
index aa06a1155..8d11a4e50 100644
--- a/Makefile
+++ b/Makefile
@@ -151,7 +151,7 @@ format: format_py format_scala format_md
 type_check:
	uv run mypy ${PYTHON_DIRS} --check-untyped-defs

-lint_test: check_format assert_yaml_config_parse
+lint_test: check_format assert_yaml_configs_parse
	@echo "Lint checks pass!"

 # compiles current working state of scala projects to local jars
diff --git a/dep_vars.env b/dep_vars.env
index 09af0820d..784ed142f 100644
--- a/dep_vars.env
+++ b/dep_vars.env
@@ -1,7 +1,7 @@
 # Note this file only supports static key value pairs so it can be loaded by make, bash, python, and sbt without any additional parsing.
-DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2
-DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2
-DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:6a94ae7cad3ec0c633246b0c9340a5095527deb9.63.2
+DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1
+DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1
+DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:51af343c1c298ab465a96ecffd4e50ea6dffacb7.88.1
 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cuda:0.0.11
 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cpu:0.0.11
diff --git a/deployment/configs/e2e_cicd_resource_config.yaml b/deployment/configs/e2e_cicd_resource_config.yaml
index ec6fa9540..21fe6ab0a 100644
--- a/deployment/configs/e2e_cicd_resource_config.yaml
+++ b/deployment/configs/e2e_cicd_resource_config.yaml
@@ -44,7 +44,7 @@ split_generator_config:
 trainer_config:
   vertex_ai_trainer_config:
     machine_type: "n1-highmem-8" # set to `ACCELERATOR_TYPE_UNSPECIFIED` for cpu training
-    gpu_type: nvidia-tesla-p100
+    gpu_type: NVIDIA_TESLA_T4
     gpu_limit: 1 # set to 0 for cpu training
     num_replicas: 2
 inferencer_config:
diff --git a/deployment/configs/unittest_resource_config.yaml b/deployment/configs/unittest_resource_config.yaml
index 2b6666ec3..c9335d81d 100644
--- a/deployment/configs/unittest_resource_config.yaml
+++ b/deployment/configs/unittest_resource_config.yaml
@@ -46,7 +46,7 @@ split_generator_config:
 trainer_config:
   vertex_ai_trainer_config:
     machine_type: "n1-highmem-8"
-    gpu_type: nvidia-tesla-p100 # set to `ACCELERATOR_TYPE_UNSPECIFIED` for cpu training
+    gpu_type: NVIDIA_TESLA_T4 # set to `ACCELERATOR_TYPE_UNSPECIFIED` for cpu training
     gpu_limit: 1 # set to 0 for cpu training
     num_replicas: 2
 inferencer_config:
diff --git a/docs/examples/configs/template_resource_config.yaml b/docs/examples/configs/template_resource_config.yaml
index f8335b501..f87d315f8 100644
--- a/docs/examples/configs/template_resource_config.yaml
+++ b/docs/examples/configs/template_resource_config.yaml
@@ -31,11 +31,11 @@ split_generator_config: # Dataproc config
 trainer_config:
   vertex_ai_trainer_config: # or local_trainer_config
     machine_type: "n1-highmem-8"
-    gpu_type: nvidia-tesla-p100
+    gpu_type: NVIDIA_TESLA_T4
     gpu_limit: 1
     num_replicas: 2
 inferencer_config:
   num_workers: 1
   max_num_workers: 256
   machine_type: "c3-standard-22"
-  disk_size_gb: 100
\ No newline at end of file
+  disk_size_gb: 100
diff --git a/docs/user_guide/config_guides/resource_config_guide.md b/docs/user_guide/config_guides/resource_config_guide.md
index d909650c4..decc23df1 100644
--- a/docs/user_guide/config_guides/resource_config_guide.md
+++ b/docs/user_guide/config_guides/resource_config_guide.md
@@ -54,7 +54,7 @@ split_generator_config:
 trainer_config:
   vertex_ai_trainer_config:
     machine_type: "" # e.g. n1-highmem-16
-    gpu_type: "" # e.g. nvidia-tesla-p100
+    gpu_type: "" # e.g. NVIDIA_TESLA_T4
     gpu_limit: 1
     num_replicas: 1
 inferencer_config:
diff --git a/mypy.ini b/mypy.ini
index 7259770b1..d488c2a83 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,6 +1,6 @@
 # Global options:

 [mypy]
-python_version = 3.9
+python_version = 3.11

 # Ignore modules that don't have any existing stubs
diff --git a/python/gigl/common/utils/compute/serialization/serialize_protos.py b/python/gigl/common/utils/compute/serialization/serialize_protos.py
index dac0ed7d2..97bfb09cb 100644
--- a/python/gigl/common/utils/compute/serialization/serialize_protos.py
+++ b/python/gigl/common/utils/compute/serialization/serialize_protos.py
@@ -8,10 +8,10 @@ from snapchat.research.gbml import graph_schema_pb2

 """
-In dataflow, we use wrapper object as key, value beam DoFn outputs and also for shuffle. We only 
-need to serialize the proto itself and not the wrapper. The proto objects also do not contain Map, 
-therefore can be deterministic. Which is specially important when shuffling with proto wrapper 
-objects as key. 
+In dataflow, we use wrapper object as key, value beam DoFn outputs and also for shuffle. We only
+need to serialize the proto itself and not the wrapper. The proto objects also do not contain Map,
+therefore can be deterministic. Which is specially important when shuffling with proto wrapper
+objects as key.
 """
diff --git a/python/gigl/distributed/utils/networking.py b/python/gigl/distributed/utils/networking.py
index cf733c4e3..7d2ba46b9 100644
--- a/python/gigl/distributed/utils/networking.py
+++ b/python/gigl/distributed/utils/networking.py
@@ -155,7 +155,7 @@ def get_internal_ip_from_node(
         # Other nodes will receive the master's IP via broadcast
         ip_list = [None]
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         torch.distributed.broadcast_object_list(ip_list, src=node_rank, device=device)
         node_ip = ip_list[0]
         logger.info(f"Rank {rank} received master node's internal IP: {node_ip}")
diff --git a/python/gigl/nn/models.py b/python/gigl/nn/models.py
index 9fa29f62e..15adde632 100644
--- a/python/gigl/nn/models.py
+++ b/python/gigl/nn/models.py
@@ -397,16 +397,16 @@ def _weighted_layer_sum(
         Returns:
             torch.Tensor: Weighted sum of all layer embeddings, shape [N, D].
         """
-        if len(all_layer_embeddings) != len(self._layer_weights):
+        if len(all_layer_embeddings) != len(self._layer_weights):  # type: ignore # https://github.com/Snapchat/GiGL/issues/408
             raise ValueError(
-                f"Got {len(all_layer_embeddings)} layer tensors but {len(self._layer_weights)} weights."
+                f"Got {len(all_layer_embeddings)} layer tensors but {len(self._layer_weights)} weights."  # type: ignore # https://github.com/Snapchat/GiGL/issues/408
             )
         # Stack all layer embeddings and compute weighted sum
         # _layer_weights is already a tensor buffer registered in __init__
         stacked = torch.stack(all_layer_embeddings, dim=0)  # shape [K+1, N, D]
         w = self._layer_weights.to(stacked.device)  # shape [K+1], ensure on same device
-        out = (stacked * w.view(-1, 1, 1)).sum(
+        out = (stacked * w.view(-1, 1, 1)).sum(  # type: ignore # https://github.com/Snapchat/GiGL/issues/408
             dim=0
         )  # shape [N, D], w_0*X_0 + w_1*X_1 + ...
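Reviewer note on the `networking.py` hunk: `torch.distributed.broadcast_object_list` declares its `device` parameter as `torch.device`, so wrapping the `"cuda"`/`"cpu"` string keeps the call well-typed under the stricter mypy `python_version = 3.11` run above. A minimal single-process sketch of the pattern (gloo backend on loopback; the address, port, and IP value are illustrative only):

```python
import torch
import torch.distributed as dist

# Single-process group, just to exercise the API shape.
dist.init_process_group(
    backend="gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1
)

# As in the PR: pass a torch.device rather than a bare string.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rank 0 contributes the value; on other ranks this would start as [None].
ip_list = ["10.0.0.1"]
dist.broadcast_object_list(ip_list, src=0, device=device)
print(ip_list[0])  # every rank now sees "10.0.0.1"

dist.destroy_process_group()
```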
diff --git a/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py b/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py
index c91532cb2..9cae41eb8 100644
--- a/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py
+++ b/python/gigl/src/common/modeling_task_specs/graphsage_template_modeling_spec.py
@@ -174,9 +174,12 @@ def train(
         early_stop_counter = 0
         best_val_loss = float("inf")

+        assert hasattr(self.model, "graph_backend")
+        assert isinstance(self.model.graph_backend, GraphBackend)
+        graph_backend = self.model.graph_backend
         data_loaders: Dataloaders = self._dataloaders.get_training_dataloaders(
             gbml_config_pb_wrapper=gbml_config_pb_wrapper,
-            graph_backend=self.model.graph_backend,
+            graph_backend=graph_backend,
             device=device,
         )

@@ -411,9 +414,12 @@ def eval(
         logger.info("Start testing...")

+        assert hasattr(self.model, "graph_backend")
+        assert isinstance(self.model.graph_backend, GraphBackend)
+        graph_backend = self.model.graph_backend
         data_loaders: Dataloaders = self._dataloaders.get_test_dataloaders(
             gbml_config_pb_wrapper=gbml_config_pb_wrapper,
-            graph_backend=self.model.graph_backend,
+            graph_backend=graph_backend,
             device=device,
         )
diff --git a/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py b/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py
index 5fa98ca95..e8c8c54d2 100644
--- a/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py
+++ b/python/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py
@@ -200,7 +200,7 @@ def score(
             assert root_node_labels is not None
             results: InferBatchResults = self.infer_batch(batch=batch, device=device)

-            num_correct_in_batch = int((results.predictions == root_node_labels).sum())
+            num_correct_in_batch = int((results.predictions == root_node_labels).sum())  # type: ignore # https://github.com/Snapchat/GiGL/issues/408
             num_correct += num_correct_in_batch
             num_evaluated += len(batch.root_node_labels)
diff --git a/python/gigl/src/common/modeling_task_specs/utils/infer.py b/python/gigl/src/common/modeling_task_specs/utils/infer.py
index 0222feb28..13804bea8 100644
--- a/python/gigl/src/common/modeling_task_specs/utils/infer.py
+++ b/python/gigl/src/common/modeling_task_specs/utils/infer.py
@@ -139,8 +139,8 @@ def infer_task_inputs(
         decoder = model.module.decode
         batch_result_types = model.module.tasks.result_types
     else:
-        decoder = model.decode
-        batch_result_types = model.tasks.result_types
+        decoder = model.decode  # type: ignore # https://github.com/Snapchat/GiGL/issues/408
+        batch_result_types = model.tasks.result_types  # type: ignore # https://github.com/Snapchat/GiGL/issues/408

     # If we only have losses which only require the input batch, don't forward here and return the
     # input batch immediately to minimize computation we don't need, such as encoding and decoding.
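Reviewer note: the paired `assert hasattr(...)` / `assert isinstance(...)` added to `train` and `eval` is the usual way to let mypy see an attribute it cannot find on `nn.Module`, and binding it to a local preserves the narrowed type for later uses. A stripped-down sketch of the pattern; `GraphBackend` here is a stand-in class, not gigl's real type:

```python
import torch.nn as nn


class GraphBackend:
    """Stand-in for gigl's real GraphBackend type."""


def get_backend(model: nn.Module) -> GraphBackend:
    # hasattr guards the dynamic attribute; isinstance narrows its static type.
    assert hasattr(model, "graph_backend")
    assert isinstance(model.graph_backend, GraphBackend)
    # Binding to a local keeps the narrowed type through later uses.
    graph_backend = model.graph_backend
    return graph_backend
```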
diff --git a/python/gigl/src/common/models/layers/feature_interaction.py b/python/gigl/src/common/models/layers/feature_interaction.py
index aa7ad737f..afa025365 100644
--- a/python/gigl/src/common/models/layers/feature_interaction.py
+++ b/python/gigl/src/common/models/layers/feature_interaction.py
@@ -149,7 +149,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:

     def reset_parameters(self):
         for layer in self._layers:
-            layer.reset_parameters()
+            if hasattr(layer, "reset_parameters") and callable(layer.reset_parameters):
+                layer.reset_parameters()

     def __repr__(self) -> str:
         return f"{self.__class__.__name__}(in_dim={self._in_dim}, num_layers={self._num_layers}, projection_dim={self._projection_dim}, diag_scale={self._diag_scale}, use_bias={self._use_bias})"
diff --git a/python/gigl/src/common/models/layers/loss.py b/python/gigl/src/common/models/layers/loss.py
index 958e0304a..b03c150e5 100644
--- a/python/gigl/src/common/models/layers/loss.py
+++ b/python/gigl/src/common/models/layers/loss.py
@@ -142,7 +142,7 @@ def _calculate_softmax_loss(
         )  # shape=[num_pos_nodes]

         loss = F.cross_entropy(
-            input=all_scores / self.softmax_temperature,
+            input=all_scores / self.softmax_temperature,  # type: ignore # https://github.com/Snapchat/GiGL/issues/408
             target=ys,
             reduction="sum",
         )
diff --git a/python/gigl/src/common/models/layers/task.py b/python/gigl/src/common/models/layers/task.py
index 35f00aec2..b82dee44b 100644
--- a/python/gigl/src/common/models/layers/task.py
+++ b/python/gigl/src/common/models/layers/task.py
@@ -709,7 +709,7 @@ def _get_all_tasks(
         for task in list(self._task_to_weights_map.keys()):
             fn = self._task_to_fn_map[task]
             weight = self._task_to_weights_map[task]
-            tasks_list.append((fn, weight))
+            tasks_list.append((fn, weight))  # type: ignore # https://github.com/Snapchat/GiGL/issues/408
         return tasks_list

     def add_task(
diff --git a/python/gigl/src/mocking/lib/pyg_datasets_forks.py b/python/gigl/src/mocking/lib/pyg_datasets_forks.py
index de026b61d..e83abfe0c 100644
--- a/python/gigl/src/mocking/lib/pyg_datasets_forks.py
+++ b/python/gigl/src/mocking/lib/pyg_datasets_forks.py
@@ -1,7 +1,7 @@
 """
 Our mocking logic uses public datasets like Cora and DBLP from PyG. PyG datasets are
-downloaded from public sources which may not be available or rate-limit us. We thus 
-override the dataset classes to download the datasets from GCS buckets to avoid issues. 
+downloaded from public sources which may not be available or rate-limit us. We thus
+override the dataset classes to download the datasets from GCS buckets to avoid issues.
""" from torch_geometric.data import extract_zip diff --git a/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py b/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py index 1f38ac487..880821876 100644 --- a/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py +++ b/python/tests/unit/src/common/modeling_task_spec_utils/early_stop_test.py @@ -94,7 +94,7 @@ def test_early_stopping( for step_num, value in enumerate(mocked_criteria_values): has_metric_improved, should_early_stop = early_stopper.step(value=value) if model is not None: - model.foo += 1 + model.foo += 1 # type: ignore # https://github.com/Snapchat/GiGL/issues/408 if step_num in improvement_steps: self.assertTrue(has_metric_improved) else: diff --git a/testing/e2e_tests/e2e_tests.yaml b/testing/e2e_tests/e2e_tests.yaml index b084b9479..44b4445f0 100644 --- a/testing/e2e_tests/e2e_tests.yaml +++ b/testing/e2e_tests/e2e_tests.yaml @@ -2,16 +2,16 @@ # This file contains all the test specifications that can be run via the e2e test script tests: cora_nalp_test: - task_config_uri: "gigl/src/mocking/configs/e2e_node_anchor_based_link_prediction_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/e2e_node_anchor_based_link_prediction_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" cora_snc_test: - task_config_uri: "gigl/src/mocking/configs/e2e_supervised_node_classification_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/e2e_supervised_node_classification_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" cora_udl_test: - task_config_uri: "gigl/src/mocking/configs/e2e_udl_node_anchor_based_link_prediction_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/e2e_udl_node_anchor_based_link_prediction_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" dblp_nalp_test: - task_config_uri: "gigl/src/mocking/configs/dblp_node_anchor_based_link_prediction_template_gbml_config.yaml" + task_config_uri: "python/gigl/src/mocking/configs/dblp_node_anchor_based_link_prediction_template_gbml_config.yaml" resource_config_uri: "${oc.env:GIGL_TEST_DEFAULT_RESOURCE_CONFIG,deployment/configs/e2e_cicd_resource_config.yaml}" hom_cora_sup_test: task_config_uri: "examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml" diff --git a/uv.lock b/uv.lock index e3df8684b..10f25efd8 100644 --- a/uv.lock +++ b/uv.lock @@ -893,7 +893,7 @@ requires-dist = [ { name = "numpy" }, { name = "omegaconf", specifier = ">=2.3.0,<3.0.0" }, { name = "pandas" }, - { name = "pip" }, + { name = "pip", specifier = "~=25.3" }, { name = "protobuf" }, { name = "pyarrow", marker = "extra == 'transform'", specifier = "==10.0.1" }, { name = "pyg-lib", marker = "sys_platform != 'darwin' and extra == 'pyg27-torch28-cpu'", index = "https://data.pyg.org/whl/torch-2.8.0+cpu.html", conflict = { package = "gigl", extra = "pyg27-torch28-cpu" } },