diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 2ddfe8ccb932..6005bb6bf7ae 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -1,6 +1,6 @@ [dev] # Set to "huggingface", for example, if you are a huggingface developer. Default is "" -partner_developer = "" +partner_developer = "huggingface" # Please only set it to true if you are preparing an EI related PR # Do remember to revert it back to false before merging any PR (including EI dedicated PR) ei_mode = false @@ -36,8 +36,8 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. -# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] +build_frameworks = ["huggingface_vllm"] # By default we build both training and inference containers. Set true/false values to determine which to build. 
@@ -186,5 +186,8 @@ dlc-pr-tensorflow-2-eia-inference = "" # vllm dlc-pr-vllm = "" +# HuggingFace vLLM +dlc-pr-huggingface-vllm = "" + # sglang dlc-pr-sglang = "" \ No newline at end of file diff --git a/huggingface/hf-vllm/buildspec.yml b/huggingface/hf-vllm/buildspec.yml index e69de29bb2d1..7b7305a723aa 100644 --- a/huggingface/hf-vllm/buildspec.yml +++ b/huggingface/hf-vllm/buildspec.yml @@ -0,0 +1,52 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +base_framework: &BASE_FRAMEWORK vllm +framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK] +version: &VERSION "0.12.0" +short_version: &SHORT_VERSION "0.12" +arch_type: &ARCH_TYPE x86_64 +autopatch_build: "False" + +repository_info: + build_repository: &BUILD_REPOSITORY + image_type: &IMAGE_TYPE gpu + root: huggingface/hf-vllm + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + build_context: &BUILD_CONTEXT + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + cuda-compatibility-lib: + source: ../build_artifacts/inference/cuda-compatibility-lib.sh + target: cuda-compatibility-lib.sh + + +images: + BuildHuggingFaceVllmGpuPy312Cu129DockerImage: + <<: *BUILD_REPOSITORY + context: + <<: *BUILD_CONTEXT + image_size_baseline: 26000 + device_type: &DEVICE_TYPE gpu + cuda_version: &CUDA_VERSION cu129 + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + vllm_version: &VLLM_VERSION 0.12.0 + tag: !join [ "hf-vllm", "-", *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, 
"-sagemaker" ] + latest_release_tag: !join [ "hf-vllm", "-", *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile ] + target: sagemaker + build: true + enable_common_stage_build: false + test_configs: + test_platforms: + - sanity + - security + - sagemaker diff --git a/huggingface/hf-vllm/docker/0.12/cu129/Dockerfile b/huggingface/hf-vllm/docker/0.12/cu129/Dockerfile new file mode 100644 index 000000000000..b81fcbeeaade --- /dev/null +++ b/huggingface/hf-vllm/docker/0.12/cu129/Dockerfile @@ -0,0 +1,42 @@ +ARG FINAL_BASE_IMAGE=763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.12.0-gpu-py312-cu129-ubuntu22.04-sagemaker-v1.0 +FROM ${FINAL_BASE_IMAGE} AS vllm-base + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG HUGGINGFACE_HUB_VERSION=0.36.0 +ARG HF_XET_VERSION=1.2.0 + +RUN apt-get update -y \ +&& apt-get install -y --no-install-recommends curl unzip \ +&& rm -rf /var/lib/apt/lists/* + + +RUN pip install --upgrade pip && \ + pip install --no-cache-dir \ + huggingface-hub==${HUGGINGFACE_HUB_VERSION} \ + hf-xet==${HF_XET_VERSION} \ + grpcio + + +FROM vllm-base AS sagemaker +ENV HF_HUB_ENABLE_HF_TRANSFER="1" \ + HF_HUB_USER_AGENT_ORIGIN="aws:sagemaker:gpu-cuda:inference:hf-vllm" + +COPY cuda-compatibility-lib.sh /usr/local/bin/cuda-compatibility-lib.sh +RUN chmod +x /usr/local/bin/cuda-compatibility-lib.sh + +RUN set -eux; \ + HOME_DIR=/root; \ + uv pip install --system --upgrade pip requests PTable; \ + curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip; \ + unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/; \ + cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance; \ + chmod +x /usr/local/bin/testOSSCompliance; \ + chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh; \ + 
${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python3; \ + rm -rf ${HOME_DIR}/oss_compliance* + + +ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] + diff --git a/huggingface/hf-vllm/out.py b/huggingface/hf-vllm/out.py new file mode 100644 index 000000000000..cf44f88fa683 --- /dev/null +++ b/huggingface/hf-vllm/out.py @@ -0,0 +1,14 @@ +import os + +try: + if os.path.exists("/usr/local/bin/deep_learning_container.py") and ( + os.getenv("OPT_OUT_TRACKING") is None or os.getenv("OPT_OUT_TRACKING", "").lower() != "true" + ): + import threading + + cmd = "python /usr/local/bin/deep_learning_container.py --framework hf-vllm --framework-version 0.12.0 --container-type inference &>/dev/null" + x = threading.Thread(target=lambda: os.system(cmd)) + x.daemon = True + x.start() +except Exception: + pass diff --git a/huggingface/hf-vllm/telemetry.sh b/huggingface/hf-vllm/telemetry.sh new file mode 100644 index 000000000000..c01514033dd9 --- /dev/null +++ b/huggingface/hf-vllm/telemetry.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# telemetry.sh +if [ -f /usr/local/bin/deep_learning_container.py ] && [[ -z "${OPT_OUT_TRACKING}" || "${OPT_OUT_TRACKING,,}" != "true" ]]; then + ( + python /usr/local/bin/deep_learning_container.py \ + --framework "hf-vllm" \ + --framework-version "0.12.0" \ + --container-type "inference" \ + &>/dev/null & + ) +fi + diff --git a/src/constants.py b/src/constants.py index 73f07931c2be..bb4baa4385c3 100644 --- a/src/constants.py +++ b/src/constants.py @@ -27,6 +27,7 @@ "base", "vllm", "sglang", + "huggingface_vllm", } DEVICE_TYPES = {"cpu", "gpu", "hpu", "eia", "inf", "neuron", "neuronx"} IMAGE_TYPES = {"training", "inference"} diff --git a/test/sagemaker_tests/huggingface/hf-vllm/__init__.py b/test/sagemaker_tests/huggingface/hf-vllm/__init__.py new file mode 100644 index 000000000000..199e66b95926 --- /dev/null +++ b/test/sagemaker_tests/huggingface/hf-vllm/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2019-2020 Amazon.com, Inc.
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import diff --git a/test/sagemaker_tests/huggingface/hf-vllm/conftest.py b/test/sagemaker_tests/huggingface/hf-vllm/conftest.py new file mode 100644 index 000000000000..20daa2c701fd --- /dev/null +++ b/test/sagemaker_tests/huggingface/hf-vllm/conftest.py @@ -0,0 +1,393 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import + +import json +import logging +import os +import platform +import shutil +import sys +import tempfile + +import boto3 +import pytest + +from botocore.exceptions import ClientError +from sagemaker import LocalSession, Session +from sagemaker.pytorch import PyTorch + +from .utils import image_utils, get_ecr_registry + +NO_P4_REGIONS = [ + "af-south-1", + "ap-east-1", + "ap-northeast-3", + "ap-southeast-1", + "ap-southeast-2", + "ap-south-1", + "ca-central-1", + "eu-central-1", + "eu-north-1", + "eu-west-2", + "eu-west-3", + "eu-south-1", + "me-south-1", + "sa-east-1", + "us-west-1", + "cn-northwest-1", + "il-central-1", +] + +NO_G5_REGIONS = [ + "us-west-1", + "ca-west-1", + "mx-central-1", + "af-south-1", + "ap-east-1", + "ap-south-2", + "ap-southeast-5", + "ap-southeast-4", + "ap-northeast-3", + "ap-southeast-1", + "ap-southeast-7", + "eu-south-1", + "eu-west-3", + "eu-south-2", + "eu-central-2", + "me-south-1", +] + + +logger = logging.getLogger(__name__) +logging.getLogger("boto").setLevel(logging.INFO) +logging.getLogger("boto3").setLevel(logging.INFO) +logging.getLogger("botocore").setLevel(logging.INFO) +logging.getLogger("factory.py").setLevel(logging.INFO) +logging.getLogger("auth.py").setLevel(logging.INFO) +logging.getLogger("connectionpool.py").setLevel(logging.INFO) + + +dir_path = os.path.dirname(os.path.realpath(__file__)) + + +def pytest_addoption(parser): + parser.addoption("--build-image", "-D", action="store_true") + parser.addoption("--build-base-image", "-B", action="store_true") + parser.addoption("--aws-id") + parser.addoption("--instance-type") + parser.addoption("--accelerator-type", default=None) + parser.addoption("--docker-base-name", default="huggingface_pytorch") + parser.addoption("--region", default="us-west-2") + parser.addoption("--framework-version", default="") + parser.addoption( + "--py-version", + choices=["2", "3", "37", "38", "39", "310", "311", "312"],
default=str(sys.version_info.major), + ) + # Processor is still "cpu" for EIA tests + parser.addoption( + "--processor", choices=["gpu", "cpu", "eia", "neuron", "neuronx"], default="cpu" + ) + # If not specified, will default to {framework-version}-{processor}-py{py-version} + parser.addoption("--tag", default=None) + parser.addoption( + "--generate-coverage-doc", + default=False, + action="store_true", + help="use this option to generate test coverage doc", + ) + parser.addoption( + "--efa", + action="store_true", + default=False, + help="Run only efa tests", + ) + parser.addoption("--sagemaker-regions", default="us-west-2") + + +def pytest_configure(config): + config.addinivalue_line("markers", "efa(): explicitly mark to run efa tests") + + +def pytest_runtest_setup(item): + if item.config.getoption("--efa"): + efa_tests = [mark for mark in item.iter_markers(name="efa")] + if not efa_tests: + pytest.skip("Skipping non-efa tests") + + +def pytest_collection_modifyitems(session, config, items): + for item in items: + print(f"item {item}") + for marker in item.iter_markers(name="team"): + print(f"item {marker}") + team_name = marker.args[0] + item.user_properties.append(("team_marker", team_name)) + print(f"item.user_properties {item.user_properties}") + + if config.getoption("--generate-coverage-doc"): + from test.test_utils.test_reporting import TestReportGenerator + + report_generator = TestReportGenerator(items, is_sagemaker=True) + report_generator.generate_coverage_doc( + framework="huggingface_pytorch", job_type="inference" + ) + + +@pytest.fixture(scope="session", name="docker_base_name") +def fixture_docker_base_name(request): + return request.config.getoption("--docker-base-name") + + +@pytest.fixture(scope="session", name="region") +def fixture_region(request): + return request.config.getoption("--region") + + +@pytest.fixture(scope="session", name="framework_version") +def fixture_framework_version(request): + return 
request.config.getoption("--framework-version") + + +@pytest.fixture(scope="session", name="py_version") +def fixture_py_version(request): + return "py{}".format(int(request.config.getoption("--py-version"))) + + +@pytest.fixture(scope="session", name="processor") +def fixture_processor(request): + return request.config.getoption("--processor") + + +@pytest.fixture(scope="session", name="tag") +def fixture_tag(request, framework_version, processor, py_version): + provided_tag = request.config.getoption("--tag") + default_tag = "{}-{}-{}".format(framework_version, processor, py_version) + return provided_tag if provided_tag else default_tag + + +@pytest.fixture(scope="session", name="docker_image") +def fixture_docker_image(docker_base_name, tag): + return "{}:{}".format(docker_base_name, tag) + + +@pytest.fixture +def opt_ml(): + tmp = tempfile.mkdtemp() + os.mkdir(os.path.join(tmp, "output")) + + # Docker cannot mount Mac OS /var folder properly see + # https://forums.docker.com/t/var-folders-isnt-mounted-properly/9600 + opt_ml_dir = "/private{}".format(tmp) if platform.system() == "Darwin" else tmp + yield opt_ml_dir + + shutil.rmtree(tmp, True) + + +@pytest.fixture(scope="session", name="use_gpu") +def fixture_use_gpu(processor): + return processor == "gpu" + + +@pytest.fixture(scope="session", name="build_base_image", autouse=True) +def fixture_build_base_image( + request, framework_version, py_version, processor, tag, docker_base_name +): + build_base_image = request.config.getoption("--build-base-image") + if build_base_image: + return image_utils.build_base_image( + framework_name=docker_base_name, + framework_version=framework_version, + py_version=py_version, + base_image_tag=tag, + processor=processor, + cwd=os.path.join(dir_path, ".."), + ) + + return tag + + +@pytest.fixture(scope="session", name="sagemaker_session") +def fixture_sagemaker_session(region): + return Session(boto_session=boto3.Session(region_name=region)) + + 
+@pytest.fixture(scope="session", name="sagemaker_regions") +def fixture_sagemaker_regions(request): + sagemaker_regions = request.config.getoption("--sagemaker-regions") + return sagemaker_regions.split(",") + + +@pytest.fixture(scope="session", name="sagemaker_local_session") +def fixture_sagemaker_local_session(region): + return LocalSession(boto_session=boto3.Session(region_name=region)) + + +@pytest.fixture(name="aws_id", scope="session") +def fixture_aws_id(request): + return request.config.getoption("--aws-id") + + +@pytest.fixture(name="instance_type", scope="session") +def fixture_instance_type(request, processor): + provided_instance_type = request.config.getoption("--instance-type") + default_instance_type = "local" if processor == "cpu" else "local_gpu" + return provided_instance_type or default_instance_type + + +@pytest.fixture(name="accelerator_type", scope="session") +def fixture_accelerator_type(request): + return request.config.getoption("--accelerator-type") + + +@pytest.fixture(name="docker_registry", scope="session") +def fixture_docker_registry(aws_id, region): + return get_ecr_registry(aws_id, region) + + +@pytest.fixture(name="ecr_image", scope="session") +def fixture_ecr_image(docker_registry, docker_base_name, tag): + return "{}/{}:{}".format(docker_registry, docker_base_name, tag) + + +@pytest.fixture(autouse=True) +def skip_by_device_type(request, use_gpu, instance_type, accelerator_type): + is_gpu = use_gpu or instance_type[3] in ["g", "p"] + is_eia = accelerator_type is not None + is_neuron = instance_type.startswith("ml.inf1") + is_neuronx = instance_type.startswith("ml.inf2") or instance_type.startswith("ml.trn1") + + # Separate out cases for clearer logic. + # When running Neuron test, skip CPU and GPU test. 
+ if request.node.get_closest_marker("neuron_test") and not is_neuron: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + elif request.node.get_closest_marker("neuronx_test") and not is_neuronx: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + # When running GPU test, skip CPU and neuron test. When running CPU test, skip GPU and neuron test. + elif (request.node.get_closest_marker("gpu_test") and not is_gpu) or ( + request.node.get_closest_marker("cpu_test") and (is_gpu or is_neuron or is_neuronx) + ): + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + # When running EIA test, skip the CPU, GPU and Neuron functions + elif ( + request.node.get_closest_marker("neuron_test") + or request.node.get_closest_marker("gpu_test") + or request.node.get_closest_marker("cpu_test") + ) and is_eia: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + # When running CPU or GPU or Neuron test, skip EIA test. 
+ elif request.node.get_closest_marker("eia_test") and not is_eia: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + +@pytest.fixture(autouse=True) +def skip_by_py_version(request, py_version): + if request.node.get_closest_marker("skip_py2") and py_version != "py3": + pytest.skip("Skipping the test because Python 2 is not supported.") + + +@pytest.fixture(autouse=True) +def skip_gpu_instance_restricted_regions(region, instance_type): + if (region in NO_P4_REGIONS and instance_type.startswith("ml.p4")) or ( + region in NO_G5_REGIONS and instance_type.startswith("ml.g5") + ): + pytest.skip( + "Skipping GPU test in region {} with instance type {}".format(region, instance_type) + ) + + +@pytest.fixture(autouse=True) +def skip_gpu_py2(request, use_gpu, instance_type, py_version, framework_version): + is_gpu = use_gpu or instance_type[3] in ["g", "p"] + if ( + request.node.get_closest_marker("skip_gpu_py2") + and is_gpu + and py_version != "py3" + and framework_version == "1.4.0" + ): + pytest.skip("Skipping the test until mms issue resolved.") + + +def _get_remote_override_flags(): + try: + s3_client = boto3.client("s3") + sts_client = boto3.client("sts") + account_id = sts_client.get_caller_identity().get("Account") + result = s3_client.get_object( + Bucket=f"dlc-cicd-helper-{account_id}", Key="override_tests_flags.json" + ) + json_content = json.loads(result["Body"].read().decode("utf-8")) + except ClientError as e: + logger.warning("ClientError when performing S3/STS operation: {}".format(e)) + json_content = {} + return json_content + + +def _is_test_disabled(test_name, build_name, version): + """ + Expected format of remote_override_flags: + { + "CB Project Name for Test Type A": { + "CodeBuild Resolved Source Version": ["test_type_A_test_function_1", "test_type_A_test_function_2"] + }, + "CB Project Name for Test Type B": { + "CodeBuild Resolved Source Version": ["test_type_B_test_function_1", "test_type_B_test_function_2"] + } 
+ } + + :param test_name: str Test Function node name (includes parametrized values in string) + :param build_name: str Build Project name of current execution + :param version: str Source Version of current execution + :return: bool True if test is disabled as per remote override, False otherwise + """ + remote_override_flags = _get_remote_override_flags() + remote_override_build = remote_override_flags.get(build_name, {}) + if version in remote_override_build: + return not remote_override_build[version] or any( + [test_keyword in test_name for test_keyword in remote_override_build[version]] + ) + return False + + +@pytest.fixture(autouse=True) +def disable_test(request): + test_name = request.node.name + # We do not have a regex pattern to find CB name, which means we must resort to string splitting + build_arn = os.getenv("CODEBUILD_BUILD_ARN") + build_name = build_arn.split("/")[-1].split(":")[0] if build_arn else None + version = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION") + + if build_name and version and _is_test_disabled(test_name, build_name, version): + pytest.skip(f"Skipping {test_name} test because it has been disabled.") + + +@pytest.fixture(autouse=True) +def skip_test_successfully_executed_before(request): + """ + "cache/lastfailed" contains information about failed tests only. We're running SM tests in separate threads for each image. + So when we retry SM tests, successfully executed tests executed again because pytest doesn't have that info in /.cache. + But the flag "--last-failed-no-failures all" requires pytest to execute all the available tests. + The only sign that a test passed last time - lastfailed file exists and the test name isn't in that file. + The method checks whether lastfailed file exists and the test name is not in it. 
+ """ + test_name = request.node.name + lastfailed = request.config.cache.get("cache/lastfailed", None) + + if lastfailed is not None and not any( + test_name in failed_test_name for failed_test_name in lastfailed.keys() + ): + pytest.skip(f"Skipping {test_name} because it was successfully executed for this commit") diff --git a/test/sagemaker_tests/huggingface/hf-vllm/integration/__init__.py b/test/sagemaker_tests/huggingface/hf-vllm/integration/__init__.py new file mode 100644 index 000000000000..f9d5c7746fcb --- /dev/null +++ b/test/sagemaker_tests/huggingface/hf-vllm/integration/__init__.py @@ -0,0 +1,62 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import + +import json +import re + +import boto3 + + +ROLE = "SageMakerRole" +DEFAULT_TIMEOUT = 45 + + +class NoLogStreamFoundError(Exception): + pass + + +class SageMakerEndpointFailure(Exception): + pass + + +def dump_logs_from_cloudwatch(e, region="us-west-2"): + """ + Function to dump logs from cloudwatch during error handling + """ + error_hosting_endpoint_regex = re.compile(r"Error hosting endpoint ((\w|-)+):") + endpoint_url_regex = re.compile(r"/aws/sagemaker/Endpoints/((\w|-)+)") + endpoint_match = error_hosting_endpoint_regex.search(str(e)) or endpoint_url_regex.search( + str(e) + ) + if endpoint_match: + logs_client = boto3.client("logs", region_name=region) + endpoint = endpoint_match.group(1) + log_group_name = f"/aws/sagemaker/Endpoints/{endpoint}" + log_stream_resp = logs_client.describe_log_streams(logGroupName=log_group_name) + all_traffic_log_stream = "" + for log_stream in log_stream_resp.get("logStreams", []): + log_stream_name = log_stream.get("logStreamName") + if log_stream_name.startswith("AllTraffic"): + all_traffic_log_stream = log_stream_name + break + if not all_traffic_log_stream: + raise NoLogStreamFoundError( + f"Cannot find all traffic log streams for endpoint {endpoint}" + ) from e + events = logs_client.get_log_events( + logGroupName=log_group_name, logStreamName=all_traffic_log_stream + ) + raise SageMakerEndpointFailure( + f"Error from endpoint {endpoint}:\n{json.dumps(events, indent=4)}" + ) from e diff --git a/test/sagemaker_tests/huggingface/hf-vllm/integration/sagemaker/__init__.py b/test/sagemaker_tests/huggingface/hf-vllm/integration/sagemaker/__init__.py new file mode 100644 index 000000000000..04fbf5d9a144 --- /dev/null +++ b/test/sagemaker_tests/huggingface/hf-vllm/integration/sagemaker/__init__.py @@ -0,0 +1,12 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). 
You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. diff --git a/test/sagemaker_tests/huggingface/hf-vllm/integration/sagemaker/test_vllm.py b/test/sagemaker_tests/huggingface/hf-vllm/integration/sagemaker/test_vllm.py new file mode 100644 index 000000000000..8aa8a4ce2775 --- /dev/null +++ b/test/sagemaker_tests/huggingface/hf-vllm/integration/sagemaker/test_vllm.py @@ -0,0 +1,112 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import json +import logging + +import pytest +import sagemaker +from sagemaker.huggingface import HuggingFaceModel +from sagemaker.serializers import JSONSerializer +from sagemaker.deserializers import JSONDeserializer + +from ...integration import dump_logs_from_cloudwatch +from ...integration.sagemaker.timeout import timeout_and_delete_endpoint +from ..... 
@pytest.mark.model("bloom-560m")
@pytest.mark.processor("gpu")
@pytest.mark.gpu_test
@pytest.mark.team("sagemaker-1p-algorithms")
def test_vllm_bloom(framework_version, ecr_image, instance_type, sagemaker_regions):
    """Deploy bigscience/bloom-560m on the HF vLLM image and run one inference."""
    invoke_sm_endpoint_helper_function(
        ecr_image=ecr_image,
        sagemaker_regions=sagemaker_regions,
        test_function=_test_vllm_model,
        framework_version=framework_version,
        instance_type=instance_type or "ml.g6.12xlarge",
        model_id="bigscience/bloom-560m",
        dump_logs_from_cloudwatch=dump_logs_from_cloudwatch,
    )


@pytest.mark.model("qwen3-8b")
@pytest.mark.processor("gpu")
@pytest.mark.gpu_test
@pytest.mark.team("sagemaker-1p-algorithms")
def test_vllm_qwen(framework_version, ecr_image, instance_type, sagemaker_regions):
    """Deploy Qwen/Qwen3-8B on the HF vLLM image and run one inference."""
    invoke_sm_endpoint_helper_function(
        ecr_image=ecr_image,
        sagemaker_regions=sagemaker_regions,
        test_function=_test_vllm_model,
        framework_version=framework_version,
        instance_type=instance_type or "ml.g6.12xlarge",
        model_id="Qwen/Qwen3-8B",
        dump_logs_from_cloudwatch=dump_logs_from_cloudwatch,
    )


def _test_vllm_model(
    sagemaker_session,
    framework_version,
    image_uri,
    instance_type,
    model_id,
    accelerator_type=None,
    **kwargs,
):
    """Test vLLM model deployment and inference using OpenAI-compatible API format"""
    endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-hf-vllm-serving")

    hf_model = HuggingFaceModel(
        env={
            "HF_MODEL_ID": model_id,
            "SM_NUM_GPUS": "4",
            "SM_VLLM_MAX_MODEL_LEN": "512",
        },
        role="SageMakerRole",
        image_uri=image_uri,
        sagemaker_session=sagemaker_session,
    )

    # Endpoint is torn down by the context manager even when deploy/predict fails.
    with timeout_and_delete_endpoint(endpoint_name, sagemaker_session, minutes=45):
        predictor = hf_model.deploy(
            initial_instance_count=1,
            instance_type=instance_type,
            endpoint_name=endpoint_name,
            container_startup_health_check_timeout=1800,
        )

        predictor.serializer = JSONSerializer()
        predictor.deserializer = JSONDeserializer()

        # vLLM uses OpenAI-compatible API format
        data = {
            "prompt": "What is Deep Learning?",
            "max_tokens": 50,
            "temperature": 0.7,
        }

        LOGGER.info(f"Running inference with data: {data}")
        output = predictor.predict(data)
        LOGGER.info(f"Output: {json.dumps(output)}")

        assert output is not None
LOGGER = logging.getLogger("timeout")


# NOTE: intentionally shadows the builtin TimeoutError; callers in this
# package catch this class by name, so it is kept for compatibility.
class TimeoutError(Exception):
    pass


@contextmanager
def timeout(seconds=0, minutes=0, hours=0):
    """Add a signal-based timeout to any block of code.
    If multiple time units are specified, they will be added together to determine time limit.
    Usage:
    with timeout(seconds=5):
        my_slow_function(...)
    Args:
    - seconds: The time limit, in seconds.
    - minutes: The time limit, in minutes.
    - hours: The time limit, in hours.
    """

    limit = seconds + 60 * minutes + 3600 * hours

    def handler(signum, frame):
        raise TimeoutError("timed out after {} seconds".format(limit))

    # Save the previous SIGALRM handler so nested or successive uses do not
    # permanently clobber it (it was previously installed but never restored).
    previous_handler = signal.signal(signal.SIGALRM, handler)
    try:
        signal.alarm(limit)
        yield
    finally:
        # Cancel any pending alarm, then put the old handler back.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)


@contextmanager
def timeout_and_delete_endpoint(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0):
    """Run the enclosed block under a time limit and always delete the endpoint.

    :param endpoint_name: SageMaker endpoint to delete on exit
    :param sagemaker_session: session used to issue the delete call
    :param seconds/minutes/hours: time limit, summed as in :func:`timeout`
    """
    with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
        try:
            yield [t]
        finally:
            try:
                sagemaker_session.delete_endpoint(endpoint_name)
                LOGGER.info("deleted endpoint {}".format(endpoint_name))
            except ClientError as ce:
                if ce.response["Error"]["Code"] == "ValidationException":
                    # Endpoint was never created (e.g. deploy failed);
                    # avoids the inner exception to be overwritten
                    pass
def _botocore_resolver():
    """Build an endpoint resolver backed by botocore's bundled endpoint data.

    :return: botocore ``EndpointResolver`` instance
    """
    data_loader = botocore.loaders.create_loader()
    return botocore.regions.EndpointResolver(data_loader.load_data("endpoints"))


def get_ecr_registry(account, region):
    """Get prefix of ECR image URI.

    :param account: Account ID
    :param region: region where ECR repo exists
    :return: AWS ECR registry host, e.g. ``<account>.dkr.ecr.<region>.amazonaws.com``
    """
    resolved = _botocore_resolver().construct_endpoint("ecr", region)
    return "{}.dkr.{}".format(account, resolved["hostname"])
+from __future__ import absolute_import + +import os +import subprocess +import sys + +CYAN_COLOR = "\033[36m" +END_COLOR = "\033[0m" + + +def build_base_image( + framework_name, framework_version, py_version, processor, base_image_tag, cwd="." +): + base_image_uri = get_base_image_uri(framework_name, base_image_tag) + + dockerfile_location = os.path.join( + "docker", framework_version, "base", "Dockerfile.{}".format(processor) + ) + + subprocess.check_call( + [ + "docker", + "build", + "-t", + base_image_uri, + "-f", + dockerfile_location, + "--build-arg", + "py_version={}".format(py_version[-1]), + cwd, + ], + cwd=cwd, + ) + print("created image {}".format(base_image_uri)) + return base_image_uri + + +def get_base_image_uri(framework_name, base_image_tag): + return "{}-base:{}".format(framework_name, base_image_tag) + + +def get_image_uri(framework_name, tag): + return "{}:{}".format(framework_name, tag) + + +def _check_call(cmd, *popenargs, **kwargs): + if isinstance(cmd, str): + cmd = cmd.split(" ") + _print_cmd(cmd) + subprocess.check_call(cmd, *popenargs, **kwargs) + + +def _print_cmd(cmd): + print("executing docker command: {}{}{}".format(CYAN_COLOR, " ".join(cmd), END_COLOR)) + sys.stdout.flush() diff --git a/test/sagemaker_tests/huggingface/hf-vllm/utils/local_mode_utils.py b/test/sagemaker_tests/huggingface/hf-vllm/utils/local_mode_utils.py new file mode 100644 index 000000000000..fa6b3cf00c36 --- /dev/null +++ b/test/sagemaker_tests/huggingface/hf-vllm/utils/local_mode_utils.py @@ -0,0 +1,46 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. 
@contextmanager
def lock():
    """Serialize tests that use SageMaker Local Mode.

    Local Mode serving always binds the same port, so concurrent test
    processes must take turns; an exclusive advisory lock on a shared
    file provides that.
    """
    # Open inside a with-block so the descriptor is always closed; the
    # original leaked the open file object on every call.
    with open(LOCK_PATH, "w") as lock_file:
        fcntl.lockf(lock_file, fcntl.LOCK_EX)
        try:
            yield
        finally:
            # Give the local container time to release the port before the
            # next test acquires the lock.
            time.sleep(5)
            fcntl.lockf(lock_file, fcntl.LOCK_UN)


def assert_files_exist(output_path, directory_file_map):
    """Assert that expected members exist inside generated ``<dir>.tar.gz`` archives.

    :param output_path: directory containing one ``<directory>.tar.gz`` per key
    :param directory_file_map: mapping of archive basename -> list of member paths
    :raises KeyError: if a member is missing from its archive (via ``getmember``)
    """
    for directory, files in directory_file_map.items():
        with tarfile.open(os.path.join(output_path, "{}.tar.gz".format(directory))) as tar:
            for f in files:
                # getmember raises KeyError when the entry is absent.
                tar.getmember(f)
class NoLogStreamFoundError(Exception):
    """Raised when a failed endpoint has no ``AllTraffic`` log stream."""

    pass


class SageMakerEndpointFailure(Exception):
    """Raised to surface an endpoint's CloudWatch logs alongside the original error."""

    pass


def dump_logs_from_cloudwatch(e, region="us-west-2"):
    """
    Function to dump logs from cloudwatch during error handling
    """
    message = str(e)
    patterns = (
        re.compile(r"Error hosting endpoint ((\w|-)+):"),
        re.compile(r"/aws/sagemaker/Endpoints/((\w|-)+)"),
    )
    match = next((m for m in (p.search(message) for p in patterns) if m), None)
    if match is None:
        # No endpoint name in the message -- nothing to dump.
        return

    endpoint = match.group(1)
    log_group_name = f"/aws/sagemaker/Endpoints/{endpoint}"
    logs_client = boto3.client("logs", region_name=region)

    stream_name = ""
    response = logs_client.describe_log_streams(logGroupName=log_group_name)
    for stream in response.get("logStreams", []):
        candidate = stream.get("logStreamName")
        if candidate.startswith("AllTraffic"):
            stream_name = candidate
            break

    if not stream_name:
        raise NoLogStreamFoundError(
            f"Cannot find all traffic log streams for endpoint {endpoint}"
        ) from e

    events = logs_client.get_log_events(
        logGroupName=log_group_name, logStreamName=stream_name
    )
    raise SageMakerEndpointFailure(
        f"Error from endpoint {endpoint}:\n{json.dumps(events, indent=4)}"
    ) from e