diff --git a/ci/buildspec.yml b/ci/buildspec.yml
new file mode 100644
index 00000000..a7d9afab
--- /dev/null
+++ b/ci/buildspec.yml
@@ -0,0 +1,61 @@
+version: 0.2
+
+phases:
+  install:
+    runtime-versions:
+      python: 3.8
+      docker: 19
+  pre_build:
+    commands:
+      - echo Pre-build started on `date`
+      - echo Installing dependencies...
+      - curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+      - bash Miniconda3-latest-Linux-x86_64.sh -bfp /miniconda3
+      - export PATH=/miniconda3/bin:${PATH}
+      - conda install python=3.8
+      - conda update -y conda
+      - python3 -m pip install pip==20.1 # The new pip dependency resolver in 20.2+ can't resolve 1.0-1 and 0.90 dependencies
+      - python3 -m pip install .[test]
+  build:
+    commands:
+      - echo Build started on `date`
+      - echo Docker login...
+      - docker login -u $dockerhub_username -p $dockerhub_password
+      - echo Building the Docker image...
+      - docker build -t xgboost-container-base:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/base/Dockerfile.cpu .
+      - python3 setup.py bdist_wheel --universal
+      - docker build -t preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/final/Dockerfile.cpu .
+      - echo Running tox...
+      - printf "FROM preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test
+      - docker build -t test-xgboost-container -f Dockerfile.test .
+      - docker run --rm -t test-xgboost-container sh -c 'pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit'
+      - docker run --rm -t test-xgboost-container sh -c 'flake8 setup.py src test'
+      - echo Running container tests...
+      - pytest test/integration/local --docker-base-name preprod-xgboost-container --tag $FRAMEWORK_VERSION-cpu-py3 --py-version 3 --framework-version $FRAMEWORK_VERSION
+      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3
+      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION
+  post_build:
+    commands:
+      - echo Build completed on `date`
+      - |
+        case $CODEBUILD_WEBHOOK_EVENT in
+        PULL_REQUEST_MERGED)
+          echo Logging in to Amazon ECR...
+          $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
+          echo Pushing the Docker image...
+          docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3 | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
+          docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
+          ;;
+        PULL_REQUEST_CREATED | PULL_REQUEST_UPDATED | PULL_REQUEST_REOPENED)
+          echo Logging in to Amazon ECR...
+          $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
+          echo Pushing the Docker image...
+          # Pushes a test tag for manual verification; requires occasional cleanup in ECR
+          TEST_TAG=$SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:${FRAMEWORK_VERSION}-cpu-py3-test
+          docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 ${TEST_TAG}
+          docker push ${TEST_TAG} | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
+          ;;
+        *)
+          echo Undefined behavior for webhook event type $CODEBUILD_WEBHOOK_EVENT
+          ;;
+        esac
\ No newline at end of file
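Note on the `grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"` filters above: piping each `docker push` through them drops any output line containing a 12-digit-account ECR registry URI, keeping the account ID out of shared build logs. Below is a minimal Python sketch of the same filter; the sample log lines are invented for illustration.

```python
import re

# Same pattern as the buildspec's grep -v -E; the unescaped dots match any
# character, which is harmless for this purpose.
ECR_URI = re.compile(r"[0-9]{12}.dkr.ecr.\S+.amazonaws.com")

sample_log = [
    "a1b2c3d4: Pushed",
    "The push refers to repository [123456789012.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost]",
    "1.0-1-cpu-py3: digest: sha256:0123abcd size: 1573",
]
for line in sample_log:
    if not ECR_URI.search(line):  # grep -v keeps only non-matching lines
        print(line)
```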
diff --git a/docker/1.0-1/base/Dockerfile.cpu b/docker/1.0-1/base/Dockerfile.cpu
index 9cd923ac..fc609259 100644
--- a/docker/1.0-1/base/Dockerfile.cpu
+++ b/docker/1.0-1/base/Dockerfile.cpu
@@ -1,35 +1,21 @@
-FROM ubuntu:16.04
+ARG UBUNTU_VERSION=18.04
+ARG CUDA_VERSION=10.2
+ARG IMAGE_DIGEST=218afa9c2002be9c4629406c07ae4daaf72a3d65eb3c5a5614d9d7110840a46e
 
-# Install python and other runtime dependencies
-RUN apt-get update && \
-    apt-get -y install \
-        build-essential \
-        libatlas-dev \
-        git \
-        wget \
-        curl \
-        nginx \
-        jq
-
-RUN apt-get update
-RUN apt-get clean
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION}@sha256:${IMAGE_DIGEST}
 
-RUN apt-get -y install openjdk-8-jdk-headless
+ARG MINICONDA_VERSION=4.9.2
+ARG CONDA_PY_VERSION=39
+ARG CONDA_CHECKSUM="b4e46fcc8029e2cfa731b788f25b1d36"
+ARG CONDA_PKG_VERSION=4.10.1
+ARG PYTHON_VERSION=3.8.13
+ARG PYARROW_VERSION=1.0
+ARG MLIO_VERSION=0.7.0
+ARG XGBOOST_VERSION=1.0
 
-# Install mlio
-RUN echo 'installing miniconda' && \
-    curl -LO https://repo.anaconda.com/miniconda/Miniconda3-py38_4.8.3-Linux-x86_64.sh && \
-    echo "d63adf39f2c220950a063e0529d4ff74 Miniconda3-py38_4.8.3-Linux-x86_64.sh" | md5sum -c - && \
-    bash Miniconda3-py38_4.8.3-Linux-x86_64.sh -bfp /miniconda3 && \
-    rm Miniconda3-py38_4.8.3-Linux-x86_64.sh
-
-ENV PATH=/miniconda3/bin:${PATH}
-
-RUN conda install -c conda-forge python=3.6.13 && \
-    conda update -y conda && \
-    conda install pip=20.1 && \
-    conda install -c conda-forge pyarrow=0.14.1 && \
-    conda install -c mlio -c conda-forge mlio-py=0.1
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
 
 # Python won’t try to write .pyc or .pyo files on the import of source modules
 # Force stdin, stdout and stderr to be totally unbuffered. Good for logging
@@ -37,5 +23,101 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 ENV PYTHONIOENCODING='utf-8'
 
+RUN rm /etc/apt/sources.list.d/cuda.list && \
+    rm /etc/apt/sources.list.d/nvidia-ml.list && \
+    apt-key del 7fa2af80 && \
+    apt-get update && apt-get install -y --no-install-recommends wget && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb && \
+    apt-get update && \
+    apt-get -y upgrade && \
+    apt-get -y install --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        jq \
+        libatlas-base-dev \
+        nginx \
+        openjdk-8-jdk-headless \
+        unzip \
+        wget \
+    && \
+    # MLIO build dependencies
+    # The official Ubuntu APT repositories do not carry a CMake recent enough to build MLIO;
+    # the Kitware repository provides the latest CMake.
+    apt-get -y install --no-install-recommends \
+        apt-transport-https \
+        ca-certificates \
+        gnupg \
+        software-properties-common \
+    && \
+    wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | \
+        gpg --dearmor - | \
+        tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \
+    echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ bionic main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null && \
+    apt-get update && \
+    rm /usr/share/keyrings/kitware-archive-keyring.gpg && \
+    apt-get install -y --no-install-recommends \
+        autoconf \
+        automake \
+        build-essential \
+        cmake=3.18.4-0kitware1 \
+        cmake-data=3.18.4-0kitware1 \
+        doxygen \
+        kitware-archive-keyring \
+        libcurl4-openssl-dev \
+        libssl-dev \
+        libtool \
+        ninja-build \
+        python3-dev \
+        python3-distutils \
+        python3-pip \
+        zlib1g-dev \
+    && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install conda
+RUN cd /tmp && \
+    curl -L --output /tmp/Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-py${CONDA_PY_VERSION}_${MINICONDA_VERSION}-Linux-x86_64.sh && \
+    echo "${CONDA_CHECKSUM} /tmp/Miniconda3.sh" | md5sum -c - && \
+    bash /tmp/Miniconda3.sh -bfp /miniconda3 && \
+    rm /tmp/Miniconda3.sh
+
+ENV PATH=/miniconda3/bin:${PATH}
+
+# Install MLIO with Apache Arrow integration
+# We could install mlio-py from conda, but it comes with extras such as an image reader that
+# increase the image size and, in turn, training time. Building from source keeps the image minimal.
+RUN echo "conda ${CONDA_PKG_VERSION}" >> /miniconda3/conda-meta/pinned && \
+    # Conda configuration, see https://conda.io/projects/conda/en/latest/configuration.html
+    conda config --system --set auto_update_conda false && \
+    conda config --system --set show_channel_urls true && \
+    echo "python ${PYTHON_VERSION}.*" >> /miniconda3/conda-meta/pinned && \
+    conda install -c conda-forge python=${PYTHON_VERSION} && \
+    conda install conda=${CONDA_PKG_VERSION} && \
+    conda update -y conda && \
+    conda install -c conda-forge pyarrow=${PYARROW_VERSION} && \
+    cd /tmp && \
+    git clone --branch v${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio && \
+    cd mlio && \
+    build-tools/build-dependency build/third-party all && \
+    mkdir -p build/release && \
+    cd build/release && \
+    cmake -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(pwd)/../third-party" ../.. && \
+    cmake --build . && \
+    cmake --build . --target install && \
+    cmake -DMLIO_INCLUDE_PYTHON_EXTENSION=ON -DPYTHON_EXECUTABLE="/miniconda3/bin/python3" \
+        -DMLIO_INCLUDE_ARROW_INTEGRATION=ON ../.. && \
+    cmake --build . --target mlio-py && \
+    cmake --build . --target mlio-arrow && \
+    cd ../../src/mlio-py && \
+    python3 setup.py bdist_wheel && \
+    python3 -m pip install typing && \
+    python3 -m pip install --upgrade pip && \
+    python3 -m pip install dist/*.whl && \
+    cp -r /tmp/mlio/build/third-party/lib/intel64/gcc4.7/* /usr/local/lib/ && \
+    ldconfig && \
+    rm -rf /tmp/mlio
+
 # Install latest version of XGBoost
-RUN python3 -m pip install --no-cache -I xgboost==1.0
+RUN python3 -m pip install --no-cache -I xgboost==${XGBOOST_VERSION}
\ No newline at end of file
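The base image verifies the Miniconda installer against the pinned `CONDA_CHECKSUM` before executing it. A rough Python equivalent of that `md5sum -c` gate, assuming the installer has already been downloaded to `/tmp/Miniconda3.sh`:

```python
import hashlib

CONDA_CHECKSUM = "b4e46fcc8029e2cfa731b788f25b1d36"  # value pinned in the ARG above

# Hash the downloaded installer and refuse to proceed on a mismatch,
# mirroring the `md5sum -c` check in the RUN instruction.
with open("/tmp/Miniconda3.sh", "rb") as installer:
    digest = hashlib.md5(installer.read()).hexdigest()
if digest != CONDA_CHECKSUM:
    raise SystemExit("Miniconda installer checksum mismatch; aborting install")
```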
diff --git a/docker/1.0-1/final/Dockerfile.cpu b/docker/1.0-1/final/Dockerfile.cpu
index ea471596..2b9732ff 100644
--- a/docker/1.0-1/final/Dockerfile.cpu
+++ b/docker/1.0-1/final/Dockerfile.cpu
@@ -1,5 +1,9 @@
-FROM xgboost-container-base:1.0-1-cpu-py3
-ENV SAGEMAKER_XGBOOST_VERSION 1.0-1
+ARG SAGEMAKER_XGBOOST_VERSION=1.0-1
+ARG PYTHON_VERSION=3.8
+
+FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3
+
+ARG SAGEMAKER_XGBOOST_VERSION
 
 ########################
 # Install dependencies #
 ########################
@@ -11,9 +15,9 @@ RUN python3 -m pip install -r /requirements.txt && rm /requirements.txt
 ###########################
 # Copy wheel to container #
 ###########################
 COPY dist/sagemaker_xgboost_container-2.0-py2.py3-none-any.whl /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl
-# https://github.com/googleapis/google-cloud-python/issues/6647
-RUN rm -rf /miniconda3/lib/python3.6/site-packages/numpy-1.19.5.dist-info && \
+RUN rm -rf /miniconda3/lib/python3.8/site-packages/numpy-1.21.2.dist-info && \
     python3 -m pip install --no-cache /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl && \
+    python3 -m pip uninstall -y typing && \
     rm /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl
 
 ##############
@@ -21,10 +25,10 @@ RUN rm -rf /miniconda3/lib/python3.6/site-packages/numpy-1.19.5.dist-info && \
 ##############
 # TODO: remove after making contributions back to xgboost for tracker.py
 COPY src/sagemaker_xgboost_container/dmlc_patch/tracker.py \
-    /miniconda3/lib/python3.6/site-packages/xgboost/dmlc-core/tracker/dmlc_tracker/tracker.py
+    /miniconda3/lib/python${PYTHON_VERSION}/site-packages/xgboost/dmlc-core/tracker/dmlc_tracker/tracker.py
 
 # Include DMLC python code in PYTHONPATH to use RabitTracker
-ENV PYTHONPATH=$PYTHONPATH:/miniconda3/lib/python3.6/site-packages/xgboost/dmlc-core/tracker
+ENV PYTHONPATH=$PYTHONPATH:/miniconda3/lib/python${PYTHON_VERSION}/site-packages/xgboost/dmlc-core/tracker
 
 #######
 # MMS #
 #######
@@ -34,12 +38,12 @@ RUN useradd -m model-server
 RUN mkdir -p /home/model-server/tmp && chown -R model-server /home/model-server
 
 # Copy MMS configs
-COPY docker/$SAGEMAKER_XGBOOST_VERSION/resources/mms/config.properties.tmp /home/model-server
+COPY docker/${SAGEMAKER_XGBOOST_VERSION}/resources/mms/config.properties.tmp /home/model-server
 ENV XGBOOST_MMS_CONFIG=/home/model-server/config.properties
 
 # Copy execution parameters endpoint plugin for MMS
 RUN mkdir -p /tmp/plugins
-COPY docker/$SAGEMAKER_XGBOOST_VERSION/resources/mms/endpoints-1.0.jar /tmp/plugins
+COPY docker/${SAGEMAKER_XGBOOST_VERSION}/resources/mms/endpoints-1.0.jar /tmp/plugins
 RUN chmod +x /tmp/plugins/endpoints-1.0.jar
 
 # Create directory for models
@@ -67,4 +71,4 @@ ENV SAGEMAKER_SERVING_MODULE sagemaker_xgboost_container.serving:main
 
 EXPOSE 8080
 ENV TEMP=/home/model-server/tmp
-LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
\ No newline at end of file
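The new `pip uninstall -y typing` step removes the PyPI `typing` backport that the base image's MLIO build installs; on Python 3.8 that backport shadows the standard-library module and can break imports. A hypothetical smoke test, not part of the PR, that one could run inside the final image:

```python
# Hypothetical check: after `pip uninstall -y typing`, `typing` should
# resolve to the standard library, not to a site-packages backport.
import typing

assert "site-packages" not in typing.__file__, "PyPI typing backport still installed"
print("typing resolved from:", typing.__file__)
```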
diff --git a/requirements.txt b/requirements.txt
index 9529a1da..0053e600 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,21 +1,27 @@
 Flask==1.1.1 # sagemaker-containers requires flask 1.1.1
 PyYAML==5.4.1
-boto3==1.10.14
-botocore==1.13.14
-gunicorn<20.0.0
-cryptography==3.4.6
-matplotlib==3.3.2
-multi-model-server==1.1.1
+Pillow==9.1.0
+boto3==1.17.52
+botocore==1.20.52
+cryptography==35.0.0
+gunicorn==19.10.0
+itsdangerous==2.0.1
+matplotlib==3.4.1
+multi-model-server==1.1.2
 numpy==1.19.2
-pandas==1.1.3
+pandas==1.2.4
+protobuf==3.20.1
 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7
-python-dateutil==2.8.0
-requests<2.21
+python-dateutil==2.8.1
+requests==2.25.1
 retrying==1.3.3
-sagemaker-containers>=2.8.3,<2.9
-sagemaker-inference==1.2.0
-scikit-learn==0.23.2
-scipy==1.2.2
-smdebug==0.4.13
-urllib3==1.25.9
-wheel
+sagemaker-containers==2.8.6.post2
+sagemaker-inference==1.5.5
+scikit-learn==0.24.1
+scipy==1.6.2
+smdebug==1.0.10
+urllib3==1.26.5
+wheel==0.36.2
+jinja2==2.11.3
+MarkupSafe==1.1.1
+Werkzeug==0.15.6
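Every runtime dependency is now an exact `==` pin (the former ranges such as `gunicorn<20.0.0` and `requests<2.21` are gone), which keeps image builds reproducible under the pinned pip 20.1 resolver. A small sketch, not part of the PR, that flags any line drifting back to a range:

```python
def unpinned(requirements_text):
    """Yield requirement lines that are not exact '==' pins."""
    for line in requirements_text.splitlines():
        req = line.split("#")[0].strip()  # ignore comments and blank lines
        if req and "==" not in req:
            yield req

with open("requirements.txt") as f:
    loose = list(unpinned(f.read()))
print(loose if loose else "all dependencies exactly pinned")
```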
diff --git a/src/sagemaker_xgboost_container/data_utils.py b/src/sagemaker_xgboost_container/data_utils.py
index d17efdae..f42344ee 100644
--- a/src/sagemaker_xgboost_container/data_utils.py
+++ b/src/sagemaker_xgboost_container/data_utils.py
@@ -442,40 +442,34 @@ def get_parquet_dmatrix(path, is_pipe=False):
 
 def get_recordio_protobuf_dmatrix(path, is_pipe=False):
     """Get Data Matrix from recordio-protobuf data.
-
     :param path: Path where recordio-protobuf formatted training data resides, either directory, file, or SageMaker pipe
     :param is_pipe: Boolean to indicate if data is being read in pipe mode
     :return: xgb.DMatrix or None
     """
     try:
         if is_pipe:
-            dataset = [mlio.SageMakerPipe(path)]
-            reader = mlio.RecordIOProtobufReader(dataset=dataset,
-                                                 batch_size=BATCH_SIZE)
+            pipes_path = path if isinstance(path, list) else [path]
+            dataset = [mlio.SageMakerPipe(pipe_path) for pipe_path in pipes_path]
         else:
             dataset = mlio.list_files(path)
-            reader = mlio.RecordIOProtobufReader(dataset=dataset,
-                                                 batch_size=BATCH_SIZE)
+
+        reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
+        reader = mlio.RecordIOProtobufReader(reader_params)
 
         if reader.peek_example() is not None:
             # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
-            if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
-                to_matrix = as_numpy
-                vstack = np.vstack
-            else:
-                to_matrix = to_coo_matrix
-                vstack = scipy_vstack
+            is_dense_tensor = type(reader.peek_example()['values']) is mlio.DenseTensor
 
             all_features = []
             all_labels = []
             for example in reader:
-                features = to_matrix(example['values'])
+                features = as_numpy(example['values']) if is_dense_tensor else to_coo_matrix(example['values'])
                 all_features.append(features)
 
                 labels = as_numpy(example['label_values'])
                 all_labels.append(labels)
 
-            all_features = vstack(all_features)
+            all_features = np.vstack(all_features) if is_dense_tensor else scipy_vstack(all_features).tocsr()
             all_labels = np.concatenate(all_labels, axis=None)
             dmatrix = xgb.DMatrix(all_features, label=all_labels)
             return dmatrix
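The hunk above tracks the MLIO 0.7 API: reader options now go through `mlio.DataReaderParams`, `mlio.core.DenseTensor` is exposed as `mlio.DenseTensor`, and pipe mode accepts a list of pipe paths. A condensed sketch of the new read path, with the directory path and batch size chosen for illustration and imports mirroring those `data_utils.py` already uses:

```python
import mlio
import numpy as np
import xgboost as xgb
from mlio.integ.numpy import as_numpy
from mlio.integ.scipy import to_coo_matrix
from scipy.sparse import vstack as scipy_vstack

# MLIO 0.7-style construction: build the params object first, then the reader.
dataset = mlio.list_files("/opt/ml/input/data/train")
params = mlio.DataReaderParams(dataset=dataset, batch_size=1000)
reader = mlio.RecordIOProtobufReader(params)

if reader.peek_example() is not None:
    # Dense tensors stack via numpy; sparse ones via scipy, converted to CSR for XGBoost.
    dense = type(reader.peek_example()["values"]) is mlio.DenseTensor
    features = [
        as_numpy(example["values"]) if dense else to_coo_matrix(example["values"])
        for example in reader
    ]
    stacked = np.vstack(features) if dense else scipy_vstack(features).tocsr()
    dmatrix = xgb.DMatrix(stacked)
```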
diff --git a/src/sagemaker_xgboost_container/encoder.py b/src/sagemaker_xgboost_container/encoder.py
index 814bbac1..f0f15f67 100644
--- a/src/sagemaker_xgboost_container/encoder.py
+++ b/src/sagemaker_xgboost_container/encoder.py
@@ -18,7 +18,7 @@ import logging
 import os
 import tempfile
-from typing import Iterable
+from typing import Iterable, Union
 
 import mlio
 from mlio.integ.numpy import as_numpy
@@ -35,21 +35,23 @@ def _clean_csv_string(csv_string, delimiter):
     return ['nan' if x == '' else x for x in csv_string.split(delimiter)]
 
 
-def csv_to_dmatrix(string_like, dtype=None):  # type: (str) -> xgb.DMatrix
+def csv_to_dmatrix(input: Union[str, bytes], dtype=None) -> xgb.DMatrix:
     """Convert a CSV object to a DMatrix object.
     Args:
-        string_like (str): CSV string. Assumes the string has been stripped of leading or trailing newline chars.
+        input (str or bytes): CSV string or bytes object encoded in UTF-8.
+            Assumes the string has been stripped of leading or trailing newline chars.
         dtype (dtype, optional): Data type of the resulting array. If None, the dtypes will be determined by the
             contents of each column, individually. This argument can only be used to 'upcast' the array. For
             downcasting, use the .astype(t) method.
     Returns:
         (xgb.DMatrix): XGBoost DataMatrix
     """
-    sniff_delimiter = csv.Sniffer().sniff(string_like.split('\n')[0][:512]).delimiter
+    csv_string = input.decode() if isinstance(input, bytes) else input
+    sniff_delimiter = csv.Sniffer().sniff(csv_string.split('\n')[0][:512]).delimiter
     delimiter = ',' if sniff_delimiter.isalnum() else sniff_delimiter
     logging.info("Determined delimiter of CSV input is \'{}\'".format(delimiter))
-    np_payload = np.array(list(map(lambda x: _clean_csv_string(x, delimiter), string_like.split('\n')))).astype(dtype)
+    np_payload = np.array(list(map(lambda x: _clean_csv_string(x, delimiter), csv_string.split('\n')))).astype(dtype)
     return xgb.DMatrix(np_payload)
 
@@ -83,21 +85,18 @@ def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
     """
     buf = bytes(string_like)
     dataset = [mlio.InMemoryStore(buf)]
-    reader = mlio.RecordIOProtobufReader(dataset=dataset, batch_size=100)
+    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=100)
+    reader = mlio.RecordIOProtobufReader(reader_params)
 
-    if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
-        to_matrix = as_numpy
-        vstack = np.vstack
-    else:
-        to_matrix = to_coo_matrix
-        vstack = scipy_vstack
+    is_dense_tensor = type(reader.peek_example()['values']) is mlio.DenseTensor
 
     examples = []
     for example in reader:
-        tmp = to_matrix(example['values'])  # Ignore labels if present
-        examples.append(tmp)
+        # Ignore labels if present
+        values = as_numpy(example['values']) if is_dense_tensor else to_coo_matrix(example['values'])
+        examples.append(values)
 
-    data = vstack(examples)
+    data = np.vstack(examples) if is_dense_tensor else scipy_vstack(examples).tocsr()
     dmatrix = xgb.DMatrix(data)
     return dmatrix
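With the widened signature, `csv_to_dmatrix` accepts either a decoded string or the raw UTF-8 bytes a serving request may deliver. A quick usage sketch with a toy payload:

```python
from sagemaker_xgboost_container.encoder import csv_to_dmatrix

payload = "1.0,2.0,3.0\n4.0,5.0,6.0"

dm_from_str = csv_to_dmatrix(payload)
dm_from_bytes = csv_to_dmatrix(payload.encode("utf-8"))  # same result via the bytes path
assert dm_from_str.num_row() == dm_from_bytes.num_row() == 2
```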
diff --git a/test-requirements.txt b/test-requirements.txt
index 2a0248cb..0108ed40 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -1,3 +1,4 @@
+Flask==1.1.1 # sagemaker-containers requires flask 1.1.1
 coverage
 docker-compose
 flake8
diff --git a/tox.ini b/tox.ini
index 70c3b119..6e7f4a66 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = {py36}-xgboost{1.0},flake8
+envlist = {py38}-xgboost{1.0},flake8
 
 [flake8]
 max-line-length = 120
@@ -11,19 +11,22 @@
 deps =
     xgboost0.82: xgboost==0.82
    xgboost0.90: xgboost==0.90
    xgboost1.0: xgboost==1.0
+    xgboost1.2: xgboost==1.2
+    xgboost1.3: xgboost==1.3.3
+    xgboost1.5: xgboost==1.5.2
     xgboostlatest: xgboost
     -r{toxinidir}/requirements.txt
     -r{toxinidir}/test-requirements.txt
 conda_deps=
-    pyarrow=0.14.1
-    mlio-py=0.1
+    pyarrow==1.0.1
+    tbb==2020.2
+    mlio-py==0.7.0
 conda_channels=
     conda-forge
     mlio
 commands = pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit # increase minimum bar over time (75%+)
-install_command = python3 -m pip install {opts} {packages} --use-deprecated=legacy-resolver
 
 [testenv:flake8]
 deps = flake8
-commands = flake8 setup.py src test
+commands = flake8 setup.py src test
\ No newline at end of file
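The added factor pins make newer XGBoost versions runnable on demand even though only `py38-xgboost1.0` sits in the default envlist, since tox applies factor-conditional deps to any env whose name contains the factor. For example (assuming tox is installed):

```python
import subprocess

# Each name below combines the py38 factor with one of the xgboost pins
# declared in deps; tox builds and tests each env in turn.
for env in ("py38-xgboost1.0", "py38-xgboost1.3", "py38-xgboost1.5"):
    subprocess.run(["tox", "-e", env], check=True)
```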