Initial integration with AutoDist #72
base: master
Changes from all commits
a27ac73
17f8f49
95c77f8
5f68304
dbb39cc
5dcff5b
c220221
f146516
c012cde
New file (82 lines): a Dockerfile for the AutoDist integration image. It builds the AdaptDL wheel on top of the TensorFlow 2.2.0 GPU image, clones the `integration` branch of autodist, builds its protobuf definitions, and configures passwordless SSH between containers.

```dockerfile
# Copyright 2020 Petuum, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM python:3.6.12-buster
WORKDIR /root

FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime

FROM tensorflow/tensorflow:2.2.0-gpu
# Install apps
COPY adaptdl adaptdl
COPY examples/requirements.txt .

RUN cd adaptdl && python3 setup.py bdist_wheel

ARG ADAPTDL_VERSION=0.0.0
RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl
RUN pip install -r requirements.txt

RUN rm -rf adaptdl/dist

# autodist env
SHELL ["/bin/bash", "-cu"]

RUN rm -rf /etc/bash.bashrc

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
        build-essential \
        git \
        curl \
        vim \
        wget \
        unzip
WORKDIR /root
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

COPY bert_config.json bert_config.json
COPY tf_examples.tfrecord tf_examples.tfrecord
RUN git clone https://github.com/petuum/autodist.git
WORKDIR autodist
RUN git checkout integration
RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip
RUN unzip protoc-3.11.0-linux-x86_64.zip
RUN PROTOC=$(pwd)/bin/protoc python setup.py build
RUN pip install -e .[dev]
RUN pip install tensorflow_hub
WORKDIR autodist

# setup ssh
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
    mkdir -p /var/run/sshd

WORKDIR /root
RUN mkdir /root/.ssh
RUN ssh-keygen -f /root/.ssh/id_rsa && cat /root/.ssh/id_rsa.pub | cat >> /root/.ssh/authorized_keys
RUN chown -R root /root/.ssh
RUN chmod 700 /root/.ssh
RUN chmod 600 /root/.ssh/authorized_keys

RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config

# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

ENV PYTHONUNBUFFERED=true
```
New file (28 lines): an AdaptDLJob manifest that runs the AutoDist BERT benchmark with AdaptDL enabled (`ADAPTDL=true`), using the PS strategy and one GPU per replica.

```yaml
apiVersion: adaptdl.petuum.com/v1
kind: AdaptDLJob
metadata:
  generateName: integration-
spec:
  minReplicas: 2
  template:
    spec:
      containers:
      - name: main
        command:
        - python3
        - /root/autodist/examples/benchmark/bert_with_adaptdl.py
        - -input_files=/root/tf_examples.tfrecord
        - --bert_config_file=/root/bert_config.json
        - --num_train_epochs=1
        - --num_steps_per_epoch=1000
        - --learning_rate=5e-5
        - --steps_per_loop=1
        - --autodist_strategy=PS
        env:
        - name: ADAPTDL
          value: "true"
        resources:
          limits:
            nvidia.com/gpu: 1
```
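For reference, below is a minimal sketch of submitting a manifest like the one above from Python with the official `kubernetes` client. This helper is not part of the PR; the CRD plural `adaptdljobs`, the `default` namespace, and the local filename are assumptions, while the group and version come from the manifest's `apiVersion: adaptdl.petuum.com/v1`.

```python
# Hypothetical helper, not part of this PR: submit an AdaptDLJob manifest from Python.
# Assumes the AdaptDLJob CRD uses the plural "adaptdljobs" and the job targets the
# "default" namespace; group/version come from apiVersion: adaptdl.petuum.com/v1.
import yaml
from kubernetes import client, config


def submit_adaptdl_job(manifest_path: str, namespace: str = "default") -> dict:
    config.load_kube_config()  # use config.load_incluster_config() when running in-cluster
    with open(manifest_path) as f:
        body = yaml.safe_load(f)
    api = client.CustomObjectsApi()
    # generateName in the manifest lets the API server pick a unique job name.
    return api.create_namespaced_custom_object(
        group="adaptdl.petuum.com",
        version="v1",
        namespace=namespace,
        plural="adaptdljobs",  # assumed CRD plural
        body=body,
    )


if __name__ == "__main__":
    created = submit_adaptdl_job("adaptdljob.yaml")  # hypothetical local filename
    print(created["metadata"]["name"])
```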
New file (13 lines): bert_config.json, a BERT-Large model configuration referenced by `--bert_config_file`.

```json
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 512,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "type_vocab_size": 2,
  "vocab_size": 30522
}
```
New file (96 lines): a second Dockerfile variant that copies a local `autodist` checkout (and a pre-downloaded protoc archive) into the image instead of cloning from GitHub.

```dockerfile
# Copyright 2020 Petuum, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM python:3.6.12-buster
WORKDIR /root

FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
WORKDIR /root

FROM tensorflow/tensorflow:2.2.0-gpu

# Set default shell to /bin/bash
# SHELL ["/bin/bash", "-cu"]

# RUN rm -rf /etc/bash.bashrc

# Install apps
COPY adaptdl adaptdl
COPY examples/requirements.txt .

RUN cd adaptdl && python3 setup.py bdist_wheel

ARG ADAPTDL_VERSION=0.0.0
RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl
RUN pip install -r requirements.txt

RUN rm -rf adaptdl/dist
WORKDIR /root
COPY examples examples_adaptdl
#COPY examples examples
#RUN apt-get update && apt-get install -y --no-install-recommends apt-utils

# autodist env
SHELL ["/bin/bash", "-cu"]

RUN rm -rf /etc/bash.bashrc

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
        build-essential \
        git \
        curl \
        vim \
        wget \
        unzip

RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

WORKDIR /root
COPY bert_config.json bert_config.json
COPY tf_examples.tfrecord tf_examples.tfrecord
COPY autodist autodist

RUN cd autodist
RUN pip install tensorflow_hub
RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip
COPY autodist/protoc-3.11.0-linux-x86_64.zip protoc-3.11.0-linux-x86_64.zip
RUN unzip protoc-3.11.0-linux-x86_64.zip
RUN PROTOC=autodist/bin/protoc python autodist/setup.py build
WORKDIR autodist
RUN rm ./examples/resource_spec.yml
RUN pip install -e .[dev]

# setup ssh
# Install OpenSSH to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
    mkdir -p /var/run/sshd

WORKDIR /root
RUN mkdir /root/.ssh
RUN ssh-keygen -f /root/.ssh/id_rsa && cat /root/.ssh/id_rsa.pub | cat >> /root/.ssh/authorized_keys
RUN chown -R root /root/.ssh
RUN chmod 700 /root/.ssh
RUN chmod 600 /root/.ssh/authorized_keys

RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config

# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

ENV PYTHONUNBUFFERED=true
```

Reviewer comment (Contributor), on `COPY autodist autodist`: Some of these COPY commands cannot work in a fresh clone of the AdaptDL repo. Can you make sure this example works in that setting? Maybe `git clone` autodist instead of assuming it exists locally?
New file (25 lines): an AdaptDLJob manifest for the baseline AutoDist BERT benchmark (`bert.py`, without the `ADAPTDL` environment variable).

```yaml
apiVersion: adaptdl.petuum.com/v1
kind: AdaptDLJob
metadata:
  generateName: integration-
spec:
  minReplicas: 2
  template:
    spec:
      containers:
      - name: main
        command:
        - python3
        - /root/autodist/examples/benchmark/bert.py
        - -input_files=/root/tf_examples.tfrecord
        - --bert_config_file=/root/bert_config.json
        - --num_train_epochs=1
        - --num_steps_per_epoch=1000
        - --learning_rate=5e-5
        - --steps_per_loop=1
        - --autodist_strategy=PS
        resources:
          limits:
            nvidia.com/gpu: 1
```
New file (13 lines): a second copy of bert_config.json, identical to the one above.

```json
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 512,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "type_vocab_size": 2,
  "vocab_size": 30522
}
```
Modified file: the `_handle_discover` handler now optionally reports each pod's GPU request alongside its IP. When the caller supplies a `gpu` query parameter, the response becomes a JSON list of `(pod_ip, gpu_count)` pairs instead of a plain list of pod IPs. (Indentation below is reconstructed; the added lines are marked with `+`.)

```diff
@@ -49,6 +49,7 @@ async def _handle_discover(self, request):
         group = request.match_info["group"]
         timeout = int(request.query.get("timeout", "30"))
         pod_ip_list = None
+        pod_gpu_list = None
         async with kubernetes.watch.Watch() as w:
             stream = w.stream(self._core_api.list_namespaced_pod, namespace,
                               label_selector="adaptdl/job={}".format(name),
@@ -62,6 +63,27 @@ async def _handle_discover(self, request):
                     if pod_ip_list is None:
                         pod_ip_list = [None] * replicas
                     pod_ip_list[rank] = pod.status.pod_ip
+                    try:
+                        gpu_request = request.rel_url.query["gpu"]
+                    except KeyError:
+                        gpu_request = False
+                    if gpu_request:
+                        if pod_gpu_list is None:
+                            pod_gpu_list = [None] * replicas
+                        container = pod.spec.containers
+                        assert len(container) == 1
+                        pod_gpu_list[rank] = \
+                            int(container[0].resources.requests[
+                                'nvidia.com/gpu'])
+                        if all(pod_gpu is not None
+                               for pod_gpu in pod_gpu_list) and \
+                                all(pod_ip is not None
+                                    for pod_ip in pod_ip_list):
+                            assert len(pod_ip_list) == len(pod_gpu_list)
+                            return_list = [(pod_ip_list[i], pod_gpu_list[i])
+                                           for i in range(len(pod_ip_list))]
+                            LOG.info(return_list)
+                            return web.json_response(return_list)
                     if all(pod_ip is not None for pod_ip in pod_ip_list):
                         return web.json_response(pod_ip_list)
         return web.json_response(status=408)  # Timeout.
```

Reviewer comment (Contributor): shall we unify the return values with L87?
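To make the new response shape concrete, here is a hedged client-side sketch of how a worker process could call this endpoint and consume the `(pod_ip, gpu_count)` pairs. The route shape `/discover/{namespace}/{name}/{group}` and the value passed for the `gpu` parameter are assumptions, not taken from this diff; only the `timeout` parameter, the 408 timeout status, and the two response formats follow from the code above.

```python
# Hypothetical client sketch, not part of this PR.
# Assumes the discover handler is served at /discover/{namespace}/{name}/{group}
# and that any non-empty "gpu" query value switches on the (ip, gpu) response.
import requests


def discover_pods(supervisor_url, namespace, name, group, want_gpus=True, timeout=30):
    url = f"{supervisor_url}/discover/{namespace}/{name}/{group}"  # assumed route
    params = {"timeout": str(timeout)}
    if want_gpus:
        params["gpu"] = "1"  # any non-empty value triggers the GPU-aware response
    resp = requests.get(url, params=params, timeout=timeout + 5)
    if resp.status_code == 408:
        raise TimeoutError("discovery timed out before all pods were ready")
    resp.raise_for_status()
    data = resp.json()
    if want_gpus:
        # e.g. [["10.0.0.12", 1], ["10.0.0.13", 1]]
        return [(ip, int(gpus)) for ip, gpus in data]
    return data  # plain list of pod IPs


# Example (hypothetical supervisor address and job name):
# replicas = discover_pods("http://adaptdl-supervisor:8080", "default", "integration-abcde", "0")
```

Presumably these `(pod_ip, gpu_count)` pairs are what the AutoDist side uses to construct its resource spec at runtime, which would explain why the second Dockerfile deletes autodist's static `examples/resource_spec.yml`.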