Initial integration with AutoDist #72
base: master
Changes from all commits
a27ac73
17f8f49
95c77f8
5f68304
dbb39cc
5dcff5b
c220221
f146516
c012cde
New file (82 lines): a Dockerfile for the AutoDist integration image. It builds the AdaptDL wheel on top of the TensorFlow 2.2.0 GPU image, clones the `integration` branch of autodist, builds its protobuf definitions, and configures passwordless SSH between containers.

```dockerfile
# Copyright 2020 Petuum, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM python:3.6.12-buster
WORKDIR /root

FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime

FROM tensorflow/tensorflow:2.2.0-gpu
# Install apps
COPY adaptdl adaptdl
COPY examples/requirements.txt .

RUN cd adaptdl && python3 setup.py bdist_wheel

ARG ADAPTDL_VERSION=0.0.0
RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl
RUN pip install -r requirements.txt

RUN rm -rf adaptdl/dist

# autodist env
SHELL ["/bin/bash", "-cu"]

RUN rm -rf /etc/bash.bashrc

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
        build-essential \
        git \
        curl \
        vim \
        wget \
        unzip
WORKDIR /root
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

COPY bert_config.json bert_config.json
COPY tf_examples.tfrecord tf_examples.tfrecord
RUN git clone https://github.com/petuum/autodist.git
WORKDIR autodist
RUN git checkout integration
RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip
RUN unzip protoc-3.11.0-linux-x86_64.zip
RUN PROTOC=$(pwd)/bin/protoc python setup.py build
RUN pip install -e .[dev]
RUN pip install tensorflow_hub
WORKDIR autodist

# setup ssh
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
    mkdir -p /var/run/sshd

WORKDIR /root
RUN mkdir /root/.ssh
RUN ssh-keygen -f /root/.ssh/id_rsa && cat /root/.ssh/id_rsa.pub | cat >> /root/.ssh/authorized_keys
RUN chown -R root /root/.ssh
RUN chmod 700 /root/.ssh
RUN chmod 600 /root/.ssh/authorized_keys

RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config

# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

ENV PYTHONUNBUFFERED=true
```
New file (28 lines): an AdaptDLJob manifest that runs the AutoDist BERT benchmark with AdaptDL enabled (`ADAPTDL=true`), using the PS strategy and one GPU per replica.

```yaml
apiVersion: adaptdl.petuum.com/v1
kind: AdaptDLJob
metadata:
  generateName: integration-
spec:
  minReplicas: 2
  template:
    spec:
      containers:
      - name: main
        command:
        - python3
        - /root/autodist/examples/benchmark/bert_with_adaptdl.py
        - -input_files=/root/tf_examples.tfrecord
        - --bert_config_file=/root/bert_config.json
        - --num_train_epochs=1
        - --num_steps_per_epoch=1000
        - --learning_rate=5e-5
        - --steps_per_loop=1
        - --autodist_strategy=PS
        env:
        - name: ADAPTDL
          value: "true"
        resources:
          limits:
            nvidia.com/gpu: 1
```
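For reference, below is a minimal sketch of submitting a manifest like the one above from Python with the official `kubernetes` client. This helper is not part of the PR; the CRD plural `adaptdljobs`, the `default` namespace, and the local filename are assumptions, while the group and version come from the manifest's `apiVersion: adaptdl.petuum.com/v1`.

```python
# Hypothetical helper, not part of this PR: submit an AdaptDLJob manifest from Python.
# Assumes the AdaptDLJob CRD uses the plural "adaptdljobs" and the job targets the
# "default" namespace; group/version come from apiVersion: adaptdl.petuum.com/v1.
import yaml
from kubernetes import client, config


def submit_adaptdl_job(manifest_path: str, namespace: str = "default") -> dict:
    config.load_kube_config()  # use config.load_incluster_config() when running in-cluster
    with open(manifest_path) as f:
        body = yaml.safe_load(f)
    api = client.CustomObjectsApi()
    # generateName in the manifest lets the API server pick a unique job name.
    return api.create_namespaced_custom_object(
        group="adaptdl.petuum.com",
        version="v1",
        namespace=namespace,
        plural="adaptdljobs",  # assumed CRD plural
        body=body,
    )


if __name__ == "__main__":
    created = submit_adaptdl_job("adaptdljob.yaml")  # hypothetical local filename
    print(created["metadata"]["name"])
```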
New file (13 lines): bert_config.json, a BERT-Large model configuration referenced by `--bert_config_file`.

```json
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 512,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "type_vocab_size": 2,
  "vocab_size": 30522
}
```
New file (96 lines): a second Dockerfile variant that copies a local `autodist` checkout (and a pre-downloaded protoc archive) into the image instead of cloning from GitHub.

```dockerfile
# Copyright 2020 Petuum, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM python:3.6.12-buster
WORKDIR /root

FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
WORKDIR /root

FROM tensorflow/tensorflow:2.2.0-gpu

# Set default shell to /bin/bash
# SHELL ["/bin/bash", "-cu"]

# RUN rm -rf /etc/bash.bashrc

# Install apps
COPY adaptdl adaptdl
COPY examples/requirements.txt .

RUN cd adaptdl && python3 setup.py bdist_wheel

ARG ADAPTDL_VERSION=0.0.0
RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl
RUN pip install -r requirements.txt

RUN rm -rf adaptdl/dist
WORKDIR /root
COPY examples examples_adaptdl
#COPY examples examples
#RUN apt-get update && apt-get install -y --no-install-recommends apt-utils

# autodist env
SHELL ["/bin/bash", "-cu"]

RUN rm -rf /etc/bash.bashrc

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
        build-essential \
        git \
        curl \
        vim \
        wget \
        unzip

RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

WORKDIR /root
COPY bert_config.json bert_config.json
COPY tf_examples.tfrecord tf_examples.tfrecord
COPY autodist autodist

RUN cd autodist
RUN pip install tensorflow_hub
RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip
COPY autodist/protoc-3.11.0-linux-x86_64.zip protoc-3.11.0-linux-x86_64.zip
RUN unzip protoc-3.11.0-linux-x86_64.zip
RUN PROTOC=autodist/bin/protoc python autodist/setup.py build
WORKDIR autodist
RUN rm ./examples/resource_spec.yml
RUN pip install -e .[dev]

# setup ssh
# Install OpenSSH to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
    mkdir -p /var/run/sshd

WORKDIR /root
RUN mkdir /root/.ssh
RUN ssh-keygen -f /root/.ssh/id_rsa && cat /root/.ssh/id_rsa.pub | cat >> /root/.ssh/authorized_keys
RUN chown -R root /root/.ssh
RUN chmod 700 /root/.ssh
RUN chmod 600 /root/.ssh/authorized_keys

RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config

# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

ENV PYTHONUNBUFFERED=true
```

Reviewer comment (Contributor), on `COPY autodist autodist`: Some of these COPY commands cannot work in a fresh clone of the AdaptDL repo. Can you make sure this example works in that setting? Maybe `git clone` autodist instead of assuming it exists locally?
New file (25 lines): an AdaptDLJob manifest for the baseline AutoDist BERT benchmark (`bert.py`, without the `ADAPTDL` environment variable).

```yaml
apiVersion: adaptdl.petuum.com/v1
kind: AdaptDLJob
metadata:
  generateName: integration-
spec:
  minReplicas: 2
  template:
    spec:
      containers:
      - name: main
        command:
        - python3
        - /root/autodist/examples/benchmark/bert.py
        - -input_files=/root/tf_examples.tfrecord
        - --bert_config_file=/root/bert_config.json
        - --num_train_epochs=1
        - --num_steps_per_epoch=1000
        - --learning_rate=5e-5
        - --steps_per_loop=1
        - --autodist_strategy=PS
        resources:
          limits:
            nvidia.com/gpu: 1
```
New file (13 lines): a second copy of bert_config.json, identical to the one above.

```json
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 512,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "type_vocab_size": 2,
  "vocab_size": 30522
}
```
Modified file: the `_handle_discover` handler now optionally reports each pod's GPU request alongside its IP. When the caller supplies a `gpu` query parameter, the response becomes a JSON list of `(pod_ip, gpu_count)` pairs instead of a plain list of pod IPs. (Indentation below is reconstructed; the added lines are marked with `+`.)

```diff
@@ -49,6 +49,7 @@ async def _handle_discover(self, request):
         group = request.match_info["group"]
         timeout = int(request.query.get("timeout", "30"))
         pod_ip_list = None
+        pod_gpu_list = None
         async with kubernetes.watch.Watch() as w:
             stream = w.stream(self._core_api.list_namespaced_pod, namespace,
                               label_selector="adaptdl/job={}".format(name),
@@ -62,6 +63,27 @@ async def _handle_discover(self, request):
                     if pod_ip_list is None:
                         pod_ip_list = [None] * replicas
                     pod_ip_list[rank] = pod.status.pod_ip
+                    try:
+                        gpu_request = request.rel_url.query["gpu"]
+                    except KeyError:
+                        gpu_request = False
+                    if gpu_request:
+                        if pod_gpu_list is None:
+                            pod_gpu_list = [None] * replicas
+                        container = pod.spec.containers
+                        assert len(container) == 1
+                        pod_gpu_list[rank] = \
+                            int(container[0].resources.requests[
+                                'nvidia.com/gpu'])
+                        if all(pod_gpu is not None
+                               for pod_gpu in pod_gpu_list) and \
+                                all(pod_ip is not None
+                                    for pod_ip in pod_ip_list):
+                            assert len(pod_ip_list) == len(pod_gpu_list)
+                            return_list = [(pod_ip_list[i], pod_gpu_list[i])
+                                           for i in range(len(pod_ip_list))]
+                            LOG.info(return_list)
+                            return web.json_response(return_list)
                     if all(pod_ip is not None for pod_ip in pod_ip_list):
                         return web.json_response(pod_ip_list)
         return web.json_response(status=408)  # Timeout.
```

Reviewer comment (Contributor): shall we unify the return values with L87?
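To make the new response shape concrete, here is a hedged client-side sketch of how a worker process could call this endpoint and consume the `(pod_ip, gpu_count)` pairs. The route shape `/discover/{namespace}/{name}/{group}` and the value passed for the `gpu` parameter are assumptions, not taken from this diff; only the `timeout` parameter, the 408 timeout status, and the two response formats follow from the code above.

```python
# Hypothetical client sketch, not part of this PR.
# Assumes the discover handler is served at /discover/{namespace}/{name}/{group}
# and that any non-empty "gpu" query value switches on the (ip, gpu) response.
import requests


def discover_pods(supervisor_url, namespace, name, group, want_gpus=True, timeout=30):
    url = f"{supervisor_url}/discover/{namespace}/{name}/{group}"  # assumed route
    params = {"timeout": str(timeout)}
    if want_gpus:
        params["gpu"] = "1"  # any non-empty value triggers the GPU-aware response
    resp = requests.get(url, params=params, timeout=timeout + 5)
    if resp.status_code == 408:
        raise TimeoutError("discovery timed out before all pods were ready")
    resp.raise_for_status()
    data = resp.json()
    if want_gpus:
        # e.g. [["10.0.0.12", 1], ["10.0.0.13", 1]]
        return [(ip, int(gpus)) for ip, gpus in data]
    return data  # plain list of pod IPs


# Example (hypothetical supervisor address and job name):
# replicas = discover_pods("http://adaptdl-supervisor:8080", "default", "integration-abcde", "0")
```

Presumably these `(pod_ip, gpu_count)` pairs are what the AutoDist side uses to construct its resource spec at runtime, which would explain why the second Dockerfile deletes autodist's static `examples/resource_spec.yml`.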