From a27ac73c1158d9d3262b9fcf9b05d200c5efb015 Mon Sep 17 00:00:00 2001 From: Dacheng Li Date: Mon, 30 Nov 2020 00:24:14 +0000 Subject: [PATCH 1/9] integration --- adaptdl/adaptdl/torch/__init__.py | 42 +++++++++- examples/integration/Dockerfile | 96 ++++++++++++++++++++++ examples/integration/adaptdljob.yaml | 25 ++++++ examples/integration/bert_config.json | 13 +++ examples/integration/tf_examples.tfrecord | Bin 0 -> 49090 bytes sched/adaptdl_sched/supervisor.py | 41 +++++++++ 6 files changed, 215 insertions(+), 2 deletions(-) create mode 100644 examples/integration/Dockerfile create mode 100644 examples/integration/adaptdljob.yaml create mode 100644 examples/integration/bert_config.json create mode 100644 examples/integration/tf_examples.tfrecord diff --git a/adaptdl/adaptdl/torch/__init__.py b/adaptdl/adaptdl/torch/__init__.py index d453d8ba..73d027e0 100644 --- a/adaptdl/adaptdl/torch/__init__.py +++ b/adaptdl/adaptdl/torch/__init__.py @@ -24,14 +24,14 @@ import portpicker import requests import torch.distributed - import adaptdl.collective import adaptdl.env from .epoch import current_epoch, finished_epochs, remaining_epochs_until from .data import current_dataloader, AdaptiveDataLoader, ElasticSampler from .parallel import AdaptiveDataParallel from .accumulator import Accumulator - +import os +import getpass logging.basicConfig(level=logging.INFO) LOG = logging.getLogger(__name__) LOG.setLevel(logging.INFO) @@ -66,6 +66,44 @@ def init_process_group(backend): LOG.info("torch.distributed initialized") +def write_config(): + url = adaptdl.env.supervisor_url() + if url: + key = adaptdl.env.job_id() + group = adaptdl.env.num_restarts() + while True: + response = requests.get(url=f"{url}/discover_gpu/{key}/{group}") + if response.status_code != 408: # Timeout. + break + response.raise_for_status() + master_addr = response.json()[0][0] + else: + raise ValueError("supervisor url not found.") + # write to the share path + path = os.path.join(adaptdl.env.share_path(), "resource_spec.yml") + LOG.info(f"writing to {path}") + + f = open(path, "w") + f.write("nodes: \n") + num_nodes = len(response.json()) + for i in range(num_nodes): + f.write(f" - address: {response.json()[i][0]} \n") + f.write(f" gpus: {list(range(response.json()[i][1]))} \n") + if i == 0: # chief + master_addr = response.json()[i][0] + f.write(" chief: true \n") + else: + f.write(" ssh_config: conf \n") + f.write("ssh: \n") + f.write(" conf: \n") + f.write(f" username: '{getpass.getuser()}' \n") + f.write(" key_file: '/root/.ssh/id_rsa' \n") + f.close() + # Initialize collective module. + master_port = adaptdl.env.master_port() + adaptdl.collective.initialize(master_addr, master_port) + + __all__ = [ "init_process_group", "current_epoch", diff --git a/examples/integration/Dockerfile b/examples/integration/Dockerfile new file mode 100644 index 00000000..ab4d4321 --- /dev/null +++ b/examples/integration/Dockerfile @@ -0,0 +1,96 @@ +# Copyright 2020 Petuum, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +FROM python:3.6.12-buster +WORKDIR /root + +FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime +WORKDIR /root + +FROM tensorflow/tensorflow:2.2.0-gpu + +# Set default shell to /bin/bash +# SHELL ["/bin/bash", "-cu"] + +# RUN rm -rf /etc/bash.bashrc + +# Install apps +COPY adaptdl adaptdl +COPY examples/requirements.txt . + +RUN cd adaptdl && python3 setup.py bdist_wheel + +ARG ADAPTDL_VERSION=0.0.0 +RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl +RUN pip install -r requirements.txt + +RUN rm -rf adaptdl/dist +WORKDIR /root +COPY examples examples_adaptdl +#COPY examples examples +#RUN apt-get update && apt-get install -y --no-install-recommends apt-utils + +# autodist env +SHELL ["/bin/bash", "-cu"] + +RUN rm -rf /etc/bash.bashrc + +RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ + build-essential \ + git \ + curl \ + vim \ + wget \ + unzip + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +WORKDIR /root +COPY bert_config.json bert_config.json +COPY tf_examples.tfrecord tf_examples.tfrecord +COPY autodist autodist +RUN cd autodist +RUN pip install tensorflow_hub +RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip +COPY autodist/protoc-3.11.0-linux-x86_64.zip protoc-3.11.0-linux-x86_64.zip +RUN unzip protoc-3.11.0-linux-x86_64.zip +RUN PROTOC=autodist/bin/protoc python autodist/setup.py build +WORKDIR autodist +RUN rm ./examples/resource_spec.yml +RUN pip install -e .[dev] + +# setup ssh +# Install OpenSSH to communicate between containers +RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \ + mkdir -p /var/run/sshd + +WORKDIR /root +RUN mkdir /root/.ssh +RUN ssh-keygen -f /root/.ssh/id_rsa && cat /root/.ssh/id_rsa.pub | cat >> /root/.ssh/authorized_keys +RUN chown -R root /root/.ssh +RUN chmod 700 /root/.ssh +RUN chmod 600 /root/.ssh/authorized_keys + +RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config +RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config + +# Allow OpenSSH to talk to containers without asking for confirmation +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ + mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +ENV PYTHONUNBUFFERED=true diff --git a/examples/integration/adaptdljob.yaml b/examples/integration/adaptdljob.yaml new file mode 100644 index 00000000..ed364fa9 --- /dev/null +++ b/examples/integration/adaptdljob.yaml @@ -0,0 +1,25 @@ +apiVersion: adaptdl.petuum.com/v1 +kind: AdaptDLJob +metadata: + generateName: integration- +spec: + minReplicas: 2 + template: + spec: + containers: + - name: main + command: + - python3 + - /root/autodist/examples/benchmark/bert.py + - -input_files=/root/tf_examples.tfrecord + - --bert_config_file=/root/bert_config.json + - --num_train_epochs=1 + - --num_steps_per_epoch=1000 + - --learning_rate=5e-5 + - --steps_per_loop=1 + - --autodist_strategy=PS + resources: + limits: + nvidia.com/gpu: 1 + + diff --git a/examples/integration/bert_config.json b/examples/integration/bert_config.json new file mode 100644 index 00000000..a7efa973 --- /dev/null +++ b/examples/integration/bert_config.json @@ -0,0 +1,13 @@ +{ + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 2, + "vocab_size": 30522 +} diff --git a/examples/integration/tf_examples.tfrecord b/examples/integration/tf_examples.tfrecord new file mode 100644 index 0000000000000000000000000000000000000000..49bad5ad29814410b34d075ea394f9f714d16691 GIT binary patch literal 49090 zcmeHw31HN8_J6-snlT*T%yfDMN?S@PwA|%d3d$jZAeRD)3V5|&5n(}ab=5@*SfE%r zE3(3Zt%yPw1qB5Q2nZ`|TUw45N(+_4a#dhDh;shkH%BMQWYU>l3;zG?(McwgWHRr4 z-}QN)*Wo@d;onQ)pI>Fy`>@^F`s1IUJTC5~F|UpvGxnv)mX>@J+dQvfCSK5VKhDI* za276(%EZf}v+xm|Y|5R6z8{*6ui_lMf%9U=vS!D}--sJaUY;`9GR*QgJH*Rt+C$a% zKQAx#L&Vmd97o2-O_(CwXd>cgAa)uOzfX1kKl11wJHwlOVC;m6uZy>J74bI2UP5t) z@dw-pev%mTtips*rqHIhiP7fw!%%SisOh|(1E3OdU z{@6?p%8J{Mk66fJZuV}JiT7CNceOETa5uUB@%zF`@E6=Ad>sy36PRX6H!ly}=)WJ6 z#343j8C>S-yI0`$=7hoBX5l+<7uop4+owaxC5~j`w3&$fiuYN|n7k)e_c`;{b?aiz z&cw4Rh-Jm6Kn21g|2iH$${IMB4Q8zq;@+GxW-`c4+=Lh7#=QFc3vsVbw$$P8V-aH8 zu)Yofp7__vW2cP$>x9XcK;ECV)?w<^Zx9&TIksEh(d6Z^<0roMx)*vNZTWd`QQE$G z)aMpkQm1l~>HJ-*Q-pb*F9NKpd#pebuviyQC;xYzgFVRYV z013&*#UV*Sq$~^DAaH|-5a^DWrOo6i)6{_M9r z%*M>)a=>P+^vephF|#@q4@(VLVVZAF@ZHerEc53u9?oj)(H?e|%6R;N_}>s)hT=#O zF2Ferzw1qMFqvyjHeCX3Jswto&-sDc(G~dsK8bIfK>z+4!7}{;mk^lFuNqm(tT5%^ zy-hOlPm7S90Y%590gJ=!46!qXQ^|s+nfO8w{d8WyKHkpkG_T7?8#rp`S9Q5!NsU;} z#S^$<#zY%)4xhq{!6@0nJ_|~HI}0y*rFfW)IU7-gcLmYQTzKX3rwGgx{Jvl%7L41= zm4bOowa(5*n}jpMUU+M}-@h8!nLECP_$b)JLOdGOUK@H|I(0=dAZdfSE9|dz*ch!EHQ__q#s~3^ z6!g!)6{eGxLU>SRjEzw!YL#6t#kneY>5+kRPy8#mqML8C`&0%l1M!;?y9veJ!X(+0 z-tP2=DDi#t@8;mzT*M%qyXT-B5cTQ9KpW!k_!3=-1Q&-2phY;IOKAP~LSOsAu*IZQEXP4gb6| zADodCz#h2c279uOp{4XO)o<23x|QM#5De&B%I`h}U-y{hW~*FWgtzxsf28&-rQ7;a zp{}UDLDe^^O~k5xE2=ZFKdAJ01LD6#Y&wc7#3|+r_#0Y;(+#U{ZLY z)GWtYqDq>QmbCuTg=P9Q*y5j}J>i z|5N1IaP~kQj~DRrYkB;_ZGnU{Yh}{oi+S<^_G{G-zY6iGh+To=a)U`p2BIN`{NsYp zLt_jg{m1WN%{wA9^WAk^&cgi^%FeC+jA>^~Y1$_~-Vy)6oeG z7qQ0>5>HOz`Ih;Oui-p6*M%U6nIcy%gG+DsyO0LyAbMAy|Lvbo6yXGJZ`ib7(8t{8 zt>}eU_qmM=+GgX6|3Z7)+8JePhZtmbq&yyxlRJM(@7t%IuU|mZh}P|5I&|&+V2`0M zPk77AMZPm17vc*kDA`mTxD1kDnGjpeF=yiC-2N~-^EG!8@?=LK5n7ue>j$=m)BhqV z(}E{wKf-Qum8?xR754aL8oFsdf>V4?;B0(;q>VXmF2d)row+Wg_^xm#-mcA?uQ9cE zUKBi2e0H`A5FsaoTnGQ=R6rTCxXJaSXv-8kQ;drn=fInpc=6tRGflVWtwP@cpw*QEyM!qz` zYe>X`PIV+EMM0^fnvy@gTAAZqtvs8!`c8S0OMzd{# zWgR~o{TF6CQCq2lcsIDbnDfBkbGvglh%8R;KSLl&5dt`6bJ4Jhf>Rsq(E>2 z>@m5)MX*wrf3C8Ds$RWKKB@lSc9xPi2S}>%kC9P3*q|{dh@W9 zzvrzeW{@2VkyOke(@e#mqq6$;`p(4+NTsXF8S6%6<22LeQH7A>q1gkvq;Fb;XpL=0 z%iV6-l zZoT@xu{|H|+yAir!`Gzk;&WMxkP9V=iNRNH%vH ze$=v5bOxS&ZF)ct-?xy{#zNJp*m}w?@2hOFymE@^(WhL6UD@~+o*9#e)7#|*CG~(@ z?cw$45|pb~#mo%7)N#%l2!-}oKY|GCUyU|!`S`+mMCO906i*VDBI*;B=70^O5fmY= z;-`~X6F-4o5;1xrV(dhmtKvj-wBV$2QC#6@!3k(xT;*sHb!D8Kg}TdGX3gM}c-Ode z@X#TaJcG~Se9-GZ0{@hX`#=yM92U{6OV@5a9v<}gDDu}yP%G4jnXE8n<9)Hormr~~ z_#co}T`3RbajD(J19oj$dpWT!&Aw|bEM$j2~cZ1r!{E~aCbhaVd> zc*IkqMcc=A(XJhld;}Z*kyS_sg-LHM0$;j(DWXDbhmvMqz;4;ur;Ts{*gHH7troc<0ivfOH%hrwV~TuiFKOX+uUDUEx1b-**gjXrz&m*=(S)PD3gogb(QbhLe0iO`+r#oPUeECt*=u!-FJYWeH#M;GO`Erj>E1)W zVQ|az3;m_cOu)zNi%l>sh|I=?Jr-FmD^ly#qJfTkFnU!v4RlY(jILYZdo?J76<*uu z0QR72W!R1Qorv9z;?y>TD72p_yjh~KAXNJ`76jbO0-S0lq5&b)e%y$m%gjj&SRdJf zltNUv(;sZe=6pnr$kx&-S2xE+#%}2(*^uE(Y=8D^ABeP-OW-xc+Yx&Oa7qqjgpldf z7+KedR5HIS5%H*5c;BSmZ=RsO9xPu%wO6&A(wLkYQ!+D#$l-zFfvVzzgdRi+4;(g` zT5w4GN-rYNR_h6~DXtYnJBG)zqf^ll=E!+c7bc7Wses~z(# zV@LcI#Qq8hX22a%M5!98NJOL}0p(WW6k$_^Qlt=61x_)+mjV?xv;u6G3{-@CoC2jt zWRPM!$-|I(qoDr0hSy*rrm%iEyVxlq1N*_p7jgZ^03)pg_Ge)St{*-n!}^H>)t}Hv zgX*V!7x7imZKfV)4hn7^)1hN*&)$y>e|f?bm%%uNdQy-lz&cX{{?$kXYaYdG%!h@f z=%-MnqOBmcquXMHkeGNh_901y23-B|H#Wl+Foe?DxoI-8AxH1 zD@+1zl1v|V(qd=6<$uQOxzhGmpfhMuL>9kg7$NVOAYliA^sn6U=xgxUPg@G`mSJKt zZ)YN+r#J-ZoNRpVpJ-bq%HjZ>xpKfUI9q%HzCIz)xQJRjUr+~ z9!~0%kB5EAbUewNNF2#{n)(%T@dQ%wHb#|;CztUxnh}O%JQTPTx%hiD3uhdDV7^gs zXp>f*`VJWW?C6)rPkL?2n_k5)6;6wm)ajk*DZj9v`_#Chb_j5dD1_tm%2ySthw@;r zyU6&d@9UN-xVVJ)i-Y@sYF7=P# z(cQ63OGJ5n=!^eCoRP?t;QXfK7f_W_$XaFMr2%A1|Gj*EXcpe;6!W#|8}_@$|FL!INdX zYV%+%_((S5XIQJCAp);3rT8x4wnZ%kC`PLJP6$OGQp~;3anrZ3Q3tIhsXfNAr#=GgFb`a4GDY(=rp-@LbS*; zp>`54R!7M0H;$UWA1Z@TjEW=BnMk4aVsV6AdPb~`P)|)RjsWn+&Lo?zC8GT_k($xO zM)Q))+8~gq0=#V#tnb^CP||Z-ny2$=e-+<6iGM$g)^V2s&UGC=OO~4cl#4s`B?dKa z+NwjxP7n7V`uM09$=H8*u|2fX-G*<)MpJ+Y)W@;_KnSowD6U9Qp7? zn0Y-RYO8TuAlE~@ytd}{EAXTTfVBoR42_IZFOTiDUpR~pVS^#hN}u6PcC9vN<&?56ufZy_R{LT}yjW@>v^hpw zX`x9i&u-4GqpQVWyLBbc4ywRl^&31g`l{RmU@o}G+Ul?!v@#sdhp{2FyY@ocMC%&x zns)_dVIf*|+)K46s^fGp1KQ$PMM19?nbv*c6+cU(BPx(- z!z#;t_xqS|qh@V7c5+-H|LeMzvyu{^xKD<#XR4`x-f@X)d&NRBUs3n_2J!0=yH4z% zCz|$oK%CJ6Dy`EIp?RK&t@Gt&Izn6L=~_v|MnY+=BsI`UsCod^B3P<3R?kM21goyu z%7-*l-3Y7i4jjany8l)mm<)itAPf3A)t{+7cfgtyeWt4vdz*HT9LRp_3eHa>xorF^ zM zqYA-pApUp6UPW=r+*t|4oiuWXWOF8DEz=%#PM!%!I7k&cGG{byzp@I^+*x`%lGFp< zykjBSpgqM`jM`~9OXK!3fN_@tR?Bl_0$x=J?g0R+1gwflGM9)|0)SNkQ%PjVDlBQU zt!J%(k3>#DA|LITY5a4dJU`H9V=9T}iNg&s3vPxp_ApRy||3GoH zDTWH@4Q}s0XiE&C9Wk`yh(27Q>j>InjsP2lx-SMk0cokNqo5z={7C4FS+tTOupqlD zcgBE!cHT_XjWM)iN!c0`U)q8;-KCh`hwRkBBdCNaW%l_VfP@q|ILXGGm{Nd`Jp?Bz zgZ;@SdTBj@&BBI!)aRN zfB+MQH>fx=e&_ zo?W0%&?X2PCD69~G!s#27nKK2`M#AD`O9iX6k0~tOSPK8RtqAkSr+7pb@U9XEV7#2 ztMo3$z=?X6;3mylLVIcKBZG%N`BdE4H>Y}S+8iAlE9p~cOv5tvyPB`DIPz+DucPK7 zCvAt880XH*(!WWamxVoGd3RWug?o|O%!uumgxVR}_6mju93rqi`#Qi1FScKvhmxQx zGZQbG79vZ$u#l-t3uFR7QQ!bU{L2lCDDl)Tf?a`2N@hhCc(~~@MrBusXjal8p>y+I z?bk7O)#`DUz;N)>9v<@e@DVS(82?vL1{eHb2U;eZ$^b7-9k4>YKdi`~E-%!6Q(HaP zO3m_3?Db>!;kUJ>sj>l^tRn9Ig=$fy$~!L1L3QrFYdw?sDw8+eo45znAyBll@6HEWrc8VXY8kL%bfhYF7#FuF#Tx)F zn~igWmRT=j3L$`$%5rm&B?o`$cV)~fC^Hq{1(9pagc5QRC3J^k3)I*h2Px;i?}2^< zLNk$*Q-tmiB_k)12|0{(&K%Mywp*`0Pd__qbR2ng60jHO^X--3 zGO^DQ2-UvyTLhVdPq||f^ymyYy$d@+xW;(xUG~ur?IqLH_|Y?c*e`1kkmf4#cT*GY zS|N79ye~RtMcW`tkplkSS4)9NE}M(5024>K z1f4-~<%vC-$EB$~CrA!~+jotLyKC3JDx@BK80;4k7Tr3g+e1Bi4trt3M6hO4c0OI8 zO^M%i{+*t{*6hB!_UCs&os)_8_Rqq3O_B!Mn8l{GU!em-r*li8dp2GQ$tc)TO#Km} zjLC?^hyz@=>b##wi355qG2##9dqZ?%Ac3)^tIlzQ?*zBF1({OW?JDX9VXd z1%sQ7XNA?SQLP5MR5J>B*D9imRJ|ISEm|P~n+?5Fy4Y-5UecN&2QU=;MH!k6vjs$C zHle4|Axa_6JShOorZg$L<^$^;(4=UqPUNVcTy-K@-9%_MuratOI>B0y5g{>oXqi%~ zq>Xhk7uBXir_Nn^_Uk`r@W?SQO?usHb)(Y-O{F-gOT+yq*-wnw2E2jemT%f3y7$9< zhCcq}^MYsOf^8r#ARAO~kD2yK9fT1Y5|KWTl^!=sbOY4_?$JwgXg#*drZ^nusfyY; z&Z6@Qh!~Z?i}Q#tLF{Q1cK|r4lpXdEM2YQujt+adL(-t14=fX%fUvslTwkIXfFR~1 z+~N!ZdQyrplDSEh#Ixkfwld0jV{6Jgt3^2zW1!IgaM98EAsoAmV?OlgtQlzqa`SZO zXCl~qO6sf`Ez_89!~*L<;namdwM20*j?>dum6Ck*Dcs%i*ydA5cu777vWVh8|7ELR59&Wp;LVlMA;jNnF^21)RpebuD+KRqdY7uMt)Ul_I zHTCL&B~anj6M^WGP?}4nR}X1d(W#H8zBv(#_==$Zj>);esHo$+C@|`K=x6!~C4RY* zFGlud;NGVEIhz4sESAN;NqA(7Htl13Kla3k7bd>u#Xh86dt37@*{CuoW)=200PLj# zRp~w;F5Nx|SV^xyo8PgA(gD8>SrZ$8%qq7Q>vITj6_E zU9}BvvBGN`9pE0ZhN4M8%Rv5g-qJgRY9D(+<$e_* zz5ucLa_bzSopS=?ZiT&b3g&i2^PEK74pFqs+b(v`DQVlqiF{-u4CHfzsP~p{2A)J z_#ge6I(MSp#V%Ii0usG>7??TDJ5Jf@30;U3XG0r3?J;tvCz;o?ywj5y_IcWwr6xN= z+x~!6a|qeG8=ZyHOF^@>uLIlq&1Z@*p`+B6I#+BP$)*CBkU%*%=u?Loa+Hr{@#xQJt6D%QGvBWQ9pN^G|Gv6NdgN@?3Y> z%H;TM3{}uLHBcA(H;rr+)3tBEAx{j425cA&>ow79TC^6h<~EAVQwgi$@psv8QvPjR zkr%A|oPze{);XGcP|&%OcnYTwIV@Z@!w|U_MonTt4q>^_0c5RWp`)mgk`x|TVQKQi zkQ1ILDdFMnBtm$2MLB5XN(bd3qNNUS03D?cA`rp@8J>a2 z$F4meA36FJV1R)_V^3{fpe^5;ZAse_f!NK?K%dzX4ro0J@STuJ+pj*(soP)waU(yQW~=Sbh1if`=n-EXRA^$Td; zw*6mv4jKZiQlrQJ&FhVQy|*2dDxa13C1l zwb~DRwq4NI{%`k25nhHF1__XgIc6#IyNKtF{)3||r57#zt(m5t^^nZ4Kn= zgYY)+e^Y$Fx~+kbg&k?D4Q;j)J8j@dm|YQE4CPF*(*`O!&P3x743|7as;?|sAvNn|@oJKqh z>O4*-z>viI>3sU~i3P&qCJd;DWlAI{C*%|)$0@z`XF>2bO{x+HKl!~>eIHy5EW12}$hFu84XQXqx) zAI0f@ln3-{u%xs$y)FE%^}*Z|km@;#U7(8CXL$3|yxCj&T1 zah}Nw{>rV8@|nCMm&g9~XjUtDL#Fbe)8Z+Yhg)NLVOA`mb9qH759mgLMrR3;0As`i zGGBltJE(j~`5bwmV-}~#J7W>(wPi4ZFhu?yWxg?1dp4*^%T}$s_Z;x}6Hh@m^-E)4 znc(%YE&PPm{`^I2$(r-c*rurw857UpQ=mp?BNBZ7)##4zQTzq81kn-%tfaNQ6jYNR z=e)+~?$Iim?m^S}E2lbcbUk2&cEm%RD#nhE2`e-a0cR1T zN5NNH7OfXO0yTxEHy-A``e)%y(x|HV_ZA`OUF%Z=`~ zxQrgYzNzDRDFN`+qdvFTk~(ZVt<4GL8?%jO#b#rQNX>zVdxSnd#lI@Kor?3g@v^E< z(gW+^#@#3mVRx(6^luUW4TAJGw1|~YVI}I3Z;l=@no2rpp&eobV7_$V8xbbxDPX=t znGn~ua2c<>iko!fWi{JOH?HX3A;$K@caVPlRI@_p{SZb^*5!LuQHc8(zko(9VgQNLr~LeU@SI1rGtKo! zlow9jH7?En9Mp*mz?yDpPVtux(F*6?G*;KQ??410q9ww->Gi(@bWNXqhEDT6g7@)| zBgw>9fMYRZ;%7lnsz^5NWYL!%dtsW-d$8$=QPRq~2bb~uP*)H$VJ?$_BLom*N61ZxB6RufIgb}-4{y5) zhSb9=d3Lz5jWN#Q-e(HpLp7H>n$osS7s-!WdM?|G-RM)5OCWC_(*yN5J|gzvM@K$8 z>V-F`EzzGFzj;(41d>7rMRAKPym|_y89Uef45Yz}V4%bNe!0N&x$2%WC7NgJ-lkR+ z^|oD^5+XqhAn2wqceDfpLT4e9?_UH4H%%h}ac1074MUbGx0H=2LXXE)puH3u2_Rb0 zaBh(+R@$;Q+qyX!DQCSnHxiH;9pVWA;^xEPCY*g&xYLS>n~!3$+zd8^au`#OtO|Om z_h74SA6_+nmbTB!6Evna$o6Er>a1PvNvd#^XHSfMU*!YcM*Ix~OkQzN11A71S533f zmcVDSv;fXA8+I zs6k}{2?CvFOA@%fVbG5sukO9S`8SG^*ys;_O(N9gm(lyU0!x=PHC2DMEwontiuYN| zn7k)e_c`;H)P8@rN^ez`eDC{x{rTXKb}_L4Y8*IZ?xN1Dk!ArW>m|Wh*Bn3js z#0tC#Cn4t|%oHxd@Z{4h`|;Ix(`F*C^soN;*ibUx zaL4ETsQ#q$S)q9WIRY9*Kpw0x?FT&9Q2+x1ZR^OS|8Uwd<$nUp(|3XUfA_V6;04@nur!`-?2rhXXlf!^P^+W`4QSYY0sq5Rqi9B^( zB95V#0g}{$w(w_fTRWq+U1=7D?IO!K0E6iouDjN+s)}V|Obr5Cwd>OT!2tt@J~L+g z#MdT+WzuhI%*2};r~75$r7-p$Fm-e;Ru%?;+(`h!0R(P86j?ICGF{=k#%GfuTF}eA zqE2qAMr96TT>yV*C@1;+L>$&aEA1ix2pEE$C$$I;zY+oze;5-AACb{N85ZR<3_4YX z;(tgfmkW4m0$n5X;IBNQ<-M;EIb(IqXh~Rrl<*&ND2k$g;B$`N5wa|rs9Q&hUpxZP z6#@GLRATeH+L$!B9|!d3msoi+M&`Ym>oT@V(6_=hYSyJ&_dbsd95!;y^J6Fc-PM4O zv+vTF?l9!SsEMS*Noea1%&zsNke~!GOyH_BUjJzN33pGAM5~OBRQ#E{*1dBvsx7^p zgJc7EBEYuXvHTYhN{L(v%>Sb!XCb_z6K4q>IIAYv07?m<*&wOr+3zFBDx2b+$)AKK zb0z(gq6P~S+>#L4(wr7yY4sam%)cK->j17P7h#jS(iUh4Gqvm{QW1_sA%_7_P>Nj-QlcQfG6JG-dPk)3Ho#0E!nXXT`6S5#O e*_8BL8E{(VV@lcdU2i(M^Z-uhz1HS<_x?Yt#L&k8 literal 0 HcmV?d00001 diff --git a/sched/adaptdl_sched/supervisor.py b/sched/adaptdl_sched/supervisor.py index af88d889..24d0ed07 100644 --- a/sched/adaptdl_sched/supervisor.py +++ b/sched/adaptdl_sched/supervisor.py @@ -66,6 +66,45 @@ async def _handle_discover(self, request): return web.json_response(pod_ip_list) return web.json_response(status=408) # Timeout. + async def _handle_discover_gpu(self, request): + # Long-polling endpoint used for discoverin + # pod IPs and GPU for a given job. + namespace = request.match_info["namespace"] + name = request.match_info["name"] + group = request.match_info["group"] + timeout = int(request.query.get("timeout", "30")) + pod_ip_list = None + pod_gpu_list = None + async with kubernetes.watch.Watch() as w: + stream = w.stream(self._core_api.list_namespaced_pod, namespace, + label_selector="adaptdl/job={}".format(name), + field_selector="status.podIP!=", + timeout_seconds=timeout) + async for event in stream: + pod = event["object"] + replicas = int(pod.metadata.annotations["adaptdl/replicas"]) + rank = int(pod.metadata.annotations["adaptdl/rank"]) + if pod.metadata.annotations["adaptdl/group"] == group: + if pod_ip_list is None: + pod_ip_list = [None] * replicas + pod_ip_list[rank] = pod.status.pod_ip + if pod_gpu_list is None: + pod_gpu_list = [None] * replicas + container = pod.spec.containers + assert len(container) == 1 + pod_gpu_list[rank] = \ + int(container[0].resources.requests[ + 'nvidia.com/gpu']) + if all(pod_gpu is not None for pod_gpu in pod_gpu_list)\ + and all(pod_ip is not None + for pod_ip in pod_ip_list): + assert len(pod_ip_list) == len(pod_gpu_list) + return_list = [(pod_ip_list[i], pod_gpu_list[i]) + for i in range(len(pod_ip_list))] + LOG.info(return_list) + return web.json_response(return_list) + return web.json_response(status=408) # Timeout. + async def _handle_report(self, request): namespace = request.match_info['namespace'] name = request.match_info['name'] @@ -85,6 +124,8 @@ def run(self): web.get('/healthz', self._handle_healthz), web.get('/discover/{namespace}/{name}/{group}', self._handle_discover), + web.get('/discover_gpu/{namespace}/{name}/{group}', + self._handle_discover_gpu), web.put('/hints/{namespace}/{name}', self._handle_report), ]) LOG.info("%s %s", self._host, self._port) From 17f8f4989278572217844d98f0c3402c28c7c7f3 Mon Sep 17 00:00:00 2001 From: Dacheng Li Date: Mon, 30 Nov 2020 03:22:21 +0000 Subject: [PATCH 2/9] move write_config to autodist --- adaptdl/adaptdl/torch/__init__.py | 40 ------------------------------- 1 file changed, 40 deletions(-) diff --git a/adaptdl/adaptdl/torch/__init__.py b/adaptdl/adaptdl/torch/__init__.py index 73d027e0..dee89537 100644 --- a/adaptdl/adaptdl/torch/__init__.py +++ b/adaptdl/adaptdl/torch/__init__.py @@ -30,8 +30,6 @@ from .data import current_dataloader, AdaptiveDataLoader, ElasticSampler from .parallel import AdaptiveDataParallel from .accumulator import Accumulator -import os -import getpass logging.basicConfig(level=logging.INFO) LOG = logging.getLogger(__name__) LOG.setLevel(logging.INFO) @@ -66,44 +64,6 @@ def init_process_group(backend): LOG.info("torch.distributed initialized") -def write_config(): - url = adaptdl.env.supervisor_url() - if url: - key = adaptdl.env.job_id() - group = adaptdl.env.num_restarts() - while True: - response = requests.get(url=f"{url}/discover_gpu/{key}/{group}") - if response.status_code != 408: # Timeout. - break - response.raise_for_status() - master_addr = response.json()[0][0] - else: - raise ValueError("supervisor url not found.") - # write to the share path - path = os.path.join(adaptdl.env.share_path(), "resource_spec.yml") - LOG.info(f"writing to {path}") - - f = open(path, "w") - f.write("nodes: \n") - num_nodes = len(response.json()) - for i in range(num_nodes): - f.write(f" - address: {response.json()[i][0]} \n") - f.write(f" gpus: {list(range(response.json()[i][1]))} \n") - if i == 0: # chief - master_addr = response.json()[i][0] - f.write(" chief: true \n") - else: - f.write(" ssh_config: conf \n") - f.write("ssh: \n") - f.write(" conf: \n") - f.write(f" username: '{getpass.getuser()}' \n") - f.write(" key_file: '/root/.ssh/id_rsa' \n") - f.close() - # Initialize collective module. - master_port = adaptdl.env.master_port() - adaptdl.collective.initialize(master_addr, master_port) - - __all__ = [ "init_process_group", "current_epoch", From 95c77f80aa0a39c4252c842cd7a6bf30ba1b0509 Mon Sep 17 00:00:00 2001 From: Dacheng Li Date: Mon, 30 Nov 2020 03:59:18 +0000 Subject: [PATCH 3/9] --- adaptdl/adaptdl/torch/__init__.py | 2 + examples/autodist/Dockerfile | 93 +++++++++++++++++++++++++ examples/autodist/adaptdljob.yaml | 25 +++++++ examples/autodist/bert_config.json | 13 ++++ examples/autodist/tf_examples.tfrecord | Bin 0 -> 49090 bytes 5 files changed, 133 insertions(+) create mode 100644 examples/autodist/Dockerfile create mode 100644 examples/autodist/adaptdljob.yaml create mode 100644 examples/autodist/bert_config.json create mode 100644 examples/autodist/tf_examples.tfrecord diff --git a/adaptdl/adaptdl/torch/__init__.py b/adaptdl/adaptdl/torch/__init__.py index dee89537..d453d8ba 100644 --- a/adaptdl/adaptdl/torch/__init__.py +++ b/adaptdl/adaptdl/torch/__init__.py @@ -24,12 +24,14 @@ import portpicker import requests import torch.distributed + import adaptdl.collective import adaptdl.env from .epoch import current_epoch, finished_epochs, remaining_epochs_until from .data import current_dataloader, AdaptiveDataLoader, ElasticSampler from .parallel import AdaptiveDataParallel from .accumulator import Accumulator + logging.basicConfig(level=logging.INFO) LOG = logging.getLogger(__name__) LOG.setLevel(logging.INFO) diff --git a/examples/autodist/Dockerfile b/examples/autodist/Dockerfile new file mode 100644 index 00000000..a7cb9c28 --- /dev/null +++ b/examples/autodist/Dockerfile @@ -0,0 +1,93 @@ +# Copyright 2020 Petuum, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +FROM python:3.6.12-buster +WORKDIR /root + +FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime + +FROM tensorflow/tensorflow:2.2.0-gpu + +# Set default shell to /bin/bash +# SHELL ["/bin/bash", "-cu"] + +# RUN rm -rf /etc/bash.bashrc + +# Install apps +COPY adaptdl adaptdl +COPY examples/requirements.txt . + +RUN cd adaptdl && python3 setup.py bdist_wheel + +ARG ADAPTDL_VERSION=0.0.0 +RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl +RUN pip install -r requirements.txt + +RUN rm -rf adaptdl/dist +COPY examples examples_adaptdl +#COPY examples examples +#RUN apt-get update && apt-get install -y --no-install-recommends apt-utils + +# autodist env +SHELL ["/bin/bash", "-cu"] + +RUN rm -rf /etc/bash.bashrc + +RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ + build-essential \ + git \ + curl \ + vim \ + wget \ + unzip +WORKDIR /root +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +COPY bert_config.json bert_config.json +COPY tf_examples.tfrecord tf_examples.tfrecord +COPY autodist autodist +RUN cd autodist +RUN pip install tensorflow_hub +RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip +COPY autodist/protoc-3.11.0-linux-x86_64.zip protoc-3.11.0-linux-x86_64.zip +RUN unzip protoc-3.11.0-linux-x86_64.zip +RUN PROTOC=autodist/bin/protoc python autodist/setup.py build +WORKDIR autodist +RUN rm ./examples/resource_spec.yml +RUN pip install -e .[dev] + +# setup ssh +# Install OpenSSH to communicate between containers +RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \ + mkdir -p /var/run/sshd + +WORKDIR /root +RUN mkdir /root/.ssh +RUN ssh-keygen -f /root/.ssh/id_rsa && cat /root/.ssh/id_rsa.pub | cat >> /root/.ssh/authorized_keys +RUN chown -R root /root/.ssh +RUN chmod 700 /root/.ssh +RUN chmod 600 /root/.ssh/authorized_keys + +RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config +RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config + +# Allow OpenSSH to talk to containers without asking for confirmation +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ + mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +ENV PYTHONUNBUFFERED=true diff --git a/examples/autodist/adaptdljob.yaml b/examples/autodist/adaptdljob.yaml new file mode 100644 index 00000000..ed364fa9 --- /dev/null +++ b/examples/autodist/adaptdljob.yaml @@ -0,0 +1,25 @@ +apiVersion: adaptdl.petuum.com/v1 +kind: AdaptDLJob +metadata: + generateName: integration- +spec: + minReplicas: 2 + template: + spec: + containers: + - name: main + command: + - python3 + - /root/autodist/examples/benchmark/bert.py + - -input_files=/root/tf_examples.tfrecord + - --bert_config_file=/root/bert_config.json + - --num_train_epochs=1 + - --num_steps_per_epoch=1000 + - --learning_rate=5e-5 + - --steps_per_loop=1 + - --autodist_strategy=PS + resources: + limits: + nvidia.com/gpu: 1 + + diff --git a/examples/autodist/bert_config.json b/examples/autodist/bert_config.json new file mode 100644 index 00000000..a7efa973 --- /dev/null +++ b/examples/autodist/bert_config.json @@ -0,0 +1,13 @@ +{ + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 2, + "vocab_size": 30522 +} diff --git a/examples/autodist/tf_examples.tfrecord b/examples/autodist/tf_examples.tfrecord new file mode 100644 index 0000000000000000000000000000000000000000..49bad5ad29814410b34d075ea394f9f714d16691 GIT binary patch literal 49090 zcmeHw31HN8_J6-snlT*T%yfDMN?S@PwA|%d3d$jZAeRD)3V5|&5n(}ab=5@*SfE%r zE3(3Zt%yPw1qB5Q2nZ`|TUw45N(+_4a#dhDh;shkH%BMQWYU>l3;zG?(McwgWHRr4 z-}QN)*Wo@d;onQ)pI>Fy`>@^F`s1IUJTC5~F|UpvGxnv)mX>@J+dQvfCSK5VKhDI* za276(%EZf}v+xm|Y|5R6z8{*6ui_lMf%9U=vS!D}--sJaUY;`9GR*QgJH*Rt+C$a% zKQAx#L&Vmd97o2-O_(CwXd>cgAa)uOzfX1kKl11wJHwlOVC;m6uZy>J74bI2UP5t) z@dw-pev%mTtips*rqHIhiP7fw!%%SisOh|(1E3OdU z{@6?p%8J{Mk66fJZuV}JiT7CNceOETa5uUB@%zF`@E6=Ad>sy36PRX6H!ly}=)WJ6 z#343j8C>S-yI0`$=7hoBX5l+<7uop4+owaxC5~j`w3&$fiuYN|n7k)e_c`;{b?aiz z&cw4Rh-Jm6Kn21g|2iH$${IMB4Q8zq;@+GxW-`c4+=Lh7#=QFc3vsVbw$$P8V-aH8 zu)Yofp7__vW2cP$>x9XcK;ECV)?w<^Zx9&TIksEh(d6Z^<0roMx)*vNZTWd`QQE$G z)aMpkQm1l~>HJ-*Q-pb*F9NKpd#pebuviyQC;xYzgFVRYV z013&*#UV*Sq$~^DAaH|-5a^DWrOo6i)6{_M9r z%*M>)a=>P+^vephF|#@q4@(VLVVZAF@ZHerEc53u9?oj)(H?e|%6R;N_}>s)hT=#O zF2Ferzw1qMFqvyjHeCX3Jswto&-sDc(G~dsK8bIfK>z+4!7}{;mk^lFuNqm(tT5%^ zy-hOlPm7S90Y%590gJ=!46!qXQ^|s+nfO8w{d8WyKHkpkG_T7?8#rp`S9Q5!NsU;} z#S^$<#zY%)4xhq{!6@0nJ_|~HI}0y*rFfW)IU7-gcLmYQTzKX3rwGgx{Jvl%7L41= zm4bOowa(5*n}jpMUU+M}-@h8!nLECP_$b)JLOdGOUK@H|I(0=dAZdfSE9|dz*ch!EHQ__q#s~3^ z6!g!)6{eGxLU>SRjEzw!YL#6t#kneY>5+kRPy8#mqML8C`&0%l1M!;?y9veJ!X(+0 z-tP2=DDi#t@8;mzT*M%qyXT-B5cTQ9KpW!k_!3=-1Q&-2phY;IOKAP~LSOsAu*IZQEXP4gb6| zADodCz#h2c279uOp{4XO)o<23x|QM#5De&B%I`h}U-y{hW~*FWgtzxsf28&-rQ7;a zp{}UDLDe^^O~k5xE2=ZFKdAJ01LD6#Y&wc7#3|+r_#0Y;(+#U{ZLY z)GWtYqDq>QmbCuTg=P9Q*y5j}J>i z|5N1IaP~kQj~DRrYkB;_ZGnU{Yh}{oi+S<^_G{G-zY6iGh+To=a)U`p2BIN`{NsYp zLt_jg{m1WN%{wA9^WAk^&cgi^%FeC+jA>^~Y1$_~-Vy)6oeG z7qQ0>5>HOz`Ih;Oui-p6*M%U6nIcy%gG+DsyO0LyAbMAy|Lvbo6yXGJZ`ib7(8t{8 zt>}eU_qmM=+GgX6|3Z7)+8JePhZtmbq&yyxlRJM(@7t%IuU|mZh}P|5I&|&+V2`0M zPk77AMZPm17vc*kDA`mTxD1kDnGjpeF=yiC-2N~-^EG!8@?=LK5n7ue>j$=m)BhqV z(}E{wKf-Qum8?xR754aL8oFsdf>V4?;B0(;q>VXmF2d)row+Wg_^xm#-mcA?uQ9cE zUKBi2e0H`A5FsaoTnGQ=R6rTCxXJaSXv-8kQ;drn=fInpc=6tRGflVWtwP@cpw*QEyM!qz` zYe>X`PIV+EMM0^fnvy@gTAAZqtvs8!`c8S0OMzd{# zWgR~o{TF6CQCq2lcsIDbnDfBkbGvglh%8R;KSLl&5dt`6bJ4Jhf>Rsq(E>2 z>@m5)MX*wrf3C8Ds$RWKKB@lSc9xPi2S}>%kC9P3*q|{dh@W9 zzvrzeW{@2VkyOke(@e#mqq6$;`p(4+NTsXF8S6%6<22LeQH7A>q1gkvq;Fb;XpL=0 z%iV6-l zZoT@xu{|H|+yAir!`Gzk;&WMxkP9V=iNRNH%vH ze$=v5bOxS&ZF)ct-?xy{#zNJp*m}w?@2hOFymE@^(WhL6UD@~+o*9#e)7#|*CG~(@ z?cw$45|pb~#mo%7)N#%l2!-}oKY|GCUyU|!`S`+mMCO906i*VDBI*;B=70^O5fmY= z;-`~X6F-4o5;1xrV(dhmtKvj-wBV$2QC#6@!3k(xT;*sHb!D8Kg}TdGX3gM}c-Ode z@X#TaJcG~Se9-GZ0{@hX`#=yM92U{6OV@5a9v<}gDDu}yP%G4jnXE8n<9)Hormr~~ z_#co}T`3RbajD(J19oj$dpWT!&Aw|bEM$j2~cZ1r!{E~aCbhaVd> zc*IkqMcc=A(XJhld;}Z*kyS_sg-LHM0$;j(DWXDbhmvMqz;4;ur;Ts{*gHH7troc<0ivfOH%hrwV~TuiFKOX+uUDUEx1b-**gjXrz&m*=(S)PD3gogb(QbhLe0iO`+r#oPUeECt*=u!-FJYWeH#M;GO`Erj>E1)W zVQ|az3;m_cOu)zNi%l>sh|I=?Jr-FmD^ly#qJfTkFnU!v4RlY(jILYZdo?J76<*uu z0QR72W!R1Qorv9z;?y>TD72p_yjh~KAXNJ`76jbO0-S0lq5&b)e%y$m%gjj&SRdJf zltNUv(;sZe=6pnr$kx&-S2xE+#%}2(*^uE(Y=8D^ABeP-OW-xc+Yx&Oa7qqjgpldf z7+KedR5HIS5%H*5c;BSmZ=RsO9xPu%wO6&A(wLkYQ!+D#$l-zFfvVzzgdRi+4;(g` zT5w4GN-rYNR_h6~DXtYnJBG)zqf^ll=E!+c7bc7Wses~z(# zV@LcI#Qq8hX22a%M5!98NJOL}0p(WW6k$_^Qlt=61x_)+mjV?xv;u6G3{-@CoC2jt zWRPM!$-|I(qoDr0hSy*rrm%iEyVxlq1N*_p7jgZ^03)pg_Ge)St{*-n!}^H>)t}Hv zgX*V!7x7imZKfV)4hn7^)1hN*&)$y>e|f?bm%%uNdQy-lz&cX{{?$kXYaYdG%!h@f z=%-MnqOBmcquXMHkeGNh_901y23-B|H#Wl+Foe?DxoI-8AxH1 zD@+1zl1v|V(qd=6<$uQOxzhGmpfhMuL>9kg7$NVOAYliA^sn6U=xgxUPg@G`mSJKt zZ)YN+r#J-ZoNRpVpJ-bq%HjZ>xpKfUI9q%HzCIz)xQJRjUr+~ z9!~0%kB5EAbUewNNF2#{n)(%T@dQ%wHb#|;CztUxnh}O%JQTPTx%hiD3uhdDV7^gs zXp>f*`VJWW?C6)rPkL?2n_k5)6;6wm)ajk*DZj9v`_#Chb_j5dD1_tm%2ySthw@;r zyU6&d@9UN-xVVJ)i-Y@sYF7=P# z(cQ63OGJ5n=!^eCoRP?t;QXfK7f_W_$XaFMr2%A1|Gj*EXcpe;6!W#|8}_@$|FL!INdX zYV%+%_((S5XIQJCAp);3rT8x4wnZ%kC`PLJP6$OGQp~;3anrZ3Q3tIhsXfNAr#=GgFb`a4GDY(=rp-@LbS*; zp>`54R!7M0H;$UWA1Z@TjEW=BnMk4aVsV6AdPb~`P)|)RjsWn+&Lo?zC8GT_k($xO zM)Q))+8~gq0=#V#tnb^CP||Z-ny2$=e-+<6iGM$g)^V2s&UGC=OO~4cl#4s`B?dKa z+NwjxP7n7V`uM09$=H8*u|2fX-G*<)MpJ+Y)W@;_KnSowD6U9Qp7? zn0Y-RYO8TuAlE~@ytd}{EAXTTfVBoR42_IZFOTiDUpR~pVS^#hN}u6PcC9vN<&?56ufZy_R{LT}yjW@>v^hpw zX`x9i&u-4GqpQVWyLBbc4ywRl^&31g`l{RmU@o}G+Ul?!v@#sdhp{2FyY@ocMC%&x zns)_dVIf*|+)K46s^fGp1KQ$PMM19?nbv*c6+cU(BPx(- z!z#;t_xqS|qh@V7c5+-H|LeMzvyu{^xKD<#XR4`x-f@X)d&NRBUs3n_2J!0=yH4z% zCz|$oK%CJ6Dy`EIp?RK&t@Gt&Izn6L=~_v|MnY+=BsI`UsCod^B3P<3R?kM21goyu z%7-*l-3Y7i4jjany8l)mm<)itAPf3A)t{+7cfgtyeWt4vdz*HT9LRp_3eHa>xorF^ zM zqYA-pApUp6UPW=r+*t|4oiuWXWOF8DEz=%#PM!%!I7k&cGG{byzp@I^+*x`%lGFp< zykjBSpgqM`jM`~9OXK!3fN_@tR?Bl_0$x=J?g0R+1gwflGM9)|0)SNkQ%PjVDlBQU zt!J%(k3>#DA|LITY5a4dJU`H9V=9T}iNg&s3vPxp_ApRy||3GoH zDTWH@4Q}s0XiE&C9Wk`yh(27Q>j>InjsP2lx-SMk0cokNqo5z={7C4FS+tTOupqlD zcgBE!cHT_XjWM)iN!c0`U)q8;-KCh`hwRkBBdCNaW%l_VfP@q|ILXGGm{Nd`Jp?Bz zgZ;@SdTBj@&BBI!)aRN zfB+MQH>fx=e&_ zo?W0%&?X2PCD69~G!s#27nKK2`M#AD`O9iX6k0~tOSPK8RtqAkSr+7pb@U9XEV7#2 ztMo3$z=?X6;3mylLVIcKBZG%N`BdE4H>Y}S+8iAlE9p~cOv5tvyPB`DIPz+DucPK7 zCvAt880XH*(!WWamxVoGd3RWug?o|O%!uumgxVR}_6mju93rqi`#Qi1FScKvhmxQx zGZQbG79vZ$u#l-t3uFR7QQ!bU{L2lCDDl)Tf?a`2N@hhCc(~~@MrBusXjal8p>y+I z?bk7O)#`DUz;N)>9v<@e@DVS(82?vL1{eHb2U;eZ$^b7-9k4>YKdi`~E-%!6Q(HaP zO3m_3?Db>!;kUJ>sj>l^tRn9Ig=$fy$~!L1L3QrFYdw?sDw8+eo45znAyBll@6HEWrc8VXY8kL%bfhYF7#FuF#Tx)F zn~igWmRT=j3L$`$%5rm&B?o`$cV)~fC^Hq{1(9pagc5QRC3J^k3)I*h2Px;i?}2^< zLNk$*Q-tmiB_k)12|0{(&K%Mywp*`0Pd__qbR2ng60jHO^X--3 zGO^DQ2-UvyTLhVdPq||f^ymyYy$d@+xW;(xUG~ur?IqLH_|Y?c*e`1kkmf4#cT*GY zS|N79ye~RtMcW`tkplkSS4)9NE}M(5024>K z1f4-~<%vC-$EB$~CrA!~+jotLyKC3JDx@BK80;4k7Tr3g+e1Bi4trt3M6hO4c0OI8 zO^M%i{+*t{*6hB!_UCs&os)_8_Rqq3O_B!Mn8l{GU!em-r*li8dp2GQ$tc)TO#Km} zjLC?^hyz@=>b##wi355qG2##9dqZ?%Ac3)^tIlzQ?*zBF1({OW?JDX9VXd z1%sQ7XNA?SQLP5MR5J>B*D9imRJ|ISEm|P~n+?5Fy4Y-5UecN&2QU=;MH!k6vjs$C zHle4|Axa_6JShOorZg$L<^$^;(4=UqPUNVcTy-K@-9%_MuratOI>B0y5g{>oXqi%~ zq>Xhk7uBXir_Nn^_Uk`r@W?SQO?usHb)(Y-O{F-gOT+yq*-wnw2E2jemT%f3y7$9< zhCcq}^MYsOf^8r#ARAO~kD2yK9fT1Y5|KWTl^!=sbOY4_?$JwgXg#*drZ^nusfyY; z&Z6@Qh!~Z?i}Q#tLF{Q1cK|r4lpXdEM2YQujt+adL(-t14=fX%fUvslTwkIXfFR~1 z+~N!ZdQyrplDSEh#Ixkfwld0jV{6Jgt3^2zW1!IgaM98EAsoAmV?OlgtQlzqa`SZO zXCl~qO6sf`Ez_89!~*L<;namdwM20*j?>dum6Ck*Dcs%i*ydA5cu777vWVh8|7ELR59&Wp;LVlMA;jNnF^21)RpebuD+KRqdY7uMt)Ul_I zHTCL&B~anj6M^WGP?}4nR}X1d(W#H8zBv(#_==$Zj>);esHo$+C@|`K=x6!~C4RY* zFGlud;NGVEIhz4sESAN;NqA(7Htl13Kla3k7bd>u#Xh86dt37@*{CuoW)=200PLj# zRp~w;F5Nx|SV^xyo8PgA(gD8>SrZ$8%qq7Q>vITj6_E zU9}BvvBGN`9pE0ZhN4M8%Rv5g-qJgRY9D(+<$e_* zz5ucLa_bzSopS=?ZiT&b3g&i2^PEK74pFqs+b(v`DQVlqiF{-u4CHfzsP~p{2A)J z_#ge6I(MSp#V%Ii0usG>7??TDJ5Jf@30;U3XG0r3?J;tvCz;o?ywj5y_IcWwr6xN= z+x~!6a|qeG8=ZyHOF^@>uLIlq&1Z@*p`+B6I#+BP$)*CBkU%*%=u?Loa+Hr{@#xQJt6D%QGvBWQ9pN^G|Gv6NdgN@?3Y> z%H;TM3{}uLHBcA(H;rr+)3tBEAx{j425cA&>ow79TC^6h<~EAVQwgi$@psv8QvPjR zkr%A|oPze{);XGcP|&%OcnYTwIV@Z@!w|U_MonTt4q>^_0c5RWp`)mgk`x|TVQKQi zkQ1ILDdFMnBtm$2MLB5XN(bd3qNNUS03D?cA`rp@8J>a2 z$F4meA36FJV1R)_V^3{fpe^5;ZAse_f!NK?K%dzX4ro0J@STuJ+pj*(soP)waU(yQW~=Sbh1if`=n-EXRA^$Td; zw*6mv4jKZiQlrQJ&FhVQy|*2dDxa13C1l zwb~DRwq4NI{%`k25nhHF1__XgIc6#IyNKtF{)3||r57#zt(m5t^^nZ4Kn= zgYY)+e^Y$Fx~+kbg&k?D4Q;j)J8j@dm|YQE4CPF*(*`O!&P3x743|7as;?|sAvNn|@oJKqh z>O4*-z>viI>3sU~i3P&qCJd;DWlAI{C*%|)$0@z`XF>2bO{x+HKl!~>eIHy5EW12}$hFu84XQXqx) zAI0f@ln3-{u%xs$y)FE%^}*Z|km@;#U7(8CXL$3|yxCj&T1 zah}Nw{>rV8@|nCMm&g9~XjUtDL#Fbe)8Z+Yhg)NLVOA`mb9qH759mgLMrR3;0As`i zGGBltJE(j~`5bwmV-}~#J7W>(wPi4ZFhu?yWxg?1dp4*^%T}$s_Z;x}6Hh@m^-E)4 znc(%YE&PPm{`^I2$(r-c*rurw857UpQ=mp?BNBZ7)##4zQTzq81kn-%tfaNQ6jYNR z=e)+~?$Iim?m^S}E2lbcbUk2&cEm%RD#nhE2`e-a0cR1T zN5NNH7OfXO0yTxEHy-A``e)%y(x|HV_ZA`OUF%Z=`~ zxQrgYzNzDRDFN`+qdvFTk~(ZVt<4GL8?%jO#b#rQNX>zVdxSnd#lI@Kor?3g@v^E< z(gW+^#@#3mVRx(6^luUW4TAJGw1|~YVI}I3Z;l=@no2rpp&eobV7_$V8xbbxDPX=t znGn~ua2c<>iko!fWi{JOH?HX3A;$K@caVPlRI@_p{SZb^*5!LuQHc8(zko(9VgQNLr~LeU@SI1rGtKo! zlow9jH7?En9Mp*mz?yDpPVtux(F*6?G*;KQ??410q9ww->Gi(@bWNXqhEDT6g7@)| zBgw>9fMYRZ;%7lnsz^5NWYL!%dtsW-d$8$=QPRq~2bb~uP*)H$VJ?$_BLom*N61ZxB6RufIgb}-4{y5) zhSb9=d3Lz5jWN#Q-e(HpLp7H>n$osS7s-!WdM?|G-RM)5OCWC_(*yN5J|gzvM@K$8 z>V-F`EzzGFzj;(41d>7rMRAKPym|_y89Uef45Yz}V4%bNe!0N&x$2%WC7NgJ-lkR+ z^|oD^5+XqhAn2wqceDfpLT4e9?_UH4H%%h}ac1074MUbGx0H=2LXXE)puH3u2_Rb0 zaBh(+R@$;Q+qyX!DQCSnHxiH;9pVWA;^xEPCY*g&xYLS>n~!3$+zd8^au`#OtO|Om z_h74SA6_+nmbTB!6Evna$o6Er>a1PvNvd#^XHSfMU*!YcM*Ix~OkQzN11A71S533f zmcVDSv;fXA8+I zs6k}{2?CvFOA@%fVbG5sukO9S`8SG^*ys;_O(N9gm(lyU0!x=PHC2DMEwontiuYN| zn7k)e_c`;H)P8@rN^ez`eDC{x{rTXKb}_L4Y8*IZ?xN1Dk!ArW>m|Wh*Bn3js z#0tC#Cn4t|%oHxd@Z{4h`|;Ix(`F*C^soN;*ibUx zaL4ETsQ#q$S)q9WIRY9*Kpw0x?FT&9Q2+x1ZR^OS|8Uwd<$nUp(|3XUfA_V6;04@nur!`-?2rhXXlf!^P^+W`4QSYY0sq5Rqi9B^( zB95V#0g}{$w(w_fTRWq+U1=7D?IO!K0E6iouDjN+s)}V|Obr5Cwd>OT!2tt@J~L+g z#MdT+WzuhI%*2};r~75$r7-p$Fm-e;Ru%?;+(`h!0R(P86j?ICGF{=k#%GfuTF}eA zqE2qAMr96TT>yV*C@1;+L>$&aEA1ix2pEE$C$$I;zY+oze;5-AACb{N85ZR<3_4YX z;(tgfmkW4m0$n5X;IBNQ<-M;EIb(IqXh~Rrl<*&ND2k$g;B$`N5wa|rs9Q&hUpxZP z6#@GLRATeH+L$!B9|!d3msoi+M&`Ym>oT@V(6_=hYSyJ&_dbsd95!;y^J6Fc-PM4O zv+vTF?l9!SsEMS*Noea1%&zsNke~!GOyH_BUjJzN33pGAM5~OBRQ#E{*1dBvsx7^p zgJc7EBEYuXvHTYhN{L(v%>Sb!XCb_z6K4q>IIAYv07?m<*&wOr+3zFBDx2b+$)AKK zb0z(gq6P~S+>#L4(wr7yY4sam%)cK->j17P7h#jS(iUh4Gqvm{QW1_sA%_7_P>Nj-QlcQfG6JG-dPk)3Ho#0E!nXXT`6S5#O e*_8BL8E{(VV@lcdU2i(M^Z-uhz1HS<_x?Yt#L&k8 literal 0 HcmV?d00001 From 5f6830499c918566dbb66ec3c66db26fdf7fb61f Mon Sep 17 00:00:00 2001 From: Dacheng Li Date: Mon, 7 Dec 2020 02:06:53 +0000 Subject: [PATCH 4/9] merge two functions --- adaptdl/adaptdl/torch/__init__.py | 3 ++- adaptdl/requirements.txt | 1 + sched/adaptdl_sched/supervisor.py | 20 ++++++++++++++++++-- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/adaptdl/adaptdl/torch/__init__.py b/adaptdl/adaptdl/torch/__init__.py index d453d8ba..227dd146 100644 --- a/adaptdl/adaptdl/torch/__init__.py +++ b/adaptdl/adaptdl/torch/__init__.py @@ -43,7 +43,8 @@ def init_process_group(backend): key = adaptdl.env.job_id() group = adaptdl.env.num_restarts() while True: - response = requests.get(url=f"{url}/discover/{key}/{group}") + response = requests.get(url=f"{url}/discover/{key}/{group}", + params={'gpu': False}) if response.status_code != 408: # Timeout. break response.raise_for_status() diff --git a/adaptdl/requirements.txt b/adaptdl/requirements.txt index 4bd640b2..e3aac7ce 100644 --- a/adaptdl/requirements.txt +++ b/adaptdl/requirements.txt @@ -3,3 +3,4 @@ pandas>=0.24.2 portpicker>=1.3.1 redis>=3.3.8 scipy>=1.3.0 +jsonpath-ng>=1.5.2 diff --git a/sched/adaptdl_sched/supervisor.py b/sched/adaptdl_sched/supervisor.py index 24d0ed07..eaa69875 100644 --- a/sched/adaptdl_sched/supervisor.py +++ b/sched/adaptdl_sched/supervisor.py @@ -49,6 +49,7 @@ async def _handle_discover(self, request): group = request.match_info["group"] timeout = int(request.query.get("timeout", "30")) pod_ip_list = None + pod_gpu_list = None async with kubernetes.watch.Watch() as w: stream = w.stream(self._core_api.list_namespaced_pod, namespace, label_selector="adaptdl/job={}".format(name), @@ -62,6 +63,23 @@ async def _handle_discover(self, request): if pod_ip_list is None: pod_ip_list = [None] * replicas pod_ip_list[rank] = pod.status.pod_ip + if request.rel_url.query["gpu"]: + if pod_gpu_list is None: + pod_gpu_list = [None] * replicas + container = pod.spec.containers + assert len(container) == 1 + pod_gpu_list[rank] = \ + int(container[0].resources.requests[ + 'nvidia.com/gpu']) + if all(pod_gpu is not None + for pod_gpu in pod_gpu_list) and \ + all(pod_ip is not None + for pod_ip in pod_ip_list): + assert len(pod_ip_list) == len(pod_gpu_list) + return_list = [(pod_ip_list[i], pod_gpu_list[i]) + for i in range(len(pod_ip_list))] + LOG.info(return_list) + return web.json_response(return_list) if all(pod_ip is not None for pod_ip in pod_ip_list): return web.json_response(pod_ip_list) return web.json_response(status=408) # Timeout. @@ -124,8 +142,6 @@ def run(self): web.get('/healthz', self._handle_healthz), web.get('/discover/{namespace}/{name}/{group}', self._handle_discover), - web.get('/discover_gpu/{namespace}/{name}/{group}', - self._handle_discover_gpu), web.put('/hints/{namespace}/{name}', self._handle_report), ]) LOG.info("%s %s", self._host, self._port) From dbb39cc2323da6429822485aae98718fa0858152 Mon Sep 17 00:00:00 2001 From: Dacheng Li Date: Mon, 7 Dec 2020 03:25:27 +0000 Subject: [PATCH 5/9] clean --- adaptdl/adaptdl/torch/__init__.py | 3 +-- adaptdl/requirements.txt | 1 - sched/adaptdl_sched/supervisor.py | 41 +------------------------------ 3 files changed, 2 insertions(+), 43 deletions(-) diff --git a/adaptdl/adaptdl/torch/__init__.py b/adaptdl/adaptdl/torch/__init__.py index 227dd146..d453d8ba 100644 --- a/adaptdl/adaptdl/torch/__init__.py +++ b/adaptdl/adaptdl/torch/__init__.py @@ -43,8 +43,7 @@ def init_process_group(backend): key = adaptdl.env.job_id() group = adaptdl.env.num_restarts() while True: - response = requests.get(url=f"{url}/discover/{key}/{group}", - params={'gpu': False}) + response = requests.get(url=f"{url}/discover/{key}/{group}") if response.status_code != 408: # Timeout. break response.raise_for_status() diff --git a/adaptdl/requirements.txt b/adaptdl/requirements.txt index e3aac7ce..4bd640b2 100644 --- a/adaptdl/requirements.txt +++ b/adaptdl/requirements.txt @@ -3,4 +3,3 @@ pandas>=0.24.2 portpicker>=1.3.1 redis>=3.3.8 scipy>=1.3.0 -jsonpath-ng>=1.5.2 diff --git a/sched/adaptdl_sched/supervisor.py b/sched/adaptdl_sched/supervisor.py index eaa69875..f8028ac2 100644 --- a/sched/adaptdl_sched/supervisor.py +++ b/sched/adaptdl_sched/supervisor.py @@ -83,46 +83,7 @@ async def _handle_discover(self, request): if all(pod_ip is not None for pod_ip in pod_ip_list): return web.json_response(pod_ip_list) return web.json_response(status=408) # Timeout. - - async def _handle_discover_gpu(self, request): - # Long-polling endpoint used for discoverin - # pod IPs and GPU for a given job. - namespace = request.match_info["namespace"] - name = request.match_info["name"] - group = request.match_info["group"] - timeout = int(request.query.get("timeout", "30")) - pod_ip_list = None - pod_gpu_list = None - async with kubernetes.watch.Watch() as w: - stream = w.stream(self._core_api.list_namespaced_pod, namespace, - label_selector="adaptdl/job={}".format(name), - field_selector="status.podIP!=", - timeout_seconds=timeout) - async for event in stream: - pod = event["object"] - replicas = int(pod.metadata.annotations["adaptdl/replicas"]) - rank = int(pod.metadata.annotations["adaptdl/rank"]) - if pod.metadata.annotations["adaptdl/group"] == group: - if pod_ip_list is None: - pod_ip_list = [None] * replicas - pod_ip_list[rank] = pod.status.pod_ip - if pod_gpu_list is None: - pod_gpu_list = [None] * replicas - container = pod.spec.containers - assert len(container) == 1 - pod_gpu_list[rank] = \ - int(container[0].resources.requests[ - 'nvidia.com/gpu']) - if all(pod_gpu is not None for pod_gpu in pod_gpu_list)\ - and all(pod_ip is not None - for pod_ip in pod_ip_list): - assert len(pod_ip_list) == len(pod_gpu_list) - return_list = [(pod_ip_list[i], pod_gpu_list[i]) - for i in range(len(pod_ip_list))] - LOG.info(return_list) - return web.json_response(return_list) - return web.json_response(status=408) # Timeout. - + async def _handle_report(self, request): namespace = request.match_info['namespace'] name = request.match_info['name'] From 5dcff5bacecb8e11ade32612bd8d1bf6449fa134 Mon Sep 17 00:00:00 2001 From: Dacheng Li Date: Mon, 7 Dec 2020 03:35:22 +0000 Subject: [PATCH 6/9] lint --- sched/adaptdl_sched/supervisor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sched/adaptdl_sched/supervisor.py b/sched/adaptdl_sched/supervisor.py index f8028ac2..c4198370 100644 --- a/sched/adaptdl_sched/supervisor.py +++ b/sched/adaptdl_sched/supervisor.py @@ -83,7 +83,7 @@ async def _handle_discover(self, request): if all(pod_ip is not None for pod_ip in pod_ip_list): return web.json_response(pod_ip_list) return web.json_response(status=408) # Timeout. - + async def _handle_report(self, request): namespace = request.match_info['namespace'] name = request.match_info['name'] From c2202214d8980699baee08ca9d9239cf5ff27fa6 Mon Sep 17 00:00:00 2001 From: Dacheng Li Date: Mon, 7 Dec 2020 05:55:31 +0000 Subject: [PATCH 7/9] add exception --- sched/adaptdl_sched/supervisor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sched/adaptdl_sched/supervisor.py b/sched/adaptdl_sched/supervisor.py index c4198370..69edeeb6 100644 --- a/sched/adaptdl_sched/supervisor.py +++ b/sched/adaptdl_sched/supervisor.py @@ -63,7 +63,11 @@ async def _handle_discover(self, request): if pod_ip_list is None: pod_ip_list = [None] * replicas pod_ip_list[rank] = pod.status.pod_ip - if request.rel_url.query["gpu"]: + try: + gpu_request = request.rel_url.query["gpu"] + except KeyError: + gpu_request = False + if gpu_request: if pod_gpu_list is None: pod_gpu_list = [None] * replicas container = pod.spec.containers From f1465161e5f2f5f1c60eb8cb2bb22204a32404b7 Mon Sep 17 00:00:00 2001 From: Dacheng Li Date: Fri, 1 Jan 2021 22:25:21 +0000 Subject: [PATCH 8/9] update dockerfile --- examples/autodist/Dockerfile | 23 ++++++++--------------- examples/autodist/adaptdljob.yaml | 5 ++++- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/examples/autodist/Dockerfile b/examples/autodist/Dockerfile index a7cb9c28..b4147a03 100644 --- a/examples/autodist/Dockerfile +++ b/examples/autodist/Dockerfile @@ -19,12 +19,6 @@ WORKDIR /root FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime FROM tensorflow/tensorflow:2.2.0-gpu - -# Set default shell to /bin/bash -# SHELL ["/bin/bash", "-cu"] - -# RUN rm -rf /etc/bash.bashrc - # Install apps COPY adaptdl adaptdl COPY examples/requirements.txt . @@ -36,9 +30,6 @@ RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl RUN pip install -r requirements.txt RUN rm -rf adaptdl/dist -COPY examples examples_adaptdl -#COPY examples examples -#RUN apt-get update && apt-get install -y --no-install-recommends apt-utils # autodist env SHELL ["/bin/bash", "-cu"] @@ -59,16 +50,18 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ COPY bert_config.json bert_config.json COPY tf_examples.tfrecord tf_examples.tfrecord +RUN git clone https://github.com/petuum/autodist.git +#RUN cd autodist +#WORKDIR autodist +#RUN git checkout integration COPY autodist autodist -RUN cd autodist -RUN pip install tensorflow_hub RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip -COPY autodist/protoc-3.11.0-linux-x86_64.zip protoc-3.11.0-linux-x86_64.zip RUN unzip protoc-3.11.0-linux-x86_64.zip -RUN PROTOC=autodist/bin/protoc python autodist/setup.py build -WORKDIR autodist -RUN rm ./examples/resource_spec.yml +RUN PROTOC=$(pwd)/bin/protoc python setup.py build RUN pip install -e .[dev] +RUN pip install tensorflow_hub +WORKDIR autodist +#RUN rm ./examples/resource_spec.yml # setup ssh # Install OpenSSH to communicate between containers diff --git a/examples/autodist/adaptdljob.yaml b/examples/autodist/adaptdljob.yaml index ed364fa9..51212161 100644 --- a/examples/autodist/adaptdljob.yaml +++ b/examples/autodist/adaptdljob.yaml @@ -10,7 +10,7 @@ spec: - name: main command: - python3 - - /root/autodist/examples/benchmark/bert.py + - /root/autodist/examples/benchmark/bert_with_adaptdl.py - -input_files=/root/tf_examples.tfrecord - --bert_config_file=/root/bert_config.json - --num_train_epochs=1 @@ -18,6 +18,9 @@ spec: - --learning_rate=5e-5 - --steps_per_loop=1 - --autodist_strategy=PS + env: + - name: ADAPTDL + value: "true" resources: limits: nvidia.com/gpu: 1 From c012cde10ca23d1edffe84d6cd27d155a32b7a08 Mon Sep 17 00:00:00 2001 From: Dacheng Li Date: Sat, 2 Jan 2021 01:10:16 +0000 Subject: [PATCH 9/9] update dockerfile --- examples/autodist/Dockerfile | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/autodist/Dockerfile b/examples/autodist/Dockerfile index b4147a03..bec5574e 100644 --- a/examples/autodist/Dockerfile +++ b/examples/autodist/Dockerfile @@ -51,20 +51,16 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ COPY bert_config.json bert_config.json COPY tf_examples.tfrecord tf_examples.tfrecord RUN git clone https://github.com/petuum/autodist.git -#RUN cd autodist -#WORKDIR autodist -#RUN git checkout integration -COPY autodist autodist +WORKDIR autodist +RUN git checkout integration RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip RUN unzip protoc-3.11.0-linux-x86_64.zip RUN PROTOC=$(pwd)/bin/protoc python setup.py build RUN pip install -e .[dev] RUN pip install tensorflow_hub WORKDIR autodist -#RUN rm ./examples/resource_spec.yml # setup ssh -# Install OpenSSH to communicate between containers RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \ mkdir -p /var/run/sshd