diff --git a/.gitignore b/.gitignore index 5dc488d..cfb90a4 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ secrets/terraform-service-principal.sh terraform/.terraform/ terraform/*.tfstate terraform/*.tfstate.backup + +mpi-job-files/src/ +mpi-job-files/pkg/ +mpi-job-files/*.swp diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..d1476e3 --- /dev/null +++ b/go.mod @@ -0,0 +1,49 @@ +module github.com/StatCan/openmpp + +go 1.21.6 + +require ( + github.com/kubeflow/training-operator v1.7.0 + k8s.io/api v0.29.1 + k8s.io/apimachinery v0.29.1 + k8s.io/client-go v0.29.1 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/go-logr/logr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.19.6 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.22.3 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.3 // indirect + github.com/google/gnostic-models v0.6.8 // indirect + github.com/google/gofuzz v1.2.0 // indirect + github.com/google/uuid v1.3.0 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/sirupsen/logrus v1.9.0 // indirect + golang.org/x/net v0.19.0 // indirect + golang.org/x/oauth2 v0.10.0 // indirect + golang.org/x/sys v0.15.0 // indirect + golang.org/x/term v0.15.0 // indirect + golang.org/x/text v0.14.0 // indirect + golang.org/x/time v0.3.0 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.31.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/klog/v2 v2.110.1 // indirect + k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect + k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect + sigs.k8s.io/controller-runtime v0.15.0 // indirect + sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect + sigs.k8s.io/yaml v1.3.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..22e0d01 --- /dev/null +++ b/go.sum @@ -0,0 +1,162 @@ +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= +github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= +github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= +github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kubeflow/training-operator v1.7.0 h1:Zh61GlOWrlRi4UFOtJeV+/5REo/OndhwQ25KYd0llzc= +github.com/kubeflow/training-operator v1.7.0/go.mod h1:BZCLX1h06wY3YSeSZZcGYAqI9/nVi7isVCRkfgZe9nE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/onsi/ginkgo/v2 v2.13.0 h1:0jY9lJquiL8fcf3M4LAXN5aMlS/b2BV86HFFPCPMgE4= +github.com/onsi/ginkgo/v2 v2.13.0/go.mod h1:TE309ZR8s5FsKKpuB1YAQYBzCaAfUgatB/xlT/ETL/o= +github.com/onsi/gomega v1.29.0 h1:KIA/t2t5UBzoirT4H9tsML45GEbo3ouUnBHsCfD2tVg= +github.com/onsi/gomega v1.29.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= +github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= +golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= +golang.org/x/oauth2 v0.10.0 h1:zHCpF2Khkwy4mMB4bv0U37YtJdTGW8jI0glAApi0Kh8= +golang.org/x/oauth2 v0.10.0/go.mod h1:kTpgurOux7LqtuxjuyZa4Gj2gdezIt/jQtGnNFfypQI= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= +golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.16.1 h1:TLyB3WofjdOEepBHAU20JdNC1Zbg87elYofWYAY5oZA= +golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.29.1 h1:DAjwWX/9YT7NQD4INu49ROJuZAAAP/Ijki48GUPzxqw= +k8s.io/api v0.29.1/go.mod h1:7Kl10vBRUXhnQQI8YR/R327zXC8eJ7887/+Ybta+RoQ= +k8s.io/apimachinery v0.29.1 h1:KY4/E6km/wLBguvCZv8cKTeOwwOBqFNjwJIdMkMbbRc= +k8s.io/apimachinery v0.29.1/go.mod h1:6HVkd1FwxIagpYrHSwJlQqZI3G9LfYWRPAkUvLnXTKU= +k8s.io/client-go v0.29.1 h1:19B/+2NGEwnFLzt0uB5kNJnfTsbV8w6TgQRz9l7ti7A= +k8s.io/client-go v0.29.1/go.mod h1:TDG/psL9hdet0TI9mGyHJSgRkW3H9JZk2dNEUS7bRks= +k8s.io/klog/v2 v2.110.1 h1:U/Af64HJf7FcwMcXyKm2RPM22WZzyR7OSpYj5tg3cL0= +k8s.io/klog/v2 v2.110.1/go.mod h1:YGtd1984u+GgbuZ7e08/yBuAfKLSO0+uR1Fhi6ExXjo= +k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780= +k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.15.0 h1:ML+5Adt3qZnMSYxZ7gAverBLNPSMQEibtzAgp0UPojU= +sigs.k8s.io/controller-runtime v0.15.0/go.mod h1:7ngYvp1MLT+9GeZ+6lH3LOlcHkp/+tzA/fmHa4iq9kk= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= +sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= +sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/mpi-job-files/MPIJobTemplate.yaml b/mpi-job-files/MPIJobTemplate.yaml index 2081a3b..a5e1b9d 100644 --- a/mpi-job-files/MPIJobTemplate.yaml +++ b/mpi-job-files/MPIJobTemplate.yaml @@ -6,7 +6,8 @@ metadata: labels: notebook-name: # spec: - #slotsPerWorker: Tensorflow example sets this attribute, but Pat's example template doesn't. + # Experimenting with a hard-coded value of 2 that should fit our standard 2-core cluster nodes. + slotsPerWorker: 2 runPolicy: cleanPodPolicy: Running mpiReplicaSpecs: diff --git a/mpi-job-files/mpiJob/mpiJob b/mpi-job-files/mpiJob/mpiJob new file mode 100755 index 0000000..81516a9 Binary files /dev/null and b/mpi-job-files/mpiJob/mpiJob differ diff --git a/mpi-job-files/mpiJob/mpiJob.go b/mpi-job-files/mpiJob/mpiJob.go new file mode 100644 index 0000000..ca9643e --- /dev/null +++ b/mpi-job-files/mpiJob/mpiJob.go @@ -0,0 +1,449 @@ +// A handler for mpi-based run requests to be called by the openm web service application. +// Accesses the kubernetes api endpoint for mpijobs, requests new mpijob, relays log information from the +// launcher pod as the job is running, and responds to an abort request if it is relayed by the web service. + +package main + +import ( + "context" + "fmt" + "io" + "os" + "path" + //"reflect" + "strconv" + "strings" + "time" + + core "k8s.io/api/core/v1" + //"k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + meta "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + + kubeAPI "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeClient "github.com/kubeflow/training-operator/pkg/client/clientset/versioned/typed/kubeflow.org/v1" +) + +// Arguments that are currently passed via the kubeflow template: +// ModelName string // model name +// ExeStem string // base part of model exe name, usually modelName +// Dir string // work directory to run the model +// BinDir string // bin directory where model exe is located +// DbPath string // absolute path to sqlite database file: models/bin/model.sqlite +// MpiNp int // number of MPI processes +// HostFile string // if not empty then path to hostfile +// Args []string // model command line arguments +// Env map[string]string // environment variables to run the model + +func main() { + // Set hardcoded namespace and argument values representing + // an mpijob run request from the OpenM web service: + namespace := "jacek-dev" + modelName := "RiskPaths" + exeStem := "RiskPaths" + dir := "/home/jovyan/buckets/aaw-unclassified/microsim/models/bin" + binDir := "." + dbPath := "/home/jovyan/buckets/aaw-unclassified/microsim/models/bin/RiskPaths.sqlite" + var mpiNp int32 = 4 + args := []string{ + "-OpenM.RunStamp", + "2024_02_09_04_08_45_834", + "-OpenM.LogToConsole", + "true", + "-OpenM.LogRank", + "true", + "-OpenM.MessageLanguage", + "en-US", + "-OpenM.RunName", + "RiskPaths_Default_2024_02_08_23_08_10_047", + "-OpenM.SetName", + "Default", + "-OpenM.SubValues", + "32", + "-OpenM.Threads", + "16", + } + env := map[string]string{ + "SAMPLE_ENV": "VALUE", + } + + // Create an MPIJob object using the sample arguments defined above. + job := mpiJob(modelName, exeStem, dir, binDir, dbPath, mpiNp, args, env) + + // Validate spec using validation function. + err := kubeAPI.ValidateV1MpiJobSpec(&job.Spec) + if err != nil { + panic(err.Error()) + } else { + fmt.Println("MPIJobSpec passed validation.") + } + + // Create in-cluster configuration object. + config, err := rest.InClusterConfig() + if err != nil { + panic(err.Error()) + } else { + fmt.Println("Cluster config object created.") + } + + // Obtain clientset with core resources. We will need it to access launcher pod logs. + clientSet, err := kubernetes.NewForConfig(config) + if err != nil { + panic(err.Error()) + } else { + fmt.Println("Clientset obtained from config.") + } + + // Obtain interface to Pods collection for our namespace. + pods := clientSet.CoreV1().Pods(namespace) + + // Obtain pods collection Watch interface: + podsWatcher, err := pods.Watch(context.TODO(), meta.ListOptions{}) + if err != nil { + panic(err.Error()) + } else { + fmt.Println("Obtained pods collection watch interface.") + } + + // Obtain reference to Pods collection event channel: + podsChan := podsWatcher.ResultChan() + + // Obtain client subset containing just the kubeflow based resources. + kubeClientSubset, err := kubeClient.NewForConfig(config) + if err != nil { + panic(err.Error()) + } else { + fmt.Println("Kubeflow client subset obtained from config.") + } + + // Obtain interface to the MPIJobs collection for our namespace. + mpiJobs := kubeClientSubset.MPIJobs(namespace) + + // Obtain MPIJobs collection Watch interface. + mpiJobsWatcher, err := mpiJobs.Watch(context.TODO(), meta.ListOptions{}) + if err != nil { + panic(err.Error()) + } else { + fmt.Println("Obtained mpiJobs collection watch interface.") + } + + // Obtain reference to MPIJobs collection event channel: + mpiJobsChan := mpiJobsWatcher.ResultChan() + + // Submit request to create MPIJob. It's confusing because an MPIJob is also passed as an argument. + // But the MPIJob being returned should have an active status, while the one being submitted will not. + _, err = mpiJobs.Create(context.TODO(), &job, meta.CreateOptions{}) + if err != nil { + panic(err.Error()) + } else { + fmt.Println("MPIJob was successfully submitted.") + } + + // Watch for events coming from MPIJobs collection and Pods collection: + var elapsedTime time.Duration + for { + select { + case podEvent, ok := <-podsChan: + if ok { + fmt.Println("Pod event...") + fmt.Println("EventType: ", podEvent.Type) + gvk := podEvent.Object.GetObjectKind().GroupVersionKind() + fmt.Println("Group: ", gvk.Group) + fmt.Println("Version: ", gvk.Version) + fmt.Println("Kind: ", gvk.Kind) + // Use reflection to determine what concrete type we're actually + // getting behind the runtime.Object interface, and how can we + // determine launcher pod status from it. + fmt.Println("") + } else { + fmt.Println("podsChannel is closed.") + } + case mpiJobEvent, ok := <-mpiJobsChan: + if ok { + fmt.Println("MPIJob event...") + fmt.Println("EventType: ", mpiJobEvent.Type) + gvk := mpiJobEvent.Object.GetObjectKind().GroupVersionKind() + fmt.Println("Group: ", gvk.Group) + fmt.Println("Version: ", gvk.Version) + fmt.Println("Kind: ", gvk.Kind) + // Same thing, use reflection to figure out how to get mpijob + // status info from the concrete type behind runtime.Object. + fmt.Println("") + } else { + fmt.Println("mpiJobsChannel is closed.") + } + default: + fmt.Println("Elapsed time: ", elapsedTime) + time.Sleep(2 * time.Second) // Elapsed time is not quite correct because it ignores the time + elapsedTime += 2 // elapsed when the channel reads are happening. + + // Break out after some reasonable time limit, at least for now when we're testing. + if elapsedTime > (360 * time.Second) { + break + } + } + } + + // Commenting out and probably removing, as it's not necessary to list unrelated mpijobs in output. + // results, err := mpiJobs.List(context.TODO(), meta.ListOptions{}) + // if err != nil { + // panic(err.Error()) + // } else { + // fmt.Println("MPIJobs collection obtained.") + // } + + // Display status of mpi jobs in collection. + // for _, r := range results.Items { + // fmt.Println("Job name: ", r.ObjectMeta.Name) + // for _, c := range r.Status.Conditions { + // fmt.Println("Condition type:", c.Type, " ... ", "Status", c.Status) + // fmt.Println("") + // } + // } + + // Obtain launcher pod name from mpijob template. It defaults to name of main container in launcher pod. + name := job.Spec.MPIReplicaSpecs["Launcher"].Template.Spec.Containers[0].Name + + // Confirm it's returning the correct pod name: + fmt.Println("Launcher pod name: ", name) + + // CoreV1() returns a CoreV1Interface instance. + // Pods(namespace) returns a PodInterface instance. + // Both are defined in: client-go/kubernetes/core/v1. + // Pod type is defined in: k8s.io/api/core/v1. + + // launcherPod has Status field of type PodStatus. + // PodStatus includes fields: Phase PodPhase, ContainerStatuses []ContainerStatus + // Poll launcher pod status until it's Running or in a terminal state or until it times out. + + // Commenting out because it's bugging out currently, and we want to test if we can get + // the log stream from the launcher pod once its up. We'll hard-code a wait instead for now. + // var launcherPod *core.Pod + // elapsedTime := 0 + // for { + // // Use PodInterface Get method to obtain Pod object representing the launcher pod. + // launcherPod, err := clientSet.CoreV1().Pods(namespace).Get(context.TODO(), name, meta.GetOptions{}) + + // // If pod object is returned check its status. + // if err == nil { + // phase := launcherPod.Status.Phase + // fmt.Println("PodPhase: ", phase) + // if phase == core.PodRunning || phase == core.PodSucceeded { + // break + // } else if phase == core.PodFailed || phase == core.PodPending && elapsedTime > 300 { + // panic(err.Error()) + // } + // } else if elapsedTime < 300 { + // time.Sleep(2 * time.Second) + // elapsedTime += 2 + // } else { + // panic(err.Error()) + // } + // } + + // Ok, hard-coding a sleep interval worked. When we access the pod now it's available. + // time.Sleep(25 * time.Second) + // launcherPod, err := clientSet.CoreV1().Pods(namespace).Get(context.TODO(), name, meta.GetOptions{}) + // if err != nil { + // panic(err.Error()) + // } + + // Print PodStatus.Phase one last time: + // fmt.Println("Launcher pod phase: ", launcherPod.Status.Phase) + + // Once launcher pod is running hook into its logs using a rest.Request instance: + req := clientSet.CoreV1().Pods(namespace).GetLogs(name, &core.PodLogOptions{}) + + // I want to check what type req is. + // tp := reflect.TypeOf(req) + // fmt.Println("Type of req: ", tp, " Name: ", tp.Name()) + + // podLogs is of type io.ReadCloser. + // It implements the Reader and Closer interfaces in the standard library. + podLogs, err := req.Stream(context.TODO()) + if err != nil { + panic(err.Error()) + } + defer podLogs.Close() + + // Route launcher pod log stream to standard output. + _, err = io.Copy(os.Stdout, podLogs) + if err != nil { + panic(err.Error()) + } + + // Use os.Exit to terminate the program and return a status code. + // In cases of error we will be able to send an error status code that the web service should pick up. + os.Exit(0) +} + +// Generate mpijob object based on arguments coming from openm web service and cluster configuration. +func mpiJob(modelName, exeStem, dir, binDir, dbPath string, mpiNp int32, args []string, env map[string]string) kubeAPI.MPIJob { + + // Start off by constructing constituent parts from the bottom up: + timeStamp := strconv.FormatInt(time.Now().UnixNano(), 10) + containerImage := "k8scc01covidacr.azurecr.io/ompp-run-ubuntu:0f80cf47bb5d6f6e025990e511b72be614bbcf7c" + + //Need to join modelName with binDir and append _mpi: + modelExecutable := strings.Join([]string{path.Join(binDir, modelName), "mpi"}, "_") + + // Entrypoint command for containers: + containerCommand := []string{"mpirun", "/bin/bash"} + + // Append the ulimit setting, fully qualified model exec filename, and all OpenM options: + bashArguments := append([]string{"ulimit -s 63356 &&", modelExecutable}, args...) + + // I think any occurrences of environment variables (entries prefixed by $) in the container + // arguments field are automatically resolved to the values these environment variables hold + // in the given execution context. That's why there are separate fields for command and + // arguments in the container type. + containerArguments := append([]string{"-c"}, strings.Join(bashArguments, " ")) + + // CPU resource limits and requests: + cpuResourceLimit := resource.NewMilliQuantity(2000, resource.BinarySI) + cpuLauncherRequest := resource.NewMilliQuantity(250, resource.BinarySI) + cpuWorkerRequest := resource.NewMilliQuantity(2000, resource.BinarySI) + + // Memory resource limits and requests: + memoryLimit := resource.NewQuantity(2*1024*1024*1024, resource.BinarySI) + memoryLauncherRequest := resource.NewQuantity(250*1024*1024, resource.BinarySI) + memoryWorkerRequest := resource.NewQuantity(1024*1024*1024, resource.BinarySI) + + // Node resource limits: + resourceLimits := core.ResourceList{ + core.ResourceCPU: *cpuResourceLimit, + core.ResourceMemory: *memoryLimit, + } + + // Launcher resource requests: + launcherResourceRequests := core.ResourceList{ + core.ResourceCPU: *cpuLauncherRequest, + core.ResourceMemory: *memoryLauncherRequest, + } + + // Worker core requests: + workerResourceRequests := core.ResourceList{ + core.ResourceCPU: *cpuWorkerRequest, + core.ResourceMemory: *memoryWorkerRequest, + } + + // Launcher resource requirements: + launcherResourceRequirements := core.ResourceRequirements{ + Limits: resourceLimits, + Requests: launcherResourceRequests, + } + + // Worker resource requirements: + workerResourceRequirements := core.ResourceRequirements{ + Limits: resourceLimits, + Requests: workerResourceRequests, + } + + // Launcher container spec: + mainContainerName := strings.Join([]string{strings.ToLower(modelName), timeStamp, "launcher"}, "-") + launcherContainer := core.Container{ + Name: mainContainerName, + Image: containerImage, + Command: containerCommand, + Args: containerArguments, + Resources: launcherResourceRequirements, + } + + // Worker container spec: + workerContainer := core.Container{ + Name: strings.Join([]string{strings.ToLower(modelName), timeStamp, "worker"}, "-"), + Image: containerImage, + Resources: workerResourceRequirements, + } + + // Launcher pod specs: + launcherPodSpec := core.PodSpec{ + Containers: []core.Container{launcherContainer}, + } + + // Worker pod specs: + workerPodSpec := core.PodSpec{ + Containers: []core.Container{workerContainer}, + } + + // Labels for worker and launcher pods: + labels := map[string]string{ + "data.statcan.gc.ca/inject-blob-volumes": "true", + "sidecar.istio.io/inject": "false", + } + + podObjectMetadata := meta.ObjectMeta{ + Labels: labels, + } + + // Pod template specifications: + launcherPodTemplateSpec := core.PodTemplateSpec{ + ObjectMeta: podObjectMetadata, + Spec: launcherPodSpec, + } + + workerPodTemplateSpec := core.PodTemplateSpec{ + ObjectMeta: podObjectMetadata, + Spec: workerPodSpec, + } + + // Launcher and worker replica specs: + var one int32 = 1 + launcherReplicaSpec := kubeAPI.ReplicaSpec{ + Replicas: &one, + Template: launcherPodTemplateSpec, + } + + workerReplicaSpec := kubeAPI.ReplicaSpec{ + Replicas: &mpiNp, + Template: workerPodTemplateSpec, + } + + // MPIJobSpec: + var two int32 = 2 + var cleanPodPolicy kubeAPI.CleanPodPolicy = kubeAPI.CleanPodPolicyRunning + mpiJobSpec := kubeAPI.MPIJobSpec{ + // 2 slots conforms to typical worker nodes used in the aaw cluster. + SlotsPerWorker: &two, + + // This one is a map from the set of replica types {Launcher, Worker, ... } to *ReplicaSpecs. + // We don't need to provide ReplicaSpecs for any replica types other than Launcher and Worker. + // The reason they have these other ones is for specifying different types of distributed workloads. + MPIReplicaSpecs: map[kubeAPI.ReplicaType]*kubeAPI.ReplicaSpec{ + "Launcher": &launcherReplicaSpec, + "Worker": &workerReplicaSpec, + }, + + MainContainer: mainContainerName, + CleanPodPolicy: &cleanPodPolicy, + } + + // Get hostname to annotate to MPIJob object. + data, err := os.ReadFile("/etc/hostname") + if err != nil { + panic(err.Error()) + } + hostname := strings.Replace(string(data), "\n", "", -1) + + // Type and object metadata: + tm := meta.TypeMeta{ + Kind: "MPIJob", + APIVersion: "kubeflow.org/v1", + } + om := meta.ObjectMeta{ + Name: strings.Join([]string{strings.ToLower(modelName), timeStamp}, "-"), + Labels: map[string]string{ + "notebook-name": hostname, + }, + } + + // Construct MPIJob object: + job := kubeAPI.MPIJob{ + TypeMeta: tm, + ObjectMeta: om, + Spec: mpiJobSpec, + } + return job +} diff --git a/mpi-job-files/mpiJob/temp.go b/mpi-job-files/mpiJob/temp.go new file mode 100644 index 0000000..bdfd227 --- /dev/null +++ b/mpi-job-files/mpiJob/temp.go @@ -0,0 +1,209 @@ +// Moved type and function declarations that are not being used into this separate file because +// go compiler throws errors when building a package with unused declarations. + +// But we could try this suggestion also. This is a function with any empty body that accepts +// any number of arguments of any type. Invoking this function once in our main() and passing +// it all the unused variables takes care of all the unused variable build errors. +package temp + +func UNUSED(x ...interface{}) {} + +// Modify this to conform to our use case. We'll have already created a clientset. +// And we will want to use a label selector or name to locate the launcher pod for a given MPIjob. +func getPodLogs(pod core.Pod) string { + podLogOpts := core.PodLogOptions{} + config, err := rest.InClusterConfig() + if err != nil { + return "error in getting config" + } + // creates the clientset + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return "error in getting access to K8S" + } + req := clientset.Core().Pods(pod.Namespace).GetLogs(pod.Name, &podLogOpts) + podLogs, err := req.Stream() + if err != nil { + return "error in opening stream" + } + defer podLogs.Close() + + buf := new(bytes.Buffer) + _, err = io.Copy(buf, podLogs) + if err != nil { + return "error in copy information from podLogs to buf" + } +} + +// Might want to define a structure for the arguments coming from openm web service as that will +// let us provide default values for some fields, and refactor mpiJobSpec below to accept the +// structure instead. +type mpiJobArgs struct { + modelName string + exeStem string + dir string + binDir string + dbPath string + mpiNp int32 + args []string + env map[string]string +} + +// Function based on go-client example code that queries the kubernets api for pod info: +func showPods(cs *kubernetes.Clientset, namespace string) { + // Specify namespace to get pods in particular namespace + // or omit parameter to search all namespaces. + pods, err := cs.CoreV1().Pods(namespace).List(context.TODO(), meta.ListOptions{}) + if err != nil { + panic(err.Error()) + } + fmt.Printf("There are %d pods in the cluster\n", len(pods.Items)) + + // Use some reflection functions to explore the pods variable: + + // Examples for error handling: + // - Use helper functions e.g. errors.IsNotFound() + // - And/or cast to StatusError and use its properties like e.g. ErrStatus.Message + _, err = cs.CoreV1().Pods("default").Get(context.TODO(), "example-xxxxx", meta.GetOptions{}) + if errors.IsNotFound(err) { + fmt.Printf("Pod example-xxxxx not found in default namespace\n") + } else if statusError, isStatus := err.(*errors.StatusError); isStatus { + fmt.Printf("Error getting pod %v\n", statusError.ErrStatus.Message) + } else if err != nil { + panic(err.Error()) + } else { + fmt.Printf("Found example-xxxxx pod in default namespace\n") + } +} + +// Keeping the reflection code here for now, but probably won't need it. +func discoverApis(cs *kubernetes.Clientset, namespace string) { + // Create the meta-variable to examine the clientset variable. + cst := reflect.TypeOf(cs) + fmt.Println("The underlying type of cs: ", cst.Name()) + + for i := 0; i < cst.NumMethod(); i++ { + method := cst.Method(i) + name := method.Name + tp := method.Type + fmt.Println("Name: ", name) + fmt.Println("Type: ", tp) + fmt.Println() + } + + // Try to call several of the most promising ones and repeat the method discovery process: + extensionsV1beta1 := cs.ExtensionsV1beta1() + extensionsV1beta1Type := reflect.TypeOf(extensionsV1beta1) + fmt.Println("Method set of the extensionsV1beta1 client subset:") + for i := 0; i < extensionsV1beta1Type.NumMethod(); i++ { + method := extensionsV1beta1Type.Method(i) + name := method.Name + tp := method.Type + fmt.Println("Name: ", name) + fmt.Println("Type: ", tp) + fmt.Println() + } + + restClient := cs.RESTClient() + restClientType := reflect.TypeOf(restClient) + fmt.Println("Method set of the RESTClient subset:") + for i := 0; i < restClientType.NumMethod(); i++ { + method := restClientType.Method(i) + name := method.Name + tp := method.Type + fmt.Println("Name: ", name) + fmt.Println("Type: ", tp) + fmt.Println() + } + + discoveryV1beta1 := cs.DiscoveryV1beta1() + discoveryV1beta1Type := reflect.TypeOf(discoveryV1beta1) + fmt.Println("Method set of the discoveryV1beta1 client subset:") + for i := 0; i < discoveryV1beta1Type.NumMethod(); i++ { + method := discoveryV1beta1Type.Method(i) + name := method.Name + tp := method.Type + fmt.Println("Name: ", name) + fmt.Println("Type: ", tp) + fmt.Println() + } +} + +type MPIJobList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []MPIJob `json:"items"` + +type MPIJob struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + Spec MPIJobSpec `json:"spec,omitempty"` + Status JobStatus `json:"status,omitempty"` + +type JobStatus struct { + // Conditions is an array of current observed job conditions. + Conditions []JobCondition `json:"conditions,omitempty"` + + // ReplicaStatuses is map of ReplicaType and ReplicaStatus, + // specifies the status of each replica. + ReplicaStatuses map[ReplicaType]*ReplicaStatus `json:"replicaStatuses,omitempty"` + + // Represents time when the job was acknowledged by the job controller. + // It is not guaranteed to be set in happens-before order across separate operations. + // It is represented in RFC3339 form and is in UTC. + StartTime *metav1.Time `json:"startTime,omitempty"` + + // Represents time when the job was completed. It is not guaranteed to + // be set in happens-before order across separate operations. + // It is represented in RFC3339 form and is in UTC. + CompletionTime *metav1.Time `json:"completionTime,omitempty"` + + // Represents last time when the job was reconciled. It is not guaranteed to + // be set in happens-before order across separate operations. + // It is represented in RFC3339 form and is in UTC. + LastReconcileTime *metav1.Time `json:"lastReconcileTime,omitempty"` + +type JobCondition struct { + // Type of job condition. + Type JobConditionType `json:"type"` + // Status of the condition, one of True, False, Unknown. + Status v1.ConditionStatus `json:"status"` + // The reason for the condition's last transition. + Reason string `json:"reason,omitempty"` + // A human readable message indicating details about the transition. + Message string `json:"message,omitempty"` + // The last time this condition was updated. + LastUpdateTime metav1.Time `json:"lastUpdateTime,omitempty"` + // Last time the condition transitioned from one status to another. + LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` + +const ( + // JobCreated means the job has been accepted by the system, + // but one or more of the pods/services has not been started. + // This includes time before pods being scheduled and launched. + JobCreated JobConditionType = "Created" + + // JobRunning means all sub-resources (e.g. services/pods) of this job + // have been successfully scheduled and launched. + // The training is running without error. + JobRunning JobConditionType = "Running" + + // JobRestarting means one or more sub-resources (e.g. services/pods) of this job + // reached phase failed but maybe restarted according to it's restart policy + // which specified by user in v1.PodTemplateSpec. + // The training is freezing/pending. + JobRestarting JobConditionType = "Restarting" + + // JobSucceeded means all sub-resources (e.g. services/pods) of this job + // reached phase have terminated in success. + // The training is complete without error. + JobSucceeded JobConditionType = "Succeeded" + + // JobSuspended means the job has been suspended. + JobSuspended JobConditionType = "Suspended" + + // JobFailed means one or more sub-resources (e.g. services/pods) of this job + // reached phase failed with no restarting. + // The training has failed its execution. + JobFailed JobConditionType = "Failed" +)