From 86dccfa4401a265f06ac6e389eae3558cbca4f3e Mon Sep 17 00:00:00 2001 From: lx1036 Date: Tue, 13 Jan 2026 17:26:00 +0800 Subject: [PATCH] fix issue79: Add support for golang 1.24 and dgx-spark GB10 GPU Signed-off-by: lx1036 --- docker/amd64/Dockerfile.centos7 | 5 +- docker/amd64/Dockerfile.ubuntu20.04 | 5 +- go.mod | 44 +++++- go.sum | 25 +--- pkg/plugin/nvidia/helper.go | 32 ++++ pkg/plugin/nvidia/host_memory.go | 54 +++++++ pkg/plugin/nvidia/nvidia.go | 218 +++++++++++++++++++++------- pkg/plugin/nvidia/server.go | 11 +- pkg/plugin/nvidia/utils.go | 48 ++++-- 9 files changed, 345 insertions(+), 97 deletions(-) create mode 100644 pkg/plugin/nvidia/helper.go create mode 100644 pkg/plugin/nvidia/host_memory.go diff --git a/docker/amd64/Dockerfile.centos7 b/docker/amd64/Dockerfile.centos7 index 7425d526f..4cc79b41f 100644 --- a/docker/amd64/Dockerfile.centos7 +++ b/docker/amd64/Dockerfile.centos7 @@ -20,9 +20,8 @@ RUN yum install -y \ wget && \ rm -rf /var/cache/yum/* -ENV GOLANG_VERSION 1.17.6 -RUN wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-amd64.tar.gz \ - | tar -C /usr/local -xz +ARG TARGETARCH +RUN wget -qO- https://storage.googleapis.com/golang/go1.23.7.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -zx ENV GOPATH /go ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH diff --git a/docker/amd64/Dockerfile.ubuntu20.04 b/docker/amd64/Dockerfile.ubuntu20.04 index c9c15e985..503681a0f 100644 --- a/docker/amd64/Dockerfile.ubuntu20.04 +++ b/docker/amd64/Dockerfile.ubuntu20.04 @@ -20,9 +20,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ wget && \ rm -rf /var/lib/apt/lists/* -ENV GOLANG_VERSION 1.19.3 -RUN wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-amd64.tar.gz \ - | tar -C /usr/local -xz +ARG TARGETARCH +RUN wget -qO- https://storage.googleapis.com/golang/go1.23.7.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -zx ENV GOPATH /go ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH diff 
--git a/go.mod b/go.mod index 4a3a7d389..210c7751b 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module volcano.sh/k8s-device-plugin -go 1.14 +go 1.23 replace ( k8s.io/api => k8s.io/api v0.18.2 @@ -28,13 +28,11 @@ replace ( ) require ( - github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20200421213100-de959f43b55a + github.com/NVIDIA/go-nvml v0.12.4-1 github.com/fsnotify/fsnotify v1.4.9 - github.com/mitchellh/gox v1.0.1 // indirect github.com/prometheus/common v0.4.1 - github.com/stretchr/testify v1.5.1 + github.com/stretchr/testify v1.9.0 github.com/urfave/cli/v2 v2.4.0 - golang.org/x/net v0.0.0-20200421231249-e086a090c8fd // indirect google.golang.org/grpc v1.29.0 k8s.io/api v0.18.2 k8s.io/apimachinery v0.18.2 @@ -45,3 +43,39 @@ require ( k8s.io/kubernetes v1.18.2 sigs.k8s.io/yaml v1.2.0 ) + +require ( + github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc // indirect + github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.1 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/go-logr/logr v1.2.0 // indirect + github.com/gogo/protobuf v1.3.1 // indirect + github.com/golang/protobuf v1.3.3 // indirect + github.com/google/gofuzz v1.1.0 // indirect + github.com/googleapis/gnostic v0.1.0 // indirect + github.com/imdario/mergo v0.3.5 // indirect + github.com/json-iterator/go v1.1.8 // indirect + github.com/konsorten/go-windows-terminal-sequences v1.0.1 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/sirupsen/logrus v1.4.2 // indirect + github.com/spf13/pflag v1.0.5 // indirect + golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975 // indirect + golang.org/x/net v0.0.0-20200421231249-e086a090c8fd // indirect + golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 
// indirect + golang.org/x/sys v0.0.0-20200413165638-669c56c373c4 // indirect + golang.org/x/text v0.3.2 // indirect + golang.org/x/time v0.0.0-20190308202827-9d24e82272b4 // indirect + google.golang.org/appengine v1.5.0 // indirect + google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55 // indirect + gopkg.in/alecthomas/kingpin.v2 v2.2.6 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.2.8 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/kube-openapi v0.0.0-20200121204235-bf4fb3bd569c // indirect + k8s.io/utils v0.0.0-20200324210504-a9aa75ae1b89 // indirect + sigs.k8s.io/structured-merge-diff/v3 v3.0.0 // indirect +) diff --git a/go.sum b/go.sum index 27b5a88d8..422b9c39e 100644 --- a/go.sum +++ b/go.sum @@ -20,8 +20,8 @@ github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab/go.mod h1:3VYc5 github.com/MakeNowJust/heredoc v0.0.0-20170808103936-bb23615498cd/go.mod h1:64YHyfSL2R96J44Nlwm39UHepQbyR5q10x7iYa1ks2E= github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA= github.com/Microsoft/hcsshim v0.0.0-20190417211021-672e52e9209d/go.mod h1:Op3hHsoHPAvb6lceZHDtd9OkTew38wNoXnJs8iY7rUg= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20200421213100-de959f43b55a h1:DploSoAcQ8tcaEjGnaPGt0I33v/dsc53Xam+OsOx3X8= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20200421213100-de959f43b55a/go.mod h1:l0Cq257MSJMvg9URCXUjc8pgKY2SK1oSvIx6qG0bzzc= +github.com/NVIDIA/go-nvml v0.12.4-1 h1:WKUvqshhWSNTfm47ETRhv0A0zJyr1ncCuHiXwoTrBEc= +github.com/NVIDIA/go-nvml v0.12.4-1/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/OpenPeeDeeP/depguard v1.0.0/go.mod h1:7/4sitnI9YlQgTLLk734QlzXT8DuHVnAyztLplQjk+o= github.com/OpenPeeDeeP/depguard v1.0.1/go.mod h1:xsIw86fROiiwelg+jB2uM9PiKihMMmUx/1V+TNhjQvM= @@ -87,9 +87,7 @@ github.com/coreos/go-systemd 
v0.0.0-20181012123002-c6f51f82210d/go.mod h1:F5haX7 github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= github.com/coreos/pkg v0.0.0-20180108230652-97fdf19511ea/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= -github.com/cpuguy83/go-md2man v1.0.10 h1:BSKMNlYxDvnunlTymqtgONjNnaRV1sTpcovwwjF22jk= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= -github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.1 h1:r/myEWzV9lfsM1tFLgDyu0atFtJ1fXn261LKYj/3DxU= github.com/cpuguy83/go-md2man/v2 v2.0.1/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= @@ -263,7 +261,6 @@ github.com/gophercloud/gophercloud v0.1.0/go.mod h1:vxM41WHh5uqHVBMZHzuwNOHh8XEo github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= github.com/gorilla/mux v1.7.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= -github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= github.com/gostaticanalysis/analysisutil v0.0.0-20190318220348-4088753ea4d3/go.mod h1:eEOZF4jCKGi+aprrirO9e7WKB3beBRtWgqGunKl6pKE= @@ -273,8 +270,6 @@ github.com/grpc-ecosystem/go-grpc-middleware v1.0.1-0.20190118093823-f849b5445de github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= 
github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4= -github.com/hashicorp/go-version v1.0.0 h1:21MVWPKDphxa7ineQQTrCU5brh7OuVVAzGOCnnCPtE8= -github.com/hashicorp/go-version v1.0.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/golang-lru v0.0.0-20180201235237-0fb14efe8c47/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= @@ -356,10 +351,6 @@ github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrk github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-ps v0.0.0-20170309133038-4fdf99ab2936/go.mod h1:r1VsdOzOPt1ZSrGZWFoNhsAedKnEd6r9Np1+5blZCWk= github.com/mitchellh/go-wordwrap v1.0.0/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo= -github.com/mitchellh/gox v1.0.1 h1:x0jD3dcHk9a9xPSDN6YEL4xL6Qz0dvNYm8yZqui5chI= -github.com/mitchellh/gox v1.0.1/go.mod h1:ED6BioOGXMswlXa2zxfh/xdd5QhwYliBFn9V18Ap4z4= -github.com/mitchellh/iochan v1.0.0 h1:C+X3KsSTLFVBr/tK1eYN/vs4rJcvsiLU338UhYPJWeY= -github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0QubkSMEySY= github.com/mitchellh/mapstructure v0.0.0-20180220230111-00c29f56e238/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -427,9 +418,7 @@ github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFR github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= 
github.com/rubiojr/go-vhd v0.0.0-20160810183302-0bfd3b39853c/go.mod h1:DM5xW0nvfNNm2uytzsvhI3OnX8uzaRAg8UX/CnDqbto= github.com/russross/blackfriday v0.0.0-20170610170232-067529f716f4/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= -github.com/russross/blackfriday v1.5.2 h1:HyvC0ARfnZBqnXwABFeSZHpKvJHJJfPz81GNueLj0oo= github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= -github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ryanuber/go-glob v0.0.0-20170128012129-256dc444b735/go.mod h1:807d1WSdnB0XRJzKNil9Om6lcp/3a0v4qIHxIXzX/Yc= @@ -440,7 +429,6 @@ github.com/shirou/gopsutil v0.0.0-20180427012116-c95755e4bcd7/go.mod h1:5b4v6he4 github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc= github.com/shurcooL/go v0.0.0-20180423040247-9e1955d9fb6e/go.mod h1:TDJrrUr11Vxrven61rcy3hJMUqaf/CLWYhHNPmT14Lk= github.com/shurcooL/go-goon v0.0.0-20170922171312-37c2f522c041/go.mod h1:N5mDOmsrJOB+vfqUK+7DmDyjhSLIIBnXo9lvZJj3MWQ= -github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.0.5/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc= github.com/sirupsen/logrus v1.0.6/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= @@ -476,8 +464,8 @@ github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoH github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod 
h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= -github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA= github.com/thecodeteam/goscaleio v0.1.0/go.mod h1:68sdkZAsK8bvEwBlbQnlLS+xU+hvLYM/iQ8KXej1AwM= @@ -487,9 +475,7 @@ github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1 github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= github.com/ultraware/funlen v0.0.1/go.mod h1:Dp4UiAus7Wdb9KUZsYWZEWiRzGuM2kXM1lPbfaF6xhA= github.com/ultraware/funlen v0.0.2/go.mod h1:Dp4UiAus7Wdb9KUZsYWZEWiRzGuM2kXM1lPbfaF6xhA= -github.com/urfave/cli v1.20.0 h1:fDqGv3UG/4jbVl/QkFwEdddtEDjh/5Ov6X+0B/3bPaw= github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= -github.com/urfave/cli/v2 v2.2.0/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ= github.com/urfave/cli/v2 v2.4.0 h1:m2pxjjDFgDxSPtO8WSdbndj17Wu2y8vOT86wE/tjr+I= github.com/urfave/cli/v2 v2.4.0/go.mod h1:NX9W0zmTvedE5oDoOMs2RTC8RvdK98NTYZE5LbaEYPg= github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= @@ -673,7 +659,6 @@ google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyac google.golang.org/grpc v1.23.1/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= 
-google.golang.org/grpc v1.28.1/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= google.golang.org/grpc v1.29.0 h1:2pJjwYOdkZ9HlN4sWRYBg9ttH5bCOlsueaM+b/oYjwo= google.golang.org/grpc v1.29.0/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= gopkg.in/airbrake/gobrake.v2 v2.0.9/go.mod h1:/h5ZAUhDkGaJfjzjKLSjv6zCL6O0LLBxU4K+aSYdM/U= @@ -703,6 +688,8 @@ gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools v2.1.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= gotest.tools/gotestsum v0.3.5/go.mod h1:Mnf3e5FUzXbkCfynWBGOwLssY7gTQgCHObK9tMpAriY= diff --git a/pkg/plugin/nvidia/helper.go b/pkg/plugin/nvidia/helper.go new file mode 100644 index 000000000..15e9efc9e --- /dev/null +++ b/pkg/plugin/nvidia/helper.go @@ -0,0 +1,32 @@ +/* +Copyright 2025 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nvidia + +// int8Slice wraps an []int8 with more functions. 
+type int8Slice []int8 + +// String turns a NUL-terminated int8Slice into a string +func (s int8Slice) String() string { + var b []byte + for _, c := range s { + if c == 0 { + break + } + b = append(b, byte(c)) + } + return string(b) +} diff --git a/pkg/plugin/nvidia/host_memory.go b/pkg/plugin/nvidia/host_memory.go new file mode 100644 index 000000000..d6fc4ce57 --- /dev/null +++ b/pkg/plugin/nvidia/host_memory.go @@ -0,0 +1,54 @@ +package nvidia + +import ( + "bytes" + "fmt" + "os/exec" + "strconv" + "strings" + + "k8s.io/klog/v2" +) + +// GetHostMemory runs `free -b` and returns the host's total memory in bytes; used and free are only logged. +// The data row is found by its numeric second field, so localized (e.g. Chinese) row labels are accepted. +func GetHostMemory() (uint64, error) { + cmd := exec.Command("free", "-b") + var out bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &out + + if err := cmd.Run(); err != nil { + return 0, fmt.Errorf("failed to run 'free -b': %v, output: %s", err, out.String()) + } + + lines := strings.Split(strings.TrimSpace(out.String()), "\n") + if len(lines) < 2 { + return 0, fmt.Errorf("unexpected 'free' output: %s", out.String()) + } + + // find the first line that looks like data (numeric second field) + for _, line := range lines { + fields := strings.Fields(line) + if len(fields) < 4 { + continue + } + // skip header lines (contain non-digit in second field) + if _, err := strconv.ParseUint(strings.ReplaceAll(fields[1], ",", ""), 10, 64); err != nil { + continue + } + + // found the data line: fields[1], fields[2], fields[3] are total, used, free (the guard above ensures all exist) + total, err1 := strconv.ParseUint(fields[1], 10, 64) + used, err2 := strconv.ParseUint(fields[2], 10, 64) + free, err3 := strconv.ParseUint(fields[3], 10, 64) + if err1 != nil || err2 != nil || err3 != nil { + return 0, fmt.Errorf("parse error: %v %v %v", err1, err2, err3) + } + + klog.Infof("get system memory total: %dGB, used: %dGB, free: %dGB", total/1024/1024/1024, used/1024/1024/1024, free/1024/1024/1024) + return total, nil + } + + return 0, fmt.Errorf("no memory data line found in 'free' output") +} diff
--git a/pkg/plugin/nvidia/nvidia.go b/pkg/plugin/nvidia/nvidia.go index 30ad92d86..3e64d7a1b 100644 --- a/pkg/plugin/nvidia/nvidia.go +++ b/pkg/plugin/nvidia/nvidia.go @@ -17,11 +17,14 @@ package nvidia import ( + "bytes" + "fmt" "log" "os" + "strconv" "strings" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml" "k8s.io/klog" pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" @@ -46,25 +49,24 @@ type ResourceManager interface { type GpuDeviceManager struct{} -func check(err error) { - if err != nil { - log.Panicln("Fatal:", err) - } -} - func NewGpuDeviceManager() *GpuDeviceManager { return &GpuDeviceManager{} } func (g *GpuDeviceManager) Devices() []*Device { - n, err := nvml.GetDeviceCount() - check(err) + count, ret := nvmllib.DeviceGetCount() + if ret != nvml.SUCCESS { + klog.Fatalf("Failed to get device count: %s", nvmllib.ErrorString(ret)) + } var devs []*Device - for i := uint(0); i < n; i++ { - d, err := nvml.NewDeviceLite(i) - check(err) - devs = append(devs, buildDevice(d, i)) + for i := 0; i < count; i++ { + device, ret := nvmllib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + klog.Fatalf("Failed to get device handle by index %d: %s", i, nvmllib.ErrorString(ret)) + } + + devs = append(devs, buildDevice(device, i)) } return devs @@ -83,25 +85,67 @@ func (g *GpuDeviceManager) CheckHealth(stop <-chan struct{}, devices []*Device, checkHealth(stop, devices, unhealthy) } -func buildDevice(d *nvml.Device, devIndex uint) *Device { +func buildDevice(d nvml.Device, devIndex int) *Device { + uuid, ret := nvmllib.DeviceGetUUID(d) + if ret != nvml.SUCCESS { + klog.Fatalf("Failed to get UUID of device: %s", nvmllib.ErrorString(ret)) + } + + minor, ret := nvmllib.DeviceGetMinorNumber(d) + if ret != nvml.SUCCESS { + klog.Fatalf("Failed to get minor number of device: %s", nvmllib.ErrorString(ret)) + } + path := fmt.Sprintf("/dev/nvidia%d", minor) + dev := Device{} - dev.ID = d.UUID + dev.ID = uuid dev.Health = 
pluginapi.Healthy - dev.Path = d.Path - dev.Index = devIndex + dev.Path = path + dev.Index = uint(devIndex) + hasNuma, numa, err := getNumaNode(d) + if err != nil { + klog.Fatalf("Failed to get device NUMA node: %v", err) + } - if d.CPUAffinity != nil { + if hasNuma { dev.Topology = &pluginapi.TopologyInfo{ Nodes: []*pluginapi.NUMANode{ - &pluginapi.NUMANode{ - ID: int64(*(d.CPUAffinity)), + { + ID: int64(numa), }, }, } } + return &dev } +func getNumaNode(d nvml.Device) (bool, int, error) { + pciInfo, ret := d.GetPciInfo() + if ret != nvml.SUCCESS { + return false, 0, fmt.Errorf("error getting PCI Bus Info of device: %v", ret) + } + + // Drop the leading "0000" and lower-case the bus ID; the result names the directory under /sys/bus/pci/devices. + busID := strings.ToLower(strings.TrimPrefix(int8Slice(pciInfo.BusId[:]).String(), "0000")) + + b, err := os.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", busID)) + if err != nil { + return false, 0, nil // unreadable numa_node file is reported as "no NUMA info", not as an error + } + + node, err := strconv.Atoi(string(bytes.TrimSpace(b))) + if err != nil { + return false, 0, fmt.Errorf("error parsing value for NUMA node: %v", err) + } + + if node < 0 { + return false, 0, nil // a negative value is reported as "no NUMA info" + } + + return true, node, nil +} + func checkHealth(stop <-chan struct{}, devices []*Device, unhealthy chan<- *Device) { disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks)) if disableHealthChecks == "all" { @@ -111,23 +155,60 @@ func checkHealth(stop <-chan struct{}, devices []*Device, unhealthy chan<- *Devi return } - eventSet := nvml.NewEventSet() - defer nvml.DeleteEventSet(eventSet) + // FIXME: formalize the full list and document it.
+ // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 + // Application errors: the GPU should still be healthy + applicationErrorXids := []uint64{ + 13, // Graphics Engine Exception + 31, // GPU memory page fault + 43, // GPU stopped processing + 45, // Preemptive cleanup, due to previous errors + 68, // Video processor exception + } + + skippedXids := make(map[uint64]bool) + for _, id := range applicationErrorXids { + skippedXids[id] = true + } + + for _, additionalXid := range getAdditionalXids(disableHealthChecks) { + skippedXids[additionalXid] = true + } + eventSet, ret := nvmllib.EventSetCreate() + if ret != nvml.SUCCESS { + klog.Warningf("could not create event set: %v", ret) + return + } + defer eventSet.Free() + + eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError) + parentToDeviceMap := make(map[string]*Device) for _, d := range devices { - err := nvml.RegisterEventForDevice(eventSet, nvml.XidCriticalError, d.ID) - if err != nil && strings.HasSuffix(err.Error(), "Not Supported") { - log.Printf("Warning: %s is too old to support healthchecking: %s. Marking it unhealthy.", d.ID, err) + parentToDeviceMap[d.ID] = d + + gpu, ret := nvmllib.DeviceGetHandleByUUID(d.ID) + if ret != nvml.SUCCESS { + klog.Infof("unable to get device handle from UUID: %v; marking it as unhealthy", ret) unhealthy <- d continue } - check(err) - } - firstTime := true - ki, err := NewKubeInteractor() - if err != nil { - klog.Fatalf("cannot create kube interactor. 
%v", err) + supportedEvents, ret := gpu.GetSupportedEventTypes() + if ret != nvml.SUCCESS { + klog.Infof("Unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret) + unhealthy <- d + continue + } + + ret = gpu.RegisterEvents(eventMask&supportedEvents, eventSet) + if ret == nvml.ERROR_NOT_SUPPORTED { + klog.Warningf("Device %v is too old to support healthchecking.", d.ID) + } + if ret != nvml.SUCCESS { + klog.Infof("Marking device %v as unhealthy: %v", d.ID, ret) + unhealthy <- d + } } for { @@ -137,42 +218,71 @@ func checkHealth(stop <-chan struct{}, devices []*Device, unhealthy chan<- *Devi default: } - e, err := nvml.WaitForEvent(eventSet, 5000) - if err != nil && e.Etype != nvml.XidCriticalError { - if firstTime { - // reset unhealthy gpu list if all devices healthy - ki.PatchUnhealthyGPUListOnNode(devices) - firstTime = false + e, ret := eventSet.Wait(5000) + if ret == nvml.ERROR_TIMEOUT { + continue + } + if ret != nvml.SUCCESS { + klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", ret) + for _, d := range devices { + unhealthy <- d } continue } - // FIXME: formalize the full list and document it. 
- // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 - // Application errors: the GPU should still be healthy - if e.Edata == 31 || e.Edata == 43 || e.Edata == 45 { - if firstTime { - // reset unhealthy gpu list if all devices healthy - ki.PatchUnhealthyGPUListOnNode(devices) - firstTime = false - } + if e.EventType != nvml.EventTypeXidCriticalError { + klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", e) continue } - if e.UUID == nil || len(*e.UUID) == 0 { - // All devices are unhealthy - log.Printf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.Edata) + if skippedXids[e.EventData] { + klog.Infof("Skipping event %+v", e) + continue + } + + klog.Infof("Processing event %+v", e) + eventUUID, ret := e.Device.GetUUID() + if ret != nvml.SUCCESS { + // If we cannot reliably determine the device UUID, we mark all devices as unhealthy. + klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", e, ret) for _, d := range devices { unhealthy <- d } continue } - for _, d := range devices { - if d.ID == *e.UUID { - log.Printf("XidCriticalError: Xid=%d on Device=%s, the device will go unhealthy.", e.Edata, d.ID) - unhealthy <- d - } + d, exists := parentToDeviceMap[eventUUID] + if !exists { + klog.Infof("Ignoring event for unexpected device: %v", eventUUID) + continue + } + + klog.Infof("XidCriticalError: Xid=%d on Device=%s; marking device as unhealthy.", e.EventData, d.ID) + unhealthy <- d + } +} + +// getAdditionalXids returns a list of additional Xids to skip from the specified string. +// The input is treated as a comma-separated string and all valid uint64 values are considered as Xid values. Invalid values +// are ignored.
+func getAdditionalXids(input string) []uint64 { + if input == "" { + return nil + } + + var additionalXids []uint64 + for _, additionalXid := range strings.Split(input, ",") { + trimmed := strings.TrimSpace(additionalXid) + if trimmed == "" { + continue + } + xid, err := strconv.ParseUint(trimmed, 10, 64) + if err != nil { + log.Printf("Ignoring malformed Xid value %v: %v", trimmed, err) + continue } + additionalXids = append(additionalXids, xid) } + + return additionalXids } diff --git a/pkg/plugin/nvidia/server.go b/pkg/plugin/nvidia/server.go index c9c83b4c0..ded27e090 100644 --- a/pkg/plugin/nvidia/server.go +++ b/pkg/plugin/nvidia/server.go @@ -27,7 +27,8 @@ import ( "strings" "time" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml" + v1 "k8s.io/api/core/v1" "k8s.io/klog" @@ -57,11 +58,15 @@ type NvidiaDevicePlugin struct { config *apis.Config } +var ( + nvmllib = nvml.New() +) + // NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin func NewNvidiaDevicePlugin(config *apis.Config) *NvidiaDevicePlugin { log.Println("Loading NVML") - if err := nvml.Init(); err != nil { - log.Printf("Failed to initialize NVML: %s.", err) + if ret := nvmllib.Init(); ret != nvml.SUCCESS { + log.Printf("Failed to initialize NVML: %v.", ret) log.Printf("If this is a GPU node, did you set the docker default runtime to `nvidia`?") log.Printf("You can check the prerequisites at: https://github.com/volcano-sh/k8s-device-plugin#prerequisites") log.Fatalf("You can learn how to set the runtime at: https://github.com/volcano-sh/k8s-device-plugin#quick-start") diff --git a/pkg/plugin/nvidia/utils.go b/pkg/plugin/nvidia/utils.go index af0cc5a93..338e12735 100644 --- a/pkg/plugin/nvidia/utils.go +++ b/pkg/plugin/nvidia/utils.go @@ -20,15 +20,17 @@ import ( "context" "fmt" "math" + "os" "strconv" "strings" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml" + 
"github.com/prometheus/common/log" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" - "k8s.io/klog" + "k8s.io/klog/v2" pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) @@ -65,20 +67,46 @@ func GetGPUMemory() uint { // GetDevices returns virtual devices and all physical devices by index. func GetDevices(gpuMemoryFactor uint) ([]*pluginapi.Device, map[uint]string) { - n, err := nvml.GetDeviceCount() - check(err) + count, ret := nvmllib.DeviceGetCount() + if ret != nvml.SUCCESS { + klog.Fatalf("Failed to get device count: %s", nvmllib.ErrorString(ret)) + } var virtualDevs []*pluginapi.Device deviceByIndex := map[uint]string{} - for i := uint(0); i < n; i++ { - d, err := nvml.NewDevice(i) - check(err) - id := i - deviceByIndex[id] = d.UUID + for i := 0; i < count; i++ { + device, ret := nvmllib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + klog.Fatalf("Failed to get device handle by index %d: %s", i, nvmllib.ErrorString(ret)) + } + + uuid, ret := device.GetUUID() + if ret != nvml.SUCCESS { + klog.Fatalf("Failed to get device uuid: %s", nvmllib.ErrorString(ret)) + } + + id := uint(i) + deviceByIndex[id] = uuid // TODO: Do we assume all cards are of same capacity if GetGPUMemory() == uint(0) { - SetGPUMemory(uint(*d.Memory)) + memory, ret := device.GetMemoryInfo() + if ret != nvml.SUCCESS { + // for dgx-spark GB10 is unified memory https://www.nvidia.com/en-us/products/workstations/dgx-spark/ + if value := os.Getenv("UNIFIED_MEMORY"); value == "true" { + systemMemory, err := GetHostMemory() + if err != nil { + klog.Fatalf("Failed to get host memory err: %v", err) + } + + SetGPUMemory(uint(systemMemory / (1024 * 1024))) // MiB + } else { + klog.Fatalf("Failed to get device memory info: %s", nvmllib.ErrorString(ret)) + } + } else { + SetGPUMemory(uint(memory.Total / (1024 * 1024))) // MiB + } } + for j := uint(0); j < GetGPUMemory()/gpuMemoryFactor; j++ { fakeID := GenerateVirtualDeviceID(id, j) 
virtualDevs = append(virtualDevs, &pluginapi.Device{