From 243af02bb8f735e43e07b09988a5e4365c8f0201 Mon Sep 17 00:00:00 2001 From: libin18 Date: Fri, 30 Jan 2026 16:52:24 +0800 Subject: [PATCH 1/2] fix: after the machine reboot, vNPU cannot be deleted, resulting in idle resource occupation Signed-off-by: libin18 --- Makefile | 4 +-- README_cn.md | 9 ++++--- cmd/main.go | 31 +++++++++++++++++++---- internal/manager/manager.go | 50 +++++++++++++++++++++++++++++++++++++ internal/server/server.go | 8 ++++++ mind-cluster | 2 +- 6 files changed, 92 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 8dceeed..1aed2dd 100644 --- a/Makefile +++ b/Makefile @@ -14,11 +14,11 @@ docker: --build-arg GOPROXY=https://goproxy.cn,direct \ -t ${IMG_NAME}:${VERSION} . -lint: +lint: $(GO) install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0 golangci-lint run -ascend-device-plugin: +ascend-device-plugin: $(GO) build $(BUILDARGS) -o ./ascend-device-plugin ./cmd/main.go clean: diff --git a/README_cn.md b/README_cn.md index a4988a4..b00051b 100644 --- a/README_cn.md +++ b/README_cn.md @@ -11,6 +11,7 @@ Ascend device plugin 是用来支持在 [HAMi](https://github.com/Project-HAMi/H 部署 [ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime) 克隆子模块 mind-cluster + ```bash git submodule add https://gitcode.com/Ascend/mind-cluster.git ``` @@ -31,7 +32,6 @@ docker buildx build -t $IMAGE_NAME . ### 给 Node 打 ascend 标签 - ``` kubectl label node {ascend-node} ascend=on ``` @@ -48,7 +48,7 @@ kubectl apply -f ascend-device-configmap.yaml kubectl apply -f ascend-device-plugin.yaml ``` -如果要在HAMi中使用升腾NPU, 在部署HAMi时设置 `devices.ascend.enabled` 为 true 会自动部署 ConfigMap 和 `ascend-device-plugin`。 参考 https://github.com/Project-HAMi/HAMi/blob/master/charts/hami/README.md#huawei-ascend +如果要在HAMi中使用升腾NPU, 在部署HAMi时设置 `devices.ascend.enabled` 为 true 会自动部署 ConfigMap 和 `ascend-device-plugin`。 参考 ## 使用 @@ -67,9 +67,10 @@ kubectl apply -f ascend-device-plugin.yaml # 如果不指定显存大小, 就会使用整张卡 huawei.com/Ascend910B-memory: "4096" ``` + For more examples, see [examples](./examples/) - ### 在 volcano 中使用 +### 在 volcano 中使用 在 volcano 中使用时需要提前部署好 volcano, 更多信息请[参考这里](https://github.com/volcano-sh/volcano/tree/master/docs/user-guide/how_to_use_vnpu.md) @@ -89,4 +90,4 @@ spec: limits: huawei.com/Ascend310P: "1" huawei.com/Ascend310P-memory: "4096" - ``` \ No newline at end of file + ``` diff --git a/cmd/main.go b/cmd/main.go index b233829..5afcb3f 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -21,6 +21,7 @@ import ( "fmt" "os" "syscall" + "time" "github.com/Project-HAMi/HAMi/pkg/util/client" "github.com/Project-HAMi/ascend-device-plugin/internal" @@ -34,9 +35,10 @@ import ( ) var ( - hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0") - configFile = flag.String("config_file", "", "config file path") - nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name") + hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0") + configFile = flag.String("config_file", "", "config file path") + nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name") + checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them") ) func checkFlags() { @@ -79,6 +81,26 @@ restart: return err } + if err := ps.CleanupIdleVNPUs(); err != nil { + klog.Errorf("Failed to cleanup idle vNPUs: %v", err) + } + cleanupTicker := time.NewTicker(time.Duration(*checkIdleVNPUInterval) * time.Second) + defer cleanupTicker.Stop() + go func() { + for { + select { + case <-cleanupTicker.C: + klog.Info("Running scheduled idle vNPU cleanup") + if err := ps.CleanupIdleVNPUs(); err != nil { + klog.Errorf("Failed to cleanup idle vNPUs: %v", err) + } + case <-ps.StopCh(): + klog.Info("Stopping cleanup goroutine") + return + } + } + }() + for { select { //case <-restartTimeout: @@ -138,8 +160,7 @@ func main() { } client.InitGlobalClient() - err = start(server) - if err != nil { + if err = start(server); err != nil { klog.Fatalf("start PluginServer failed, error is %v", err) } } diff --git a/internal/manager/manager.go b/internal/manager/manager.go index d070ea9..0c47a51 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -182,3 +182,53 @@ func (am *AscendManager) GetUnHealthIDs() []int32 { } return unhealthy } + +func (am *AscendManager) CleanupIdleVNPUs() error { + klog.Info("Starting cleanup of idle vNPUs...") + + _, IDs, err := am.mgr.GetDeviceList() + if err != nil { + return fmt.Errorf("failed to get device list: %v", err) + } + klog.Infof("Found %d devices to check for idle vNPUs,%+v", len(IDs), IDs) + + totalCleaned := 0 + for _, logicID := range IDs { + cardID, deviceID, err := am.mgr.GetCardIDDeviceID(logicID) + if err != nil { + klog.Warningf("failed to get card/device ID for logic ID %d: %v", logicID, err) + continue + } + // 获取该设备上的所有 vNPU 信息 + vDevInfos, err := am.mgr.GetVirtualDeviceInfo(logicID) + if err != nil { + klog.Infof("no vNPU found on device %d or query failed: %v", logicID, err) + continue + } + + klog.V(1).Infof("Device logicID=%d, cardID=%d,deviceID=%d has %d vNPUs", logicID, cardID, deviceID, len(vDevInfos.VDevInfo)) + + for _, vDev := range vDevInfos.VDevInfo { + klog.V(1).Infof("vNPU CardId=%d, VDevID(Vnpu ID)=%d,template=%s,IsContainerUsed=%d", cardID, vDev.VDevID, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed) + + if vDev.QueryInfo.IsContainerUsed == 0 { + klog.V(1).Infof("Found idle vNPU: cardID=%d, deviceID=%d, vnpuID=%d, status=%d, template=%s,IsContainerUsed=%d", + cardID, deviceID, vDev.VDevID, vDev.QueryInfo.Status, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed) + + err := am.mgr.DestroyVirtualDevice(logicID, uint32(vDev.VDevID)) + if err != nil { + klog.Errorf("failed to destroy vNPU %d on device %d: %v", vDev.VDevID, logicID, err) + } else { + klog.Infof("Successfully destroyed idle vNPU: vnpuID=%d", vDev.VDevID) + totalCleaned++ + } + } else { + klog.Infof("Skipping active vNPU: cardID=%d, deviceID=%d, vnpuID=%d, status=%d, template=%s,IsContainerUsed=%d", + cardID, deviceID, vDev.VDevID, vDev.QueryInfo.Status, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed) + } + } + } + + klog.Infof("Cleanup completed, destroyed %d idle vNPUs", totalCleaned) + return nil +} diff --git a/internal/server/server.go b/internal/server/server.go index 22f0927..ed8477f 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -101,6 +101,14 @@ func (ps *PluginServer) Stop() error { return nil } +func (ps *PluginServer) StopCh() <-chan interface{} { + return ps.stopCh +} + +func (ps *PluginServer) CleanupIdleVNPUs() error { + return ps.mgr.CleanupIdleVNPUs() +} + func (ps *PluginServer) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() diff --git a/mind-cluster b/mind-cluster index c9cf42d..2ee74e4 160000 --- a/mind-cluster +++ b/mind-cluster @@ -1 +1 @@ -Subproject commit c9cf42da06680ea6f825e4d312d0b5929923f482 +Subproject commit 2ee74e4d2b5eff56e7dd1e2c753bc86b2c70d25e From 09ab29d1934597254150b6cb2537bce59a3cd7a8 Mon Sep 17 00:00:00 2001 From: libin18 Date: Tue, 3 Feb 2026 23:08:14 +0800 Subject: [PATCH 2/2] refactor: translate Chinese into English,and adjust the clean idle vnpus code Signed-off-by: libin18 --- cmd/main.go | 26 +++-------------- internal/manager/manager.go | 2 +- internal/server/server.go | 58 +++++++++++++++++++++++++------------ 3 files changed, 44 insertions(+), 42 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 5afcb3f..99b4f3e 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -21,7 +21,6 @@ import ( "fmt" "os" "syscall" - "time" "github.com/Project-HAMi/HAMi/pkg/util/client" "github.com/Project-HAMi/ascend-device-plugin/internal" @@ -74,6 +73,9 @@ restart: } } restarting = true + if err := ps.CleanupIdleVNPUs(); err != nil { + klog.Errorf("Failed to cleanup idle vNPUs: %v", err) + } klog.Info("Starting Plugins.") err = ps.Start() if err != nil { @@ -81,26 +83,6 @@ restart: return err } - if err := ps.CleanupIdleVNPUs(); err != nil { - klog.Errorf("Failed to cleanup idle vNPUs: %v", err) - } - cleanupTicker := time.NewTicker(time.Duration(*checkIdleVNPUInterval) * time.Second) - defer cleanupTicker.Stop() - go func() { - for { - select { - case <-cleanupTicker.C: - klog.Info("Running scheduled idle vNPU cleanup") - if err := ps.CleanupIdleVNPUs(); err != nil { - klog.Errorf("Failed to cleanup idle vNPUs: %v", err) - } - case <-ps.StopCh(): - klog.Info("Stopping cleanup goroutine") - return - } - } - }() - for { select { //case <-restartTimeout: @@ -154,7 +136,7 @@ func main() { if err != nil { klog.Fatalf("load config failed, error is %v", err) } - server, err := server.NewPluginServer(mgr, *nodeName) + server, err := server.NewPluginServer(mgr, *nodeName, *checkIdleVNPUInterval) if err != nil { klog.Fatalf("init PluginServer failed, error is %v", err) } diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 0c47a51..19222f7 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -199,7 +199,7 @@ func (am *AscendManager) CleanupIdleVNPUs() error { klog.Warningf("failed to get card/device ID for logic ID %d: %v", logicID, err) continue } - // 获取该设备上的所有 vNPU 信息 + // Obtain all vNPU information on this device vDevInfos, err := am.mgr.GetVirtualDeviceInfo(logicID) if err != nil { klog.Infof("no vNPU found on device %d or query failed: %v", logicID, err) diff --git a/internal/server/server.go b/internal/server/server.go index ed8477f..8fefc9e 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -52,28 +52,30 @@ var ( ) type PluginServer struct { - nodeName string - registerAnno string - handshakeAnno string - allocAnno string - grpcServer *grpc.Server - mgr *manager.AscendManager - socket string - stopCh chan interface{} - healthCh chan int32 + nodeName string + registerAnno string + handshakeAnno string + allocAnno string + grpcServer *grpc.Server + mgr *manager.AscendManager + socket string + stopCh chan interface{} + healthCh chan int32 + checkIdleVNPUInterval int } -func NewPluginServer(mgr *manager.AscendManager, nodeName string) (*PluginServer, error) { +func NewPluginServer(mgr *manager.AscendManager, nodeName string, checkIdleVNPUInterval int) (*PluginServer, error) { return &PluginServer{ - nodeName: nodeName, - registerAnno: fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()), - handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()), - allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()), - grpcServer: grpc.NewServer(), - mgr: mgr, - socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())), - stopCh: make(chan interface{}), - healthCh: make(chan int32), + nodeName: nodeName, + registerAnno: fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()), + handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()), + allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()), + grpcServer: grpc.NewServer(), + mgr: mgr, + socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())), + stopCh: make(chan interface{}), + healthCh: make(chan int32), + checkIdleVNPUInterval: checkIdleVNPUInterval, }, nil } @@ -91,10 +93,28 @@ func (ps *PluginServer) Start() error { if err != nil { return err } + go ps.startPeriodicCheckIdleVNPUs() go ps.watchAndRegister() return nil } +func (ps *PluginServer) startPeriodicCheckIdleVNPUs() { + ticker := time.NewTicker(time.Duration(ps.checkIdleVNPUInterval) * time.Second) + defer ticker.Stop() + for { + select { + case <-ticker.C: + klog.Info("Running scheduled idle vNPU cleanup") + if err := ps.CleanupIdleVNPUs(); err != nil { + klog.Errorf("Failed to cleanup idle vNPUs: %v", err) + } + case <-ps.stopCh: + klog.Info("Stopping cleanup goroutine") + return + } + } +} + func (ps *PluginServer) Stop() error { close(ps.stopCh) ps.grpcServer.Stop()