diff --git a/Makefile b/Makefile index 8dceeed..1aed2dd 100644 --- a/Makefile +++ b/Makefile @@ -14,11 +14,11 @@ docker: --build-arg GOPROXY=https://goproxy.cn,direct \ -t ${IMG_NAME}:${VERSION} . -lint: +lint: $(GO) install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0 golangci-lint run -ascend-device-plugin: +ascend-device-plugin: $(GO) build $(BUILDARGS) -o ./ascend-device-plugin ./cmd/main.go clean: diff --git a/README_cn.md b/README_cn.md index a4988a4..b00051b 100644 --- a/README_cn.md +++ b/README_cn.md @@ -11,6 +11,7 @@ Ascend device plugin 是用来支持在 [HAMi](https://github.com/Project-HAMi/H 部署 [ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime) 克隆子模块 mind-cluster + ```bash git submodule add https://gitcode.com/Ascend/mind-cluster.git ``` @@ -31,7 +32,6 @@ docker buildx build -t $IMAGE_NAME . ### 给 Node 打 ascend 标签 - ``` kubectl label node {ascend-node} ascend=on ``` @@ -48,7 +48,7 @@ kubectl apply -f ascend-device-configmap.yaml kubectl apply -f ascend-device-plugin.yaml ``` -如果要在HAMi中使用升腾NPU, 在部署HAMi时设置 `devices.ascend.enabled` 为 true 会自动部署 ConfigMap 和 `ascend-device-plugin`。 参考 https://github.com/Project-HAMi/HAMi/blob/master/charts/hami/README.md#huawei-ascend +如果要在HAMi中使用升腾NPU, 在部署HAMi时设置 `devices.ascend.enabled` 为 true 会自动部署 ConfigMap 和 `ascend-device-plugin`。 参考 ## 使用 @@ -67,9 +67,10 @@ kubectl apply -f ascend-device-plugin.yaml # 如果不指定显存大小, 就会使用整张卡 huawei.com/Ascend910B-memory: "4096" ``` + For more examples, see [examples](./examples/) - ### 在 volcano 中使用 +### 在 volcano 中使用 在 volcano 中使用时需要提前部署好 volcano, 更多信息请[参考这里](https://github.com/volcano-sh/volcano/tree/master/docs/user-guide/how_to_use_vnpu.md) @@ -89,4 +90,4 @@ spec: limits: huawei.com/Ascend310P: "1" huawei.com/Ascend310P-memory: "4096" - ``` \ No newline at end of file + ``` diff --git a/cmd/main.go b/cmd/main.go index b233829..99b4f3e 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -34,9 +34,10 @@ import ( ) var ( - hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0") - configFile = flag.String("config_file", "", "config file path") - nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name") + hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0") + configFile = flag.String("config_file", "", "config file path") + nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name") + checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them") ) func checkFlags() { @@ -72,6 +73,9 @@ restart: } } restarting = true + if err := ps.CleanupIdleVNPUs(); err != nil { + klog.Errorf("Failed to cleanup idle vNPUs: %v", err) + } klog.Info("Starting Plugins.") err = ps.Start() if err != nil { @@ -132,14 +136,13 @@ func main() { if err != nil { klog.Fatalf("load config failed, error is %v", err) } - server, err := server.NewPluginServer(mgr, *nodeName) + server, err := server.NewPluginServer(mgr, *nodeName, *checkIdleVNPUInterval) if err != nil { klog.Fatalf("init PluginServer failed, error is %v", err) } client.InitGlobalClient() - err = start(server) - if err != nil { + if err = start(server); err != nil { klog.Fatalf("start PluginServer failed, error is %v", err) } } diff --git a/internal/manager/manager.go b/internal/manager/manager.go index d070ea9..19222f7 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -182,3 +182,53 @@ func (am *AscendManager) GetUnHealthIDs() []int32 { } return unhealthy } + +func (am *AscendManager) CleanupIdleVNPUs() error { + klog.Info("Starting cleanup of idle vNPUs...") + + _, IDs, err := am.mgr.GetDeviceList() + if err != nil { + return fmt.Errorf("failed to get device list: %v", err) + } + klog.Infof("Found %d devices to check for idle vNPUs,%+v", len(IDs), IDs) + + totalCleaned := 0 + for _, logicID := range IDs { + cardID, deviceID, err := am.mgr.GetCardIDDeviceID(logicID) + if err != nil { + klog.Warningf("failed to get card/device ID for logic ID %d: %v", logicID, err) + continue + } + // Obtain all vNPU information on this device + vDevInfos, err := am.mgr.GetVirtualDeviceInfo(logicID) + if err != nil { + klog.Infof("no vNPU found on device %d or query failed: %v", logicID, err) + continue + } + + klog.V(1).Infof("Device logicID=%d, cardID=%d,deviceID=%d has %d vNPUs", logicID, cardID, deviceID, len(vDevInfos.VDevInfo)) + + for _, vDev := range vDevInfos.VDevInfo { + klog.V(1).Infof("vNPU CardId=%d, VDevID(Vnpu ID)=%d,template=%s,IsContainerUsed=%d", cardID, vDev.VDevID, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed) + + if vDev.QueryInfo.IsContainerUsed == 0 { + klog.V(1).Infof("Found idle vNPU: cardID=%d, deviceID=%d, vnpuID=%d, status=%d, template=%s,IsContainerUsed=%d", + cardID, deviceID, vDev.VDevID, vDev.QueryInfo.Status, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed) + + err := am.mgr.DestroyVirtualDevice(logicID, uint32(vDev.VDevID)) + if err != nil { + klog.Errorf("failed to destroy vNPU %d on device %d: %v", vDev.VDevID, logicID, err) + } else { + klog.Infof("Successfully destroyed idle vNPU: vnpuID=%d", vDev.VDevID) + totalCleaned++ + } + } else { + klog.Infof("Skipping active vNPU: cardID=%d, deviceID=%d, vnpuID=%d, status=%d, template=%s,IsContainerUsed=%d", + cardID, deviceID, vDev.VDevID, vDev.QueryInfo.Status, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed) + } + } + } + + klog.Infof("Cleanup completed, destroyed %d idle vNPUs", totalCleaned) + return nil +} diff --git a/internal/server/server.go b/internal/server/server.go index 22f0927..8fefc9e 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -52,28 +52,30 @@ var ( ) type PluginServer struct { - nodeName string - registerAnno string - handshakeAnno string - allocAnno string - grpcServer *grpc.Server - mgr *manager.AscendManager - socket string - stopCh chan interface{} - healthCh chan int32 + nodeName string + registerAnno string + handshakeAnno string + allocAnno string + grpcServer *grpc.Server + mgr *manager.AscendManager + socket string + stopCh chan interface{} + healthCh chan int32 + checkIdleVNPUInterval int } -func NewPluginServer(mgr *manager.AscendManager, nodeName string) (*PluginServer, error) { +func NewPluginServer(mgr *manager.AscendManager, nodeName string, checkIdleVNPUInterval int) (*PluginServer, error) { return &PluginServer{ - nodeName: nodeName, - registerAnno: fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()), - handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()), - allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()), - grpcServer: grpc.NewServer(), - mgr: mgr, - socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())), - stopCh: make(chan interface{}), - healthCh: make(chan int32), + nodeName: nodeName, + registerAnno: fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()), + handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()), + allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()), + grpcServer: grpc.NewServer(), + mgr: mgr, + socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())), + stopCh: make(chan interface{}), + healthCh: make(chan int32), + checkIdleVNPUInterval: checkIdleVNPUInterval, }, nil } @@ -91,16 +93,42 @@ func (ps *PluginServer) Start() error { if err != nil { return err } + go ps.startPeriodicCheckIdleVNPUs() go ps.watchAndRegister() return nil } +func (ps *PluginServer) startPeriodicCheckIdleVNPUs() { + ticker := time.NewTicker(time.Duration(ps.checkIdleVNPUInterval) * time.Second) + defer ticker.Stop() + for { + select { + case <-ticker.C: + klog.Info("Running scheduled idle vNPU cleanup") + if err := ps.CleanupIdleVNPUs(); err != nil { + klog.Errorf("Failed to cleanup idle vNPUs: %v", err) + } + case <-ps.stopCh: + klog.Info("Stopping cleanup goroutine") + return + } + } +} + func (ps *PluginServer) Stop() error { close(ps.stopCh) ps.grpcServer.Stop() return nil } +func (ps *PluginServer) StopCh() <-chan interface{} { + return ps.stopCh +} + +func (ps *PluginServer) CleanupIdleVNPUs() error { + return ps.mgr.CleanupIdleVNPUs() +} + func (ps *PluginServer) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() diff --git a/mind-cluster b/mind-cluster index c9cf42d..2ee74e4 160000 --- a/mind-cluster +++ b/mind-cluster @@ -1 +1 @@ -Subproject commit c9cf42da06680ea6f825e4d312d0b5929923f482 +Subproject commit 2ee74e4d2b5eff56e7dd1e2c753bc86b2c70d25e