Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ docker:
--build-arg GOPROXY=https://goproxy.cn,direct \
-t ${IMG_NAME}:${VERSION} .

lint:
lint:
$(GO) install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0
golangci-lint run

ascend-device-plugin:
ascend-device-plugin:
$(GO) build $(BUILDARGS) -o ./ascend-device-plugin ./cmd/main.go

clean:
Expand Down
9 changes: 5 additions & 4 deletions README_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Ascend device plugin 是用来支持在 [HAMi](https://github.com/Project-HAMi/H
部署 [ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime)

克隆子模块 mind-cluster

```bash
git submodule add https://gitcode.com/Ascend/mind-cluster.git
```
Expand All @@ -31,7 +32,6 @@ docker buildx build -t $IMAGE_NAME .

### 给 Node 打 ascend 标签


```
kubectl label node {ascend-node} ascend=on
```
Expand All @@ -48,7 +48,7 @@ kubectl apply -f ascend-device-configmap.yaml
kubectl apply -f ascend-device-plugin.yaml
```

如果要在HAMi中使用升腾NPU, 在部署HAMi时设置 `devices.ascend.enabled` 为 true 会自动部署 ConfigMap 和 `ascend-device-plugin`。 参考 https://github.com/Project-HAMi/HAMi/blob/master/charts/hami/README.md#huawei-ascend
如果要在HAMi中使用升腾NPU, 在部署HAMi时设置 `devices.ascend.enabled` 为 true 会自动部署 ConfigMap 和 `ascend-device-plugin`。 参考 <https://github.com/Project-HAMi/HAMi/blob/master/charts/hami/README.md#huawei-ascend>

## 使用

Expand All @@ -67,9 +67,10 @@ kubectl apply -f ascend-device-plugin.yaml
# 如果不指定显存大小, 就会使用整张卡
huawei.com/Ascend910B-memory: "4096"
```

For more examples, see [examples](./examples/)

### 在 volcano 中使用
### 在 volcano 中使用

在 volcano 中使用时需要提前部署好 volcano, 更多信息请[参考这里](https://github.com/volcano-sh/volcano/tree/master/docs/user-guide/how_to_use_vnpu.md)

Expand All @@ -89,4 +90,4 @@ spec:
limits:
huawei.com/Ascend310P: "1"
huawei.com/Ascend310P-memory: "4096"
```
```
15 changes: 9 additions & 6 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,10 @@ import (
)

var (
hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0")
configFile = flag.String("config_file", "", "config file path")
nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name")
hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0")
configFile = flag.String("config_file", "", "config file path")
nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name")
checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them")
)

func checkFlags() {
Expand Down Expand Up @@ -72,6 +73,9 @@ restart:
}
}
restarting = true
if err := ps.CleanupIdleVNPUs(); err != nil {
klog.Errorf("Failed to cleanup idle vNPUs: %v", err)
}
klog.Info("Starting Plugins.")
err = ps.Start()
if err != nil {
Expand Down Expand Up @@ -132,14 +136,13 @@ func main() {
if err != nil {
klog.Fatalf("load config failed, error is %v", err)
}
server, err := server.NewPluginServer(mgr, *nodeName)
server, err := server.NewPluginServer(mgr, *nodeName, *checkIdleVNPUInterval)
if err != nil {
klog.Fatalf("init PluginServer failed, error is %v", err)
}
client.InitGlobalClient()

err = start(server)
if err != nil {
if err = start(server); err != nil {
klog.Fatalf("start PluginServer failed, error is %v", err)
}
}
50 changes: 50 additions & 0 deletions internal/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,53 @@ func (am *AscendManager) GetUnHealthIDs() []int32 {
}
return unhealthy
}

func (am *AscendManager) CleanupIdleVNPUs() error {
klog.Info("Starting cleanup of idle vNPUs...")

_, IDs, err := am.mgr.GetDeviceList()
if err != nil {
return fmt.Errorf("failed to get device list: %v", err)
}
klog.Infof("Found %d devices to check for idle vNPUs,%+v", len(IDs), IDs)

totalCleaned := 0
for _, logicID := range IDs {
cardID, deviceID, err := am.mgr.GetCardIDDeviceID(logicID)
if err != nil {
klog.Warningf("failed to get card/device ID for logic ID %d: %v", logicID, err)
continue
}
// Obtain all vNPU information on this device
vDevInfos, err := am.mgr.GetVirtualDeviceInfo(logicID)
if err != nil {
klog.Infof("no vNPU found on device %d or query failed: %v", logicID, err)
continue
}

klog.V(1).Infof("Device logicID=%d, cardID=%d,deviceID=%d has %d vNPUs", logicID, cardID, deviceID, len(vDevInfos.VDevInfo))

for _, vDev := range vDevInfos.VDevInfo {
klog.V(1).Infof("vNPU CardId=%d, VDevID(Vnpu ID)=%d,template=%s,IsContainerUsed=%d", cardID, vDev.VDevID, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed)

if vDev.QueryInfo.IsContainerUsed == 0 {
klog.V(1).Infof("Found idle vNPU: cardID=%d, deviceID=%d, vnpuID=%d, status=%d, template=%s,IsContainerUsed=%d",
cardID, deviceID, vDev.VDevID, vDev.QueryInfo.Status, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed)

err := am.mgr.DestroyVirtualDevice(logicID, uint32(vDev.VDevID))
if err != nil {
klog.Errorf("failed to destroy vNPU %d on device %d: %v", vDev.VDevID, logicID, err)
} else {
klog.Infof("Successfully destroyed idle vNPU: vnpuID=%d", vDev.VDevID)
totalCleaned++
}
} else {
klog.Infof("Skipping active vNPU: cardID=%d, deviceID=%d, vnpuID=%d, status=%d, template=%s,IsContainerUsed=%d",
cardID, deviceID, vDev.VDevID, vDev.QueryInfo.Status, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed)
}
}
}

klog.Infof("Cleanup completed, destroyed %d idle vNPUs", totalCleaned)
return nil
}
66 changes: 47 additions & 19 deletions internal/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,28 +52,30 @@ var (
)

type PluginServer struct {
nodeName string
registerAnno string
handshakeAnno string
allocAnno string
grpcServer *grpc.Server
mgr *manager.AscendManager
socket string
stopCh chan interface{}
healthCh chan int32
nodeName string
registerAnno string
handshakeAnno string
allocAnno string
grpcServer *grpc.Server
mgr *manager.AscendManager
socket string
stopCh chan interface{}
healthCh chan int32
checkIdleVNPUInterval int
}

func NewPluginServer(mgr *manager.AscendManager, nodeName string) (*PluginServer, error) {
func NewPluginServer(mgr *manager.AscendManager, nodeName string, checkIdleVNPUInterval int) (*PluginServer, error) {
return &PluginServer{
nodeName: nodeName,
registerAnno: fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()),
handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()),
allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()),
grpcServer: grpc.NewServer(),
mgr: mgr,
socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())),
stopCh: make(chan interface{}),
healthCh: make(chan int32),
nodeName: nodeName,
registerAnno: fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()),
handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()),
allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()),
grpcServer: grpc.NewServer(),
mgr: mgr,
socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())),
stopCh: make(chan interface{}),
healthCh: make(chan int32),
checkIdleVNPUInterval: checkIdleVNPUInterval,
}, nil
}

Expand All @@ -91,16 +93,42 @@ func (ps *PluginServer) Start() error {
if err != nil {
return err
}
go ps.startPeriodicCheckIdleVNPUs()
go ps.watchAndRegister()
return nil
}

func (ps *PluginServer) startPeriodicCheckIdleVNPUs() {
ticker := time.NewTicker(time.Duration(ps.checkIdleVNPUInterval) * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
klog.Info("Running scheduled idle vNPU cleanup")
if err := ps.CleanupIdleVNPUs(); err != nil {
klog.Errorf("Failed to cleanup idle vNPUs: %v", err)
}
case <-ps.stopCh:
klog.Info("Stopping cleanup goroutine")
return
}
}
}

func (ps *PluginServer) Stop() error {
close(ps.stopCh)
ps.grpcServer.Stop()
return nil
}

func (ps *PluginServer) StopCh() <-chan interface{} {
return ps.stopCh
}

func (ps *PluginServer) CleanupIdleVNPUs() error {
return ps.mgr.CleanupIdleVNPUs()
}

func (ps *PluginServer) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
Expand Down
2 changes: 1 addition & 1 deletion mind-cluster
Submodule mind-cluster updated from c9cf42 to 2ee74e