From a17687865dea1a690ecf4d2f52451b047c6d02a3 Mon Sep 17 00:00:00 2001 From: james Date: Thu, 4 Sep 2025 11:33:36 +0800 Subject: [PATCH 01/12] feat: add numa info Signed-off-by: james --- internal/server/server.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/internal/server/server.go b/internal/server/server.go index e67ff2a..d120030 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -188,11 +188,22 @@ func (ps *PluginServer) registerKubelet() error { return nil } +func (ps *PluginServer) getNumaInformation(idx int) (int, error) { + if idx > 3 { + return 1, nil + } + return 0, nil +} + func (ps *PluginServer) registerHAMi() error { devs := ps.mgr.GetDevices() apiDevices := make([]*util.DeviceInfo, 0, len(devs)) // hami currently believes that the index starts from 0 and is continuous. for i, dev := range devs { + numa, err := ps.getNumaInformation(i) + if err != nil { + return fmt.Errorf("get numa information error: %v", err) + } apiDevices = append(apiDevices, &util.DeviceInfo{ Index: uint(i), ID: dev.UUID, @@ -200,7 +211,7 @@ func (ps *PluginServer) registerHAMi() error { Devmem: int32(dev.Memory), Devcore: dev.AICore, Type: ps.mgr.CommonWord(), - Numa: 0, + Numa: numa, Health: dev.Health, }) } From 4e87ade6a5265cc37439e0617a4bc79bb16f8ba0 Mon Sep 17 00:00:00 2001 From: james Date: Wed, 10 Sep 2025 17:22:34 +0800 Subject: [PATCH 02/12] feat: add networkID Signed-off-by: james --- internal/server/server.go | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/internal/server/server.go b/internal/server/server.go index d120030..013faf9 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -23,6 +23,7 @@ import ( "net" "os" "path" + "strings" "time" "github.com/Project-HAMi/HAMi/pkg/device/ascend" @@ -40,7 +41,8 @@ import ( const ( // RegisterAnnos = "hami.io/node-register-ascend" // PodAllocAnno = "huawei.com/AscendDevices" - NodeLockAscend = "hami.io/mutex.lock" + NodeLockAscend = "hami.io/mutex.lock" + Ascend910Prefix = "Ascend910" ) var ( @@ -188,7 +190,7 @@ func (ps *PluginServer) registerKubelet() error { return nil } -func (ps *PluginServer) getNumaInformation(idx int) (int, error) { +func (ps *PluginServer) getDeviceNetworkID(idx int) (int, error) { if idx > 3 { return 1, nil } @@ -200,20 +202,26 @@ func (ps *PluginServer) registerHAMi() error { apiDevices := make([]*util.DeviceInfo, 0, len(devs)) // hami currently believes that the index starts from 0 and is continuous. for i, dev := range devs { - numa, err := ps.getNumaInformation(i) - if err != nil { - return fmt.Errorf("get numa information error: %v", err) - } - apiDevices = append(apiDevices, &util.DeviceInfo{ + device := &util.DeviceInfo{ Index: uint(i), ID: dev.UUID, Count: int32(ps.mgr.VDeviceCount()), Devmem: int32(dev.Memory), Devcore: dev.AICore, Type: ps.mgr.CommonWord(), - Numa: numa, + Numa: 0, Health: dev.Health, - }) + } + if strings.HasPrefix(device.Type, Ascend910Prefix) { + networkID, err := ps.getDeviceNetworkID(i) + if err != nil { + return fmt.Errorf("get networkID error: %v", err) + } + device.CustomInfo = map[string]any{ + "networkID": networkID, + } + } + apiDevices = append(apiDevices, device) } annos := make(map[string]string) annos[ps.registerAnno] = util.MarshalNodeDevices(apiDevices) From 0f1158136db8456d23c89a0ca862546970cfa605 Mon Sep 17 00:00:00 2001 From: james Date: Wed, 10 Sep 2025 17:25:58 +0800 Subject: [PATCH 03/12] refactor: rename the key of CustomInfo Signed-off-by: james --- internal/server/server.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/server/server.go b/internal/server/server.go index 013faf9..206392b 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -213,12 +213,12 @@ func (ps *PluginServer) registerHAMi() error { Health: dev.Health, } if strings.HasPrefix(device.Type, Ascend910Prefix) { - networkID, err := ps.getDeviceNetworkID(i) + NetworkID, err := ps.getDeviceNetworkID(i) if err != nil { return fmt.Errorf("get networkID error: %v", err) } device.CustomInfo = map[string]any{ - "networkID": networkID, + "NetworkID": NetworkID, } } apiDevices = append(apiDevices, device) From fa7b81da9ffcf9833e9d528e78641ad8a71c8a07 Mon Sep 17 00:00:00 2001 From: james Date: Fri, 12 Sep 2025 12:50:32 +0800 Subject: [PATCH 04/12] fix: update hami commit Signed-off-by: james --- go.mod | 2 +- internal/server/server.go | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index d81d606..86c9b48 100644 --- a/go.mod +++ b/go.mod @@ -57,6 +57,6 @@ require ( ) replace ( - github.com/Project-HAMi/HAMi v0.0.0 => github.com/Project-HAMi/HAMi v0.0.0-20250107033239-d04fc8baaad6 + github.com/Project-HAMi/HAMi v0.0.0 => github.com/Project-HAMi/HAMi v0.0.0-20250901013025-61c6cbe7d480 huawei.com/npu-exporter/v6 => gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3 ) diff --git a/internal/server/server.go b/internal/server/server.go index 206392b..73b49ed 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -26,6 +26,7 @@ import ( "strings" "time" + "github.com/Project-HAMi/HAMi/pkg/device" "github.com/Project-HAMi/HAMi/pkg/device/ascend" "github.com/Project-HAMi/HAMi/pkg/util" "github.com/Project-HAMi/HAMi/pkg/util/nodelock" @@ -199,10 +200,10 @@ func (ps *PluginServer) getDeviceNetworkID(idx int) (int, error) { func (ps *PluginServer) registerHAMi() error { devs := ps.mgr.GetDevices() - apiDevices := make([]*util.DeviceInfo, 0, len(devs)) + apiDevices := make([]*device.DeviceInfo, 0, len(devs)) // hami currently believes that the index starts from 0 and is continuous. for i, dev := range devs { - device := &util.DeviceInfo{ + device := &device.DeviceInfo{ Index: uint(i), ID: dev.UUID, Count: int32(ps.mgr.VDeviceCount()), @@ -224,7 +225,7 @@ func (ps *PluginServer) registerHAMi() error { apiDevices = append(apiDevices, device) } annos := make(map[string]string) - annos[ps.registerAnno] = util.MarshalNodeDevices(apiDevices) + annos[ps.registerAnno] = device.MarshalNodeDevices(apiDevices) annos[ps.handshakeAnno] = "Reported_" + time.Now().Add(time.Duration(*reportTimeOffset)*time.Second).Format("2006.01.02 15:04:05") node, err := util.GetNode(ps.nodeName) if err != nil { From d6bdfa478fa8b4ff2f8520b6a5c2caf3a76dee0e Mon Sep 17 00:00:00 2001 From: james Date: Fri, 12 Sep 2025 15:49:19 +0800 Subject: [PATCH 05/12] fix: update ID Signed-off-by: james --- internal/server/server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/server/server.go b/internal/server/server.go index 73b49ed..630e2ae 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -205,7 +205,7 @@ func (ps *PluginServer) registerHAMi() error { for i, dev := range devs { device := &device.DeviceInfo{ Index: uint(i), - ID: dev.UUID, + ID: ps.nodeName + "-" + dev.UUID, Count: int32(ps.mgr.VDeviceCount()), Devmem: int32(dev.Memory), Devcore: dev.AICore, From afa4e2cd209366b2a5a5637b59fafab136f5ffef Mon Sep 17 00:00:00 2001 From: james Date: Tue, 16 Sep 2025 19:36:20 +0800 Subject: [PATCH 06/12] fix: client null Signed-off-by: james --- cmd/main.go | 2 ++ config.yaml | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/cmd/main.go b/cmd/main.go index 72aed00..4885fff 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -27,6 +27,7 @@ import ( "github.com/Project-HAMi/ascend-device-plugin/internal/manager" "github.com/Project-HAMi/ascend-device-plugin/internal/server" "github.com/Project-HAMi/ascend-device-plugin/version" + "github.com/Project-HAMi/HAMi/pkg/util/client" "github.com/fsnotify/fsnotify" "huawei.com/npu-exporter/v6/common-utils/hwlog" "k8s.io/klog/v2" @@ -136,6 +137,7 @@ func main() { if err != nil { klog.Fatalf("init PluginServer failed, error is %v", err) } + client.InitGlobalClient() err = start(server) if err != nil { diff --git a/config.yaml b/config.yaml index 26de5bd..945e692 100644 --- a/config.yaml +++ b/config.yaml @@ -57,3 +57,23 @@ vnpus: memory: 12288 aiCore: 4 aiCPU: 4 +- chipName: 910ProB + commonWord: Ascend910ProB + resourceName: huawei.com/Ascend910ProB + resourceMemoryName: huawei.com/Ascend910ProB-memory + memoryAllocatable: 32768 + memoryCapacity: 32768 + aiCore: 30 + templates: + - name: vir02 + memory: 2184 + aiCore: 2 + - name: vir04 + memory: 4369 + aiCore: 4 + - name: vir08 + memory: 8738 + aiCore: 8 + - name: vir16 + memory: 17476 + aiCore: 16 \ No newline at end of file From e63183b1bea77104ca77b715723ded7d72ed9ad1 Mon Sep 17 00:00:00 2001 From: james Date: Thu, 18 Sep 2025 16:23:57 +0800 Subject: [PATCH 07/12] refactor: revert ID Signed-off-by: james --- internal/server/server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/server/server.go b/internal/server/server.go index 630e2ae..73b49ed 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -205,7 +205,7 @@ func (ps *PluginServer) registerHAMi() error { for i, dev := range devs { device := &device.DeviceInfo{ Index: uint(i), - ID: ps.nodeName + "-" + dev.UUID, + ID: dev.UUID, Count: int32(ps.mgr.VDeviceCount()), Devmem: int32(dev.Memory), Devcore: dev.AICore, From 157198dd2ed9402adce54d052c2f2cb91d08af68 Mon Sep 17 00:00:00 2001 From: james Date: Fri, 19 Sep 2025 11:46:23 +0800 Subject: [PATCH 08/12] fix: fix style Signed-off-by: james --- cmd/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/main.go b/cmd/main.go index 4885fff..c77c244 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -23,11 +23,11 @@ import ( "os" "syscall" + "github.com/Project-HAMi/HAMi/pkg/util/client" "github.com/Project-HAMi/ascend-device-plugin/internal" "github.com/Project-HAMi/ascend-device-plugin/internal/manager" "github.com/Project-HAMi/ascend-device-plugin/internal/server" "github.com/Project-HAMi/ascend-device-plugin/version" - "github.com/Project-HAMi/HAMi/pkg/util/client" "github.com/fsnotify/fsnotify" "huawei.com/npu-exporter/v6/common-utils/hwlog" "k8s.io/klog/v2" From dd663655e2c7f526e6c4123d345faa00ae988f30 Mon Sep 17 00:00:00 2001 From: james Date: Fri, 19 Sep 2025 13:59:06 +0800 Subject: [PATCH 09/12] fix: fix workflow Signed-off-by: james --- .github/workflows/dev.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 48d5d40..cec60b0 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -3,9 +3,7 @@ name: CI -on: - pull_request: - branches: [ "main" ] +on: [push, pull_request] env: GO_VERSION: "1.22.5" @@ -19,6 +17,7 @@ jobs: - uses: actions/setup-go@v5 with: go-version: ${{ env.GO_VERSION }} + - run: go mod tidy - name: golangci-lint uses: golangci/golangci-lint-action@v6 with: From c250a2126d1debcc1cf2719c1533e294a074b3e1 Mon Sep 17 00:00:00 2001 From: james Date: Fri, 19 Sep 2025 14:05:15 +0800 Subject: [PATCH 10/12] update Signed-off-by: james --- .github/workflows/dev.yml | 2 +- go.mod | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index cec60b0..bf8645c 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -6,7 +6,7 @@ name: CI on: [push, pull_request] env: - GO_VERSION: "1.22.5" + GO_VERSION: "1.24.6" jobs: golangci: diff --git a/go.mod b/go.mod index 86c9b48..20d9d05 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/Project-HAMi/ascend-device-plugin -go 1.22.2 +go 1.24.6 require ( github.com/Project-HAMi/HAMi v0.0.0 From 66c8c965f7cc15985c1f6596e53ff4a9c7d7757c Mon Sep 17 00:00:00 2001 From: james Date: Fri, 19 Sep 2025 14:07:47 +0800 Subject: [PATCH 11/12] update Signed-off-by: james --- .github/workflows/dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index bf8645c..1c14626 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -21,7 +21,7 @@ jobs: - name: golangci-lint uses: golangci/golangci-lint-action@v6 with: - version: v1.60 + version: v1.61.0 build: env: From 0e39de524eca745b3359be1d263838ea27a2cbd5 Mon Sep 17 00:00:00 2001 From: james Date: Fri, 19 Sep 2025 14:09:35 +0800 Subject: [PATCH 12/12] update Signed-off-by: james --- .github/workflows/dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 1c14626..342cf98 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -21,7 +21,7 @@ jobs: - name: golangci-lint uses: golangci/golangci-lint-action@v6 with: - version: v1.61.0 + version: latest build: env: