diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index c533000..919a1ca 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -24,6 +24,8 @@ jobs: go-version: ${{ env.GO_VERSION }} - run: go mod download github.com/Project-HAMi/HAMi - run: go get github.com/Project-HAMi/ascend-device-plugin/internal/server + - run: go get huawei.com/npu-exporter + - run: go get huawei.com/npu-exporter/utils/logger@v0.0.0-00010101000000-000000000000 - name: golangci-lint uses: golangci/golangci-lint-action@v6 with: diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..eaa5629 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "mind-cluster"] + path = mind-cluster + url = https://gitcode.com/Ascend/mind-cluster.git diff --git a/Dockerfile b/Dockerfile index b99381f..634337e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,8 @@ WORKDIR /build ADD . . RUN go mod download github.com/Project-HAMi/HAMi RUN go get github.com/Project-HAMi/ascend-device-plugin/internal/server +RUN go get huawei.com/npu-exporter +RUN go get huawei.com/npu-exporter/utils/logger@v0.0.0-00010101000000-000000000000 RUN make all FROM $BASE_IMAGE diff --git a/README.md b/README.md index 2523e59..aea0e91 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,10 @@ Memory slicing is supported based on virtualization template, lease available te [ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime) +```bash +git submodule update --init +``` + ## Compile ```bash diff --git a/README_cn.md b/README_cn.md index 156ca53..a4988a4 100644 --- a/README_cn.md +++ b/README_cn.md @@ -10,6 +10,11 @@ Ascend device plugin 是用来支持在 [HAMi](https://github.com/Project-HAMi/H 部署 [ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime) +克隆子模块 mind-cluster +```bash +git submodule update --init +``` + 
## 编译 ```bash diff --git a/cmd/main.go b/cmd/main.go index c77c244..b233829 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -17,7 +17,6 @@ package main import ( - "context" "flag" "fmt" "os" @@ -29,7 +28,7 @@ import ( "github.com/Project-HAMi/ascend-device-plugin/internal/server" "github.com/Project-HAMi/ascend-device-plugin/version" "github.com/fsnotify/fsnotify" - "huawei.com/npu-exporter/v6/common-utils/hwlog" + "huawei.com/npu-exporter/utils/logger" "k8s.io/klog/v2" "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) @@ -117,11 +116,11 @@ func main() { checkFlags() klog.Infof("version: %s", version.GetVersion()) klog.Infof("using config file: %s", *configFile) - config := &hwlog.LogConfig{ - OnlyToStdout: true, - LogLevel: *hwLoglevel, - } - err := hwlog.InitRunLogger(config, context.Background()) + + logger.HwLogConfig.OnlyToStdout = true + logger.HwLogConfig.LogLevel = *hwLoglevel + + err := logger.InitLogger("Prometheus") if err != nil { klog.Fatalf("init huawei run logger failed, %v", err) } diff --git a/go.mod b/go.mod index 86c9b48..d04534c 100644 --- a/go.mod +++ b/go.mod @@ -3,10 +3,10 @@ module github.com/Project-HAMi/ascend-device-plugin go 1.22.2 require ( + ascend-common v0.0.0 github.com/Project-HAMi/HAMi v0.0.0 github.com/fsnotify/fsnotify v1.7.0 google.golang.org/grpc v1.63.2 - huawei.com/npu-exporter/v6 v6.0.0-RC3.b001 k8s.io/api v0.29.3 k8s.io/apimachinery v0.29.3 k8s.io/klog/v2 v2.120.1 @@ -57,6 +57,7 @@ require ( ) replace ( + ascend-common => ./mind-cluster/component/ascend-common github.com/Project-HAMi/HAMi v0.0.0 => github.com/Project-HAMi/HAMi v0.0.0-20250901013025-61c6cbe7d480 - huawei.com/npu-exporter/v6 => gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3 + huawei.com/npu-exporter => ./mind-cluster/component/npu-exporter ) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index db92bd8..d070ea9 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -20,9 +20,10 @@ import ( "fmt" "sort" + 
"ascend-common/devmanager" + "ascend-common/devmanager/dcmi" + "github.com/Project-HAMi/ascend-device-plugin/internal" - "huawei.com/npu-exporter/v6/devmanager" - "huawei.com/npu-exporter/v6/devmanager/dcmi" "k8s.io/klog/v2" ) @@ -45,7 +46,7 @@ type AscendManager struct { } func NewAscendManager() (*AscendManager, error) { - mgr, err := devmanager.AutoInit("") + mgr, err := devmanager.AutoInit("", 30) if err != nil { return nil, err } diff --git a/internal/server/server.go b/internal/server/server.go index 67dcb8d..22f0927 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -44,6 +44,7 @@ const ( // PodAllocAnno = "huawei.com/AscendDevices" NodeLockAscend = "hami.io/mutex.lock" Ascend910Prefix = "Ascend910" + Ascend910CType = "Ascend910C" ) var ( @@ -191,10 +192,16 @@ func (ps *PluginServer) registerKubelet() error { return nil } -func (ps *PluginServer) getDeviceNetworkID(idx int) (int, error) { +func (ps *PluginServer) getDeviceNetworkID(idx int, deviceType string) (int, error) { + // For Ascend910C devices, all modules (dies) are interconnected via HCCS + if deviceType == Ascend910CType { + return 0, nil + } + if idx > 3 { return 1, nil } + return 0, nil } @@ -214,7 +221,7 @@ func (ps *PluginServer) registerHAMi() error { Health: dev.Health, } if strings.HasPrefix(device.Type, Ascend910Prefix) { - NetworkID, err := ps.getDeviceNetworkID(i) + NetworkID, err := ps.getDeviceNetworkID(i, device.Type) if err != nil { return fmt.Errorf("get networkID error: %v", err) } diff --git a/mind-cluster b/mind-cluster new file mode 160000 index 0000000..c9cf42d --- /dev/null +++ b/mind-cluster @@ -0,0 +1 @@ +Subproject commit c9cf42da06680ea6f825e4d312d0b5929923f482